From e86656e83b4e0d07576e4593cd1ee89e279e4fe9 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 13:39:52 +0900 Subject: [PATCH 01/18] Switch translation web context to passive fetchWebPage The translator used fetchLinkedPages as a required context source, which fetched up to 10 linked pages with their full Readability- extracted body and dumped everything into the system prompt as "Additional context: ...". For articles that link to several long technical posts the context easily ballooned to multiples of the source text size, and Claude began confusing the context for the text to translate. Concrete failures observed on @nebuleto/2026/swe-with-coding-agent-in-2026: the en-US row came back as the linked Anthropic blog post in English (~3.5x the original length), and the ja row came back entirely in Korean, paraphrasing the linked content. Switch to the passive fetchWebPage source instead. Vertana exposes passive sources as tools rather than pre-fetching them into the prompt, so the model only invokes a fetch when it actually decides to, which sidesteps the up-front pollution while still leaving a fetch capability on the table. The other context sources (author info, article tags) and Vertana options (refinement, dynamic glossary, technical tone) are unchanged. Existing translation rows that were generated under the old behavior remain in the database; they should be deleted so the auto-request path on /lang regenerates them with the new code. Assisted-by: Claude Code:claude-opus-4-7 --- ai/translate.ts | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ai/translate.ts b/ai/translate.ts index af487062e..35cece6bf 100644 --- a/ai/translate.ts +++ b/ai/translate.ts @@ -1,5 +1,5 @@ -import { fetchLinkedPages } from "@vertana/context-web"; -import type { RequiredContextSource } from "@vertana/core"; +import { fetchWebPage } from "@vertana/context-web"; +import type { ContextSource, RequiredContextSource } from "@vertana/core"; import { translate as vertanaTranslate } from "@vertana/facade"; import type { LanguageModel } from "ai"; @@ -64,7 +64,7 @@ function createTagsContextSource( export async function translate(options: TranslationOptions): Promise { // Build context sources - const contextSources: RequiredContextSource[] = []; + const contextSources: ContextSource[] = []; const authorSource = createAuthorContextSource( options.authorName, @@ -75,12 +75,11 @@ export async function translate(options: TranslationOptions): Promise { const tagsSource = createTagsContextSource(options.tags); if (tagsSource) contextSources.push(tagsSource); - // Add web context to fetch linked pages - const webContext = fetchLinkedPages({ - text: options.text, - mediaType: "text/markdown", - }); - contextSources.push(webContext); + // Expose linked-page fetching as a passive tool the model can call + // when it actually needs context, instead of dumping every linked + // page's full body into the system prompt up front (which made the + // translator confuse the context for the text to translate). + contextSources.push(fetchWebPage); const result = await vertanaTranslate( options.model, From 302c2427466d9cd370595186aba4eb5f07dc11c1 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 13:59:19 +0900 Subject: [PATCH 02/18] Extract runArticleContentTranslation helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure refactor of startArticleContentTranslation in models/article.ts. The on-conflict / staleness gate at the front (insert-or-find the placeholder row, decide whether work is needed) stays in the public function. The post-row-creation work — fetching author/tag context, running translate(), writing the result back, sending the federation Update activity, and kicking off post-translation summarization — moves into a new private async helper runArticleContentTranslation(fedCtx, queued). No behavior change. The synchronous setup phase (article-source fetch) is still awaited before the translate() chain is scheduled, so callers see exactly the same return timing as before, and the translate-then/catch chain still runs in the background. A defensive log-and-return guard is added when the helper is called with a row that has originalLanguage IS NULL (the original-language row itself); the existing public entry point never produces such a row, but a forthcoming caller will pass already-existing translation rows through this path. Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 58 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/models/article.ts b/models/article.ts index 85c4c2bda..45e55b7ba 100644 --- a/models/article.ts +++ b/models/article.ts @@ -722,7 +722,7 @@ export async function startArticleContentTranslation( fedCtx: Context, { content, targetLanguage, requester }: ArticleContentTranslationOptions, ): Promise { - const { db, models: { translator: model, summarizer } } = fedCtx.data; + const { db } = fedCtx.data; const inserted = await db.insert(articleContentTable).values({ sourceId: content.sourceId, language: targetLanguage, @@ -752,14 +752,49 @@ export async function startArticleContentTranslation( } else { queued = inserted[0]; } + await runArticleContentTranslation(fedCtx, queued); + return queued; +} + +/** + * Runs the actual LLM translation for an `article_content` row that is + * already in the placeholder / `beingTranslated` state. Awaits the + * synchronous setup (fetching author/tag context for the model), then + * schedules the `translate(...)` chain and returns; the caller does + * not await the translation itself. When the model resolves, the + * row is overwritten with the translated title/body, a federation + * `Update` activity is sent, and post-translation summarization is + * kicked off. On failure, the placeholder row is deleted so a future + * visit can re-queue. + * + * Should never be called with an original-language row + * (`originalLanguage IS NULL`); the caller is responsible for placing + * the row into the placeholder state first. + */ +async function runArticleContentTranslation( + fedCtx: Context, + queued: ArticleContent, +): Promise { + const { db, models: { translator: model, summarizer } } = fedCtx.data; logger.debug( "Starting translation for content: {sourceId} {language}", queued, ); + const { sourceId, language: targetLanguage, originalLanguage } = queued; + if (originalLanguage == null) { + // Defensive: a row without `originalLanguage` is the original- + // language content itself and should never be passed in here. + logger.error( + "runArticleContentTranslation called for an original-language row; " + + "skipping ({sourceId} {language}).", + queued, + ); + return; + } - // Fetch article source with author information for translation context + // Fetch article source with author information for translation context. const articleSource = await db.query.articleSourceTable.findFirst({ - where: { id: content.sourceId }, + where: { id: sourceId }, with: { account: { with: { @@ -769,14 +804,14 @@ export async function startArticleContentTranslation( }, }); - // Combine title and content for translation - const text = `# ${content.title}\n\n${content.content}`; + // Combine title and content for translation. + const text = `# ${queued.title}\n\n${queued.content}`; translate({ model, - sourceLanguage: content.language, + sourceLanguage: originalLanguage, targetLanguage, text, - // Pass context for better translation quality + // Pass context for better translation quality. authorName: articleSource?.account?.actor?.name ?? undefined, authorBio: articleSource?.account?.actor?.bioHtml ?? undefined, tags: articleSource?.tags, @@ -785,7 +820,7 @@ export async function startArticleContentTranslation( ...queued, translation, }); - // Split the translation into title and content + // Split the translation into title and content. const title = translation.match(/^\s*#\s+([^\n]*)/)?.[1] ?? ""; const content = translation.replace(/^\s*#\s+[^\n]*\s*/, "").trim(); const updated = await db.update(articleContentTable) @@ -804,14 +839,14 @@ export async function startArticleContentTranslation( }) .where( and( - eq(articleContentTable.sourceId, queued.sourceId), + eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), ), ) .returning(); if (updated.length < 1) return; const article = await db.query.articleSourceTable.findFirst({ - where: { id: queued.sourceId }, + where: { id: sourceId }, with: { account: true, contents: true, @@ -879,10 +914,9 @@ export async function startArticleContentTranslation( await db.delete(articleContentTable) .where( and( - eq(articleContentTable.sourceId, queued.sourceId), + eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), ), ); }); - return queued; } From c607681f35837656dcb5269a60fd38b43c0ad5b8 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 14:06:22 +0900 Subject: [PATCH 03/18] Guard async translation writes with a JS-side claim CAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inside the new runArticleContentTranslation helper, take a JS-side claim Date as ownership stamp on the placeholder row before kicking off the translate() chain. The success UPDATE and the failure DELETE both gate on the row's `updated` still equalling that claim — a concurrent re-translation that resets the row out from under us bumps `updated` past the claim, and our writes turn into no-ops instead of clobbering the fresher placeholder with stale text or deleting it on a stale failure. A JS Date is used (rather than PG `CURRENT_TIMESTAMP`) because the postgres driver hands back JS `Date` values truncated to ms while PG `timestamptz` keeps µs precision; a CAS against a round-tripped `CURRENT_TIMESTAMP` write would never match its own claim. This mirrors the claim pattern already used by startArticleContentSummary in this file. The race this prevents only becomes reachable in practice with the forthcoming retranslation-on-edit code path, but the guard is self-contained and the existing failure-cleanup test still passes, so it lands ahead of that change as a pure correctness fix. Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/models/article.ts b/models/article.ts index 45e55b7ba..2f10093d6 100644 --- a/models/article.ts +++ b/models/article.ts @@ -792,6 +792,27 @@ async function runArticleContentTranslation( return; } + // Take ownership of the placeholder row by stamping it with a + // JS-side `Date` that becomes our claim id. Subsequent success / + // failure writes from this run only land if the row's `updated` + // still equals this claim — a concurrent re-translation that resets + // the row out from under us bumps `updated` past this value, and + // our writes turn into no-ops instead of clobbering the fresher + // claim. Using a JS `Date` (rather than PG `CURRENT_TIMESTAMP`) + // is what makes this CAS reliable: PG `timestamptz` keeps µs + // precision while the `postgres` driver hands back JS `Date` + // values truncated to ms, so a CAS against the round-tripped + // value of a `CURRENT_TIMESTAMP` write would never match. + const claim = new Date(); + await db.update(articleContentTable) + .set({ updated: claim }) + .where( + and( + eq(articleContentTable.sourceId, sourceId), + eq(articleContentTable.language, targetLanguage), + ), + ); + // Fetch article source with author information for translation context. const articleSource = await db.query.articleSourceTable.findFirst({ where: { id: sourceId }, @@ -841,10 +862,25 @@ async function runArticleContentTranslation( and( eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), + // CAS on the claim taken at the top of this function — see + // that comment for why a JS `Date` rather than the row's + // round-tripped `updated` is the safe reference. If a + // concurrent re-translation took its own claim under us + // the `updated` will no longer match `claim` and this + // write becomes a no-op so we don't clobber its fresher + // placeholder with our stale text. + eq(articleContentTable.updated, claim), ), ) .returning(); - if (updated.length < 1) return; + if (updated.length < 1) { + logger.debug( + "Stale translation claim, skipping federation/summary " + + "({sourceId} {language}).", + queued, + ); + return; + } const article = await db.query.articleSourceTable.findFirst({ where: { id: sourceId }, with: { @@ -916,6 +952,10 @@ async function runArticleContentTranslation( and( eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), + // CAS on the same claim as the success path — a stale + // failure must not delete a row another caller has since + // re-claimed. + eq(articleContentTable.updated, claim), ), ); }); From 422ee879c7c47bdceb0e773b74d02ef0872e2931 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 14:14:34 +0900 Subject: [PATCH 04/18] Re-translate translations when the original article body changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Edits to an article's body now invalidate every existing translation row for that article and re-runs the translator against the fresh original. Previously, translations stayed pinned to whatever text they were generated from, so /ja, /en, etc. silently drifted out of sync with the edited original. The trigger lives in two pieces: - updateArticleSource now returns an extra `originalContentChanged` flag set when the original-language `article_content` row's body actually changed. The flag mirrors the existing summary invalidation gate: title-only edits do not set it, so a typo fix on the title doesn't fan out N+1 LLM calls; the body-changed gate also catches the case the summary path already cared about. A successful `languageChanged` cannot reach this branch when translations exist (the self-FK on `article_content` re-throws 23503 as `LanguageChangeWithTranslationsError`), so the flag is effectively `contentChanged`. - A new restartArticleContentTranslations(fedCtx, articleSource) resets each translation row to placeholder state — copying the new original title/content into it, flipping `beingTranslated` back to true, and clearing summary state — then fires the extracted runArticleContentTranslation helper against each reset row in the background. updateArticle reads the new flag and awaits the synchronous claim-and-reset step so placeholders are visible by the time the mutation responds; the actual translate() calls run in the background, same as the initial translation flow. Each re-translation that completes sends its own federation Update so peers see the original-language change first and then each translation's refresh as it becomes available — the right ActivityPub semantics. The CAS guard added in the previous commit keeps an in-flight stale translation from clobbering the new placeholder with stale text on completion. The summary-test callers of updateArticleSource only assert truthiness on the return value, so the new return shape passes through them unchanged. The dedicated source-level test that previously asserted "preserves translations" splits into two: one for the body-change case (now asserting the flag flips and the existing row is preserved here for the outer caller to invalidate) and one for the title-only carve-out. New end-to-end tests in article.lifecycle.test.ts cover updateArticle's full body-change and title-only paths. Fixes https://github.com/hackers-pub/hackerspub/issues/95 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.background.test.ts | 149 ++++++++++++++++++++++++++++++ models/article.lifecycle.test.ts | 135 +++++++++++++++++++++++++++ models/article.source.test.ts | 71 ++++++++++++-- models/article.ts | 125 ++++++++++++++++++++++++- 4 files changed, 469 insertions(+), 11 deletions(-) diff --git a/models/article.background.test.ts b/models/article.background.test.ts index 3044fca7c..80cd783cd 100644 --- a/models/article.background.test.ts +++ b/models/article.background.test.ts @@ -2,6 +2,7 @@ import assert from "node:assert/strict"; import test from "node:test"; import { articleContentTable, articleSourceTable } from "./schema.ts"; import { + restartArticleContentTranslations, startArticleContentSummary, startArticleContentTranslation, } from "./article.ts"; @@ -134,3 +135,151 @@ test("startArticleContentTranslation() deletes queued rows when translation fail }); }); }); + +test( + "restartArticleContentTranslations() resets each translation row to placeholder state and re-runs the translator", + async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + // Use the same `{} as never` translator stub as the existing + // failure-path test: it lets us observe the row pass through + // the placeholder state and then be cleaned up by the failure + // branch, which is enough to confirm + // restartArticleContentTranslations actually re-fired the + // translation pipeline against the reset row. + fedCtx.data.models = { + summarizer: {} as never, + translator: {} as never, + } as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "restarttranslator", + name: "Restart Translator", + email: "restarttranslator@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "restartrequester", + name: "Restart Requester", + email: "restartrequester@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + const [articleSource] = await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "restart-translation", + tags: [], + allowLlmTranslation: true, + published, + updated: published, + }).returning(); + // The original row carries the *new* (post-edit) body — the + // shape updateArticleSource leaves behind for + // restartArticleContentTranslations to mirror into each + // translation placeholder. + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "New original title", + content: "New original body", + published, + updated: published, + }); + // A previously completed translation that is now stale relative + // to the freshly edited original. + await tx.insert(articleContentTable).values({ + sourceId, + language: "ko", + title: "Stale translated title", + content: "Stale translated body", + summary: "Stale summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + await restartArticleContentTranslations(fedCtx, articleSource); + + // The row must briefly pass through placeholder state before + // the failing stub model causes the failure branch to delete + // it; assert on either observable. + await waitFor(async () => { + const current = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "ko" }, + }); + if (current == null) return true; + // Placeholder reset: title/content mirror the new original + // and beingTranslated has flipped back true with summary + // state cleared. translationRequesterId is preserved. + return current.beingTranslated === true && + current.title === "New original title" && + current.content === "New original body" && + current.summary === null && + current.translationRequesterId === requester.account.id; + }); + + // Eventually the failing stub causes deletion via the + // run-translation failure branch. + await waitFor(async () => { + const current = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "ko" }, + }); + return current == null; + }); + }); + }, +); + +test( + "restartArticleContentTranslations() is a no-op when the article has no translations", + async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = { + summarizer: {} as never, + translator: {} as never, + } as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "restartnotrans", + name: "Restart No Translations", + email: "restartnotrans@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + const [articleSource] = await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "restart-no-translations", + tags: [], + allowLlmTranslation: false, + published, + updated: published, + }).returning(); + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "Original", + content: "Body", + published, + updated: published, + }); + + // Should return without throwing despite the deliberately bad + // translator stub. + await restartArticleContentTranslations(fedCtx, articleSource); + + // Original row still in place and unchanged. + const original = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "en" }, + }); + assert.ok(original != null); + assert.equal(original.beingTranslated, false); + assert.equal(original.content, "Body"); + }); + }, +); diff --git a/models/article.lifecycle.test.ts b/models/article.lifecycle.test.ts index 77e447d15..898101b37 100644 --- a/models/article.lifecycle.test.ts +++ b/models/article.lifecycle.test.ts @@ -1,6 +1,7 @@ import assert from "node:assert/strict"; import test from "node:test"; import { createArticle, updateArticle } from "./article.ts"; +import { articleContentTable } from "./schema.ts"; import { createFedCtx, insertAccountWithActor, @@ -12,6 +13,17 @@ const fakeModels = { translator: {} as never, }; +async function waitFor(predicate: () => Promise, timeoutMs = 10000) { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (await predicate()) return; + await new Promise((resolve) => setTimeout(resolve, 50)); + } + throw new Error( + `Timed out waiting for async background state after ${timeoutMs}ms`, + ); +} + test("createArticle() creates a post and timeline entry for the author", async () => { await withRollback(async (tx) => { const fedCtx = createFedCtx(tx); @@ -100,3 +112,126 @@ test("updateArticle() rewrites the persisted article post", async () => { assert.match(storedPost.contentHtml, /body<\/strong>/); }); }); + +test("updateArticle() resets existing translation rows when the body changes", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "retranslateauthor", + name: "Retranslate Author", + email: "retranslateauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "retranslaterequester", + name: "Retranslate Requester", + email: "retranslaterequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "retranslate-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // Pre-existing completed translation row that the edit should + // invalidate. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Old translated title", + content: "Old translated body", + summary: "Old summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // The retranslation runs in the background. The placeholder + // reset is awaited synchronously, then the failing stub + // translator deletes the row via the failure-cleanup branch. + // Either observable is acceptable. + await waitFor(async () => { + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + if (ko == null) return true; + return ko.beingTranslated === true && + ko.title === "Original article" && + ko.content === "Edited body" && + ko.summary === null; + }); + }); +}); + +test("updateArticle() leaves existing translations alone on title-only edits", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "retranslatetitleauthor", + name: "Retranslate Title Author", + email: "retranslatetitleauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "retranslatetitlerequester", + name: "Retranslate Title Requester", + email: "retranslatetitlerequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "retranslate-title-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Existing translated title", + content: "Existing translated body", + summary: "Existing summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + title: "Renamed only", + }); + assert.ok(updated != null); + + // Title-only edits don't trigger retranslation; the ko row stays + // exactly as it was — no placeholder, no deletion, original + // summary preserved. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Existing translated title"); + assert.equal(ko.content, "Existing translated body"); + assert.equal(ko.summary, "Existing summary."); + }); +}); diff --git a/models/article.source.test.ts b/models/article.source.test.ts index d7add745a..6929d73a5 100644 --- a/models/article.source.test.ts +++ b/models/article.source.test.ts @@ -107,7 +107,7 @@ test("getOriginalArticleContent() picks the earliest non-translation content", ( assert.deepEqual(selected, original); }); -test("updateArticleSource() updates the original content and preserves translations", async () => { +test("updateArticleSource() flags originalContentChanged when the body changes and preserves existing translation rows in place", async () => { await withRollback(async (tx) => { const author = await insertAccountWithActor(tx, { username: "updatearticlesource", @@ -149,29 +149,86 @@ test("updateArticleSource() updates the original content and preserves translati }, ]); - const updated = await updateArticleSource(tx, sourceId, { + const result = await updateArticleSource(tx, sourceId, { title: "Updated title", content: "Updated content", slug: "updated-source", }); - assert.ok(updated != null); - assert.equal(updated.slug, "updated-source"); - assert.equal(updated.contents.length, 2); + assert.ok(result != null); + assert.equal(result.originalContentChanged, true); + assert.equal(result.source.slug, "updated-source"); + assert.equal(result.source.contents.length, 2); - const originalContent = updated.contents.find((content) => + const originalContent = result.source.contents.find((content) => content.originalLanguage == null ); - const translatedContent = updated.contents.find((content) => + const translatedContent = result.source.contents.find((content) => content.originalLanguage === "en" ); assert.ok(originalContent != null); assert.equal(originalContent.title, "Updated title"); assert.equal(originalContent.content, "Updated content"); + // updateArticleSource itself does not touch translation rows; the + // retranslation step (restartArticleContentTranslations) lives in + // updateArticle so it has access to fedCtx for the federation + // Update side effects. Here we only check that the existing + // translation row passes through unchanged and that the + // originalContentChanged signal is set so an outer caller can + // decide to invalidate it. assert.ok(translatedContent != null); assert.equal(translatedContent.title, "Translated title"); assert.equal(translatedContent.content, "Translated content"); + assert.equal(translatedContent.beingTranslated, false); + }); +}); + +test("updateArticleSource() does not flag originalContentChanged on title-only edits", async () => { + await withRollback(async (tx) => { + const author = await insertAccountWithActor(tx, { + username: "updatearticlesourcetitle", + name: "Title Only Edit", + email: "updatearticlesourcetitle@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "title-only-edit", + tags: [], + allowLlmTranslation: false, + published, + updated: published, + }); + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "Original title", + content: "Original content", + published, + updated: published, + }); + + const result = await updateArticleSource(tx, sourceId, { + title: "Renamed", + }); + + assert.ok(result != null); + // Title-only edits don't trigger retranslation, mirroring the + // existing summary-invalidation gate in updateArticleSource: the + // body is unchanged, so existing translations would still be + // accurate. + assert.equal(result.originalContentChanged, false); + const originalContent = result.source.contents.find((c) => + c.originalLanguage == null + ); + assert.ok(originalContent != null); + assert.equal(originalContent.title, "Renamed"); + assert.equal(originalContent.content, "Original content"); }); }); diff --git a/models/article.ts b/models/article.ts index 2f10093d6..59ad6a03e 100644 --- a/models/article.ts +++ b/models/article.ts @@ -300,6 +300,28 @@ export async function createArticle( return post; } +export interface UpdateArticleSourceResult { + source: ArticleSource & { contents: ArticleContent[] }; + /** + * `true` when the original-language `article_content` row's body + * actually changed during this update. The caller uses this to + * decide whether to invalidate existing translation rows. + * + * Title-only edits do not set this flag, matching the existing + * summary-invalidation gate below. + * + * Language changes never reach this branch when translations exist: + * the self-FK on `article_content` (`schema.ts:524-527`) is + * `ON DELETE CASCADE` only, so any `UPDATE … SET language = …` on + * the original row aborts with 23503 (rethrown as + * {@link LanguageChangeWithTranslationsError}) whenever a row's + * `originalLanguage` references the old language. A successful + * `languageChanged` therefore implies zero translations and there + * is nothing to retranslate. + */ + originalContentChanged: boolean; +} + export async function updateArticleSource( db: Database, id: Uuid, @@ -309,11 +331,12 @@ export async function updateArticleSource( language?: string; }, models?: Models, -): Promise { +): Promise { // Captured inside the transaction and used after it commits so we // can enqueue a fresh summarization for the row whose body or // language just changed. let resummarizeTarget: ArticleContent | undefined; + let originalContentChanged = false; const result = await db.transaction(async (tx) => { const sources = await tx.update(articleSourceTable) .set({ ...source, updated: sql`CURRENT_TIMESTAMP` }) @@ -371,6 +394,9 @@ export async function updateArticleSource( ) { resummarizeTarget = updatedRows[0]; } + if (contentChanged && updatedRows.length > 0) { + originalContentChanged = true; + } } catch (error) { if ( error instanceof postgres.PostgresError && error.code === "23503" @@ -386,13 +412,14 @@ export async function updateArticleSource( }); return { ...sources[0], contents }; }); + if (result == null) return undefined; // Queue a fresh summarization outside of the transaction so the // claim is visible to other workers as soon as it is acquired and // the deferred apply step does not try to use a closed transaction. if (resummarizeTarget != null && models != null) { await startArticleContentSummary(db, models.summarizer, resummarizeTarget); } - return result; + return { source: result, originalContentChanged }; } export async function updateArticle( @@ -418,13 +445,14 @@ export async function updateArticle( const previousPost = await db.query.postTable.findFirst({ where: { articleSourceId }, }); - const articleSource = await updateArticleSource( + const updateResult = await updateArticleSource( db, articleSourceId, source, models, ); - if (articleSource == null) return undefined; + if (updateResult == null) return undefined; + const { source: articleSource, originalContentChanged } = updateResult; const account = await db.query.accountTable.findFirst({ where: { id: articleSource.accountId }, with: { emails: true, links: true }, @@ -476,6 +504,17 @@ export async function updateArticle( post.relayedTags = [...relayedTags]; } // TODO: send Update(Article) to the mentioned actors too + // After federating the original-language Update, invalidate any + // existing translation rows so they retranslate against the new + // body. Each restarted translation will fire its own Update on + // completion (correct ActivityPub semantics — peers see the + // original change first, then each translation's refresh as it + // becomes available). We `await` the synchronous claim-and-reset + // step so the placeholders are visible by the time this function + // returns; the actual `translate()` calls run in the background. + if (originalContentChanged) { + await restartArticleContentTranslations(fedCtx, articleSource); + } return post; } @@ -756,6 +795,84 @@ export async function startArticleContentTranslation( return queued; } +/** + * Invalidates and re-runs every existing translation row for an + * article whose original-language body has changed. For each + * translation row, atomically resets it to placeholder state + * (copying the new original title/content into it, flipping + * `beingTranslated` back to true, and clearing summary state), then + * fires {@link runArticleContentTranslation} against the freshly + * reset row to repopulate it from the model. The actual translation + * runs in the background; the synchronous claim-and-reset is + * awaited so callers can rely on placeholders being in place by + * return time. + * + * No-ops when the article has no original-language content (e.g., + * remote articles with no `articleSource.contents` row in the + * article's own language) or no translation rows at all. + * + * Used by {@link updateArticle} to satisfy + * . + */ +export async function restartArticleContentTranslations( + fedCtx: Context, + articleSource: ArticleSource, +): Promise { + const { db } = fedCtx.data; + const original = await getOriginalArticleContent(db, articleSource); + if (original == null) { + logger.debug( + "No original-language content for {sourceId}; nothing to retranslate.", + { sourceId: articleSource.id }, + ); + return; + } + const translations = await db.query.articleContentTable.findMany({ + where: { + sourceId: articleSource.id, + originalLanguage: { isNotNull: true }, + }, + }); + if (translations.length === 0) return; + logger.debug( + "Restarting {count} translation(s) for {sourceId}.", + { count: translations.length, sourceId: articleSource.id }, + ); + for (const translation of translations) { + // Reset the row to placeholder state mirroring the shape an + // initial `startArticleContentTranslation` would have produced. + // Keep `originalLanguage` and `translationRequesterId` as-is so + // the row's audit trail (who first asked for this translation) + // is preserved. The schema check + // `article_content_being_translated_check` requires + // `originalLanguage IS NOT NULL` whenever `beingTranslated=true`, + // which we already satisfy by selecting only rows with + // `originalLanguage IS NOT NULL` above. + const reset = await db.update(articleContentTable) + .set({ + title: original.title, + content: original.content, + beingTranslated: true, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + updated: sql`CURRENT_TIMESTAMP`, + }) + .where( + and( + eq(articleContentTable.sourceId, translation.sourceId), + eq(articleContentTable.language, translation.language), + ), + ) + .returning(); + if (reset.length < 1) continue; + // Fire-and-forget: `runArticleContentTranslation` schedules the + // `translate()` chain on its own and the caller does not await + // the model call. Each translation runs concurrently. + void runArticleContentTranslation(fedCtx, reset[0]); + } +} + /** * Runs the actual LLM translation for an `article_content` row that is * already in the placeholder / `beingTranslated` state. Awaits the From 2b43c4d9c5f894600c5e10a7ff3c10ab4ba74c1a Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 14:33:34 +0900 Subject: [PATCH 05/18] Address Codex review of the retranslation flow Three fixes uncovered by an external code review of commits 302c2427..422ee879. 1. Translate the freshest body, not the queued snapshot (models/article.ts). runArticleContentTranslation's claim step now reads the row back via RETURNING and uses `claimed.title` / `claimed.content` for the translate input. This closes a race where two restartArticleContentTranslations calls interleave: the older restart's runArticleContentTranslation would otherwise translate the older body even after the newer restart wrote the newer body into the placeholder. The claim WHERE also now requires `beingTranslated=true`, so the helper bails silently if the row has been completed (or deleted) between the caller queueing this run and the claim landing, instead of stamping a claim onto a no-longer-relevant row. 2. Make translation Update activity ids unique (models/article.ts). Previously every translation completion reused `#update/${article.updated.toISOString()}` as the activity id. `article_source.updated` is not bumped by translation completions, so the original-language Update and every per-locale translation Update for the same edit all collided. The id now incorporates the target language and the translated row's own (just-bumped) `updated` so each completion mints a distinct id, regardless of how many locales finish in close succession. 3. Catch background retranslation kickoff failures (models/article.ts). restartArticleContentTranslations did `void runArticleContentTranslation(...)`, which loses any rejection thrown by the synchronous setup before the internal `.then().catch()` chain is installed (e.g., a transient DB error during the claim UPDATE or the article-source fetch). Such rejections would surface as unhandled promise rejections in the runtime; they now go through a `.catch()` that logs the failed (sourceId, language) and the error. All 29 article-domain tests still pass. Assisted-by: Codex:gpt-5.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 82 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 16 deletions(-) diff --git a/models/article.ts b/models/article.ts index 59ad6a03e..089620128 100644 --- a/models/article.ts +++ b/models/article.ts @@ -869,7 +869,22 @@ export async function restartArticleContentTranslations( // Fire-and-forget: `runArticleContentTranslation` schedules the // `translate()` chain on its own and the caller does not await // the model call. Each translation runs concurrently. - void runArticleContentTranslation(fedCtx, reset[0]); + // The `.catch()` is here because the synchronous setup before + // the chain is installed (the claim UPDATE, the article-source + // fetch) can itself throw on a transient DB error; without it + // those rejections would surface as unhandled promise + // rejections. + const resetRow = reset[0]; + runArticleContentTranslation(fedCtx, resetRow).catch((error) => { + logger.error( + "Failed to start retranslation for {sourceId} {language}: {error}", + { + sourceId: resetRow.sourceId, + language: resetRow.language, + error, + }, + ); + }); } } @@ -910,25 +925,49 @@ async function runArticleContentTranslation( } // Take ownership of the placeholder row by stamping it with a - // JS-side `Date` that becomes our claim id. Subsequent success / - // failure writes from this run only land if the row's `updated` - // still equals this claim — a concurrent re-translation that resets - // the row out from under us bumps `updated` past this value, and - // our writes turn into no-ops instead of clobbering the fresher - // claim. Using a JS `Date` (rather than PG `CURRENT_TIMESTAMP`) - // is what makes this CAS reliable: PG `timestamptz` keeps µs - // precision while the `postgres` driver hands back JS `Date` - // values truncated to ms, so a CAS against the round-tripped - // value of a `CURRENT_TIMESTAMP` write would never match. + // JS-side `Date` that becomes our claim id, and read the row's + // freshest title/content back via `RETURNING`. Subsequent + // success / failure writes from this run only land if the row's + // `updated` still equals this claim — a concurrent re-translation + // that resets the row out from under us bumps `updated` past this + // value, and our writes turn into no-ops instead of clobbering + // the fresher claim. Using a JS `Date` (rather than PG + // `CURRENT_TIMESTAMP`) is what makes this CAS reliable: PG + // `timestamptz` keeps µs precision while the `postgres` driver + // hands back JS `Date` values truncated to ms, so a CAS against + // the round-tripped value of a `CURRENT_TIMESTAMP` write would + // never match. + // + // Two further guards live here: + // - `beingTranslated=true` on the WHERE bails out silently if the + // row has already been completed (or deleted) by another writer + // between the caller queueing this run and the claim landing. + // - The translate input below is built from `claimed.title` / + // `claimed.content` rather than the caller's `queued` snapshot. + // When two `restartArticleContentTranslations` calls race, the + // later one writes the freshest body into the placeholder; this + // helper then translates *that* body instead of the stale body + // from whichever caller it was queued for. const claim = new Date(); - await db.update(articleContentTable) + const claimedRows = await db.update(articleContentTable) .set({ updated: claim }) .where( and( eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), + eq(articleContentTable.beingTranslated, true), ), + ) + .returning(); + if (claimedRows.length < 1) { + logger.debug( + "Translation claim failed; row is not (or no longer) a " + + "placeholder ({sourceId} {language}).", + queued, ); + return; + } + const claimed = claimedRows[0]; // Fetch article source with author information for translation context. const articleSource = await db.query.articleSourceTable.findFirst({ @@ -942,11 +981,12 @@ async function runArticleContentTranslation( }, }); - // Combine title and content for translation. - const text = `# ${queued.title}\n\n${queued.content}`; + // Combine title and content for translation, using the freshest + // values read back from the claim above. + const text = `# ${claimed.title}\n\n${claimed.content}`; translate({ model, - sourceLanguage: originalLanguage, + sourceLanguage: claimed.originalLanguage ?? originalLanguage, targetLanguage, text, // Pass context for better translation quality. @@ -1010,9 +1050,19 @@ async function runArticleContentTranslation( where: { articleSourceId: article.id }, }); const articleObject = await getArticle(fedCtx, article); + // The id has to be unique across translation completions for + // this article — multiple locales can complete in close + // succession (especially after a body edit re-queues every + // existing translation), and they would all collide on + // `article.updated` since translation completions don't bump + // it. Including both the target language and the translated + // row's own `updated` (a fresh `CURRENT_TIMESTAMP` from the + // success UPDATE just above) keeps the id distinct from the + // original-language Update activity and from every other + // translation's Update for the same edit. const update = new vocab.Update({ id: new URL( - `#update/${article.updated.toISOString()}`, + `#update/${updated[0].updated.toISOString()}/${targetLanguage}`, articleObject.id ?? fedCtx.canonicalOrigin, ), actors: articleObject.attributionIds, From f649b60f05481544f199f2635f28aae2c0b62bfa Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 14:40:15 +0900 Subject: [PATCH 06/18] Serialize concurrent retranslation restarts on a row lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap the read-original-then-reset-translations sequence in restartArticleContentTranslations in a transaction that takes `SELECT … FOR UPDATE` on the article_source row. Concurrent restartArticleContentTranslations calls (driven by back-to-back edits to the same article) would otherwise each read their own snapshot of the original-language body and then overwrite each other's placeholder writes — the older restart's iteration could land after the newer restart's, leaving the placeholder pointing at the older body and the eventual translation reflecting that older body even though a newer edit had committed. The article_source row lock is the same row-level write lock that updateArticleSource takes on its UPDATE, so concurrent edits and restarts queue up cleanly without any new lock graph. The actual translate() round-trip runs after the transaction commits — the runArticleContentTranslation calls are scheduled outside the tx block — so the LLM call doesn't extend the lock window. Follow-up to the previous commit (Address Codex review of the retranslation flow), which addressed three of four issues; this one closes the remaining stale-snapshot race that survived the freshest-body claim refactor in runArticleContentTranslation. Assisted-by: Codex:gpt-5.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 112 +++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 45 deletions(-) diff --git a/models/article.ts b/models/article.ts index 089620128..747e83648 100644 --- a/models/article.ts +++ b/models/article.ts @@ -819,53 +819,76 @@ export async function restartArticleContentTranslations( articleSource: ArticleSource, ): Promise { const { db } = fedCtx.data; - const original = await getOriginalArticleContent(db, articleSource); - if (original == null) { + // Serialize the read-original-then-reset-translations sequence + // against any other writer to this article's source row. Two + // concurrent restartArticleContentTranslations calls (driven by + // back-to-back edits to the same article) would otherwise read + // their own snapshot of the original and then overwrite each + // other's placeholder writes, leaving the translation rows + // pointing at whichever snapshot's UPDATE happened to land last. + // `SELECT … FOR UPDATE` on the article_source row holds the same + // row-level write lock that updateArticleSource takes during its + // own UPDATE, so concurrent edits and restarts queue up cleanly. + // The translate() calls themselves run after the transaction + // commits so the LLM round-trip doesn't extend the lock window. + const resetRows = await db.transaction(async (tx) => { + await tx.execute(sql` + SELECT 1 FROM ${articleSourceTable} + WHERE ${articleSourceTable.id} = ${articleSource.id} + FOR UPDATE + `); + const original = await getOriginalArticleContent(tx, articleSource); + if (original == null) { + logger.debug( + "No original-language content for {sourceId}; nothing to retranslate.", + { sourceId: articleSource.id }, + ); + return []; + } + const translations = await tx.query.articleContentTable.findMany({ + where: { + sourceId: articleSource.id, + originalLanguage: { isNotNull: true }, + }, + }); + if (translations.length === 0) return []; logger.debug( - "No original-language content for {sourceId}; nothing to retranslate.", - { sourceId: articleSource.id }, + "Restarting {count} translation(s) for {sourceId}.", + { count: translations.length, sourceId: articleSource.id }, ); - return; - } - const translations = await db.query.articleContentTable.findMany({ - where: { - sourceId: articleSource.id, - originalLanguage: { isNotNull: true }, - }, + const reset: ArticleContent[] = []; + for (const translation of translations) { + // Reset the row to placeholder state mirroring the shape an + // initial `startArticleContentTranslation` would have produced. + // Keep `originalLanguage` and `translationRequesterId` as-is so + // the row's audit trail (who first asked for this translation) + // is preserved. The schema check + // `article_content_being_translated_check` requires + // `originalLanguage IS NOT NULL` whenever `beingTranslated=true`, + // which we already satisfy by selecting only rows with + // `originalLanguage IS NOT NULL` above. + const updated = await tx.update(articleContentTable) + .set({ + title: original.title, + content: original.content, + beingTranslated: true, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + updated: sql`CURRENT_TIMESTAMP`, + }) + .where( + and( + eq(articleContentTable.sourceId, translation.sourceId), + eq(articleContentTable.language, translation.language), + ), + ) + .returning(); + if (updated.length > 0) reset.push(updated[0]); + } + return reset; }); - if (translations.length === 0) return; - logger.debug( - "Restarting {count} translation(s) for {sourceId}.", - { count: translations.length, sourceId: articleSource.id }, - ); - for (const translation of translations) { - // Reset the row to placeholder state mirroring the shape an - // initial `startArticleContentTranslation` would have produced. - // Keep `originalLanguage` and `translationRequesterId` as-is so - // the row's audit trail (who first asked for this translation) - // is preserved. The schema check - // `article_content_being_translated_check` requires - // `originalLanguage IS NOT NULL` whenever `beingTranslated=true`, - // which we already satisfy by selecting only rows with - // `originalLanguage IS NOT NULL` above. - const reset = await db.update(articleContentTable) - .set({ - title: original.title, - content: original.content, - beingTranslated: true, - summary: null, - summaryStarted: null, - summaryUnnecessary: false, - updated: sql`CURRENT_TIMESTAMP`, - }) - .where( - and( - eq(articleContentTable.sourceId, translation.sourceId), - eq(articleContentTable.language, translation.language), - ), - ) - .returning(); - if (reset.length < 1) continue; + for (const resetRow of resetRows) { // Fire-and-forget: `runArticleContentTranslation` schedules the // `translate()` chain on its own and the caller does not await // the model call. Each translation runs concurrently. @@ -874,7 +897,6 @@ export async function restartArticleContentTranslations( // fetch) can itself throw on a transient DB error; without it // those rejections would surface as unhandled promise // rejections. - const resetRow = reset[0]; runArticleContentTranslation(fedCtx, resetRow).catch((error) => { logger.error( "Failed to start retranslation for {sourceId} {language}: {error}", From 7d342a40373cf7885a9dea19370a2b4336c9ab27 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:04:15 +0900 Subject: [PATCH 07/18] Skip retranslation when allowLlmTranslation is false updateArticle() called restartArticleContentTranslations() on every body change without checking the article-level allowLlmTranslation flag. When an edit toggles the flag off and changes the body in the same update, the previous behavior would still enqueue background translate() runs for every existing translation row, overriding the author's just-expressed wish and incurring model usage / cost / data-egress for translations they no longer want. Gate the call on articleSource.allowLlmTranslation. Existing translation rows from before the flag was flipped off are left alone (stale, not refreshed); flipping the flag back on and editing the body again brings them back into sync. Adds a regression test covering the gate. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179610102 Assisted-by: Codex:gpt-5.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.lifecycle.test.ts | 67 ++++++++++++++++++++++++++++++++ models/article.ts | 10 ++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/models/article.lifecycle.test.ts b/models/article.lifecycle.test.ts index 898101b37..df38cc3fc 100644 --- a/models/article.lifecycle.test.ts +++ b/models/article.lifecycle.test.ts @@ -235,3 +235,70 @@ test("updateArticle() leaves existing translations alone on title-only edits", a assert.equal(ko.summary, "Existing summary."); }); }); + +test("updateArticle() does not retranslate when allowLlmTranslation is false", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "noretranslateauthor", + name: "No Retranslate Author", + email: "noretranslateauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "noretranslaterequester", + name: "No Retranslate Requester", + email: "noretranslaterequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "no-retranslate-article", + tags: [], + // The author opted out of LLM translation: no placeholder + // resets, no background `translate()` calls. This guards + // against re-enqueueing translation work in the same edit + // that flips the switch off (or any later body edit while + // the switch stays off). + allowLlmTranslation: false, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // A pre-existing translation row from before the switch was + // turned off (or seeded by a prior allowLlmTranslation=true + // window) — must remain unchanged. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Existing translated title", + content: "Existing translated body", + summary: "Existing summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // No placeholder reset, no `translate()` queued: the row + // stays exactly as it was. No async waiting needed because + // the gate skips the synchronous claim-and-reset entirely. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Existing translated title"); + assert.equal(ko.content, "Existing translated body"); + assert.equal(ko.summary, "Existing summary."); + }); +}); diff --git a/models/article.ts b/models/article.ts index 747e83648..89cb48ac2 100644 --- a/models/article.ts +++ b/models/article.ts @@ -512,7 +512,15 @@ export async function updateArticle( // becomes available). We `await` the synchronous claim-and-reset // step so the placeholders are visible by the time this function // returns; the actual `translate()` calls run in the background. - if (originalContentChanged) { + // + // Gate on the article-level `allowLlmTranslation` switch so an + // edit that turns LLM translation off in the same update does + // not still enqueue background `translate()` runs against the + // author's just-expressed wish. Existing translation rows from + // before the switch was flipped are left alone (stale, not + // refreshed); re-enabling the switch and editing the body again + // brings them back into sync. + if (originalContentChanged && articleSource.allowLlmTranslation) { await restartArticleContentTranslations(fedCtx, articleSource); } return post; From bbbfaa9498e1c96a8da60d37e5fb44dbee666438 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:05:49 +0900 Subject: [PATCH 08/18] Extract waitFor test helper to test/wait.ts Both models/article.background.test.ts and models/article.lifecycle.test.ts had a private copy of the same async-polling helper that watches for background-promise side effects (translation completion, placeholder reset, summary cleanup) without exposing the underlying promise to the test. Move it to test/wait.ts so future tests don't grow a third copy when they hit the same observation pattern. Behavior is unchanged: same 50 ms poll cadence, same default 10 s deadline, same error message on timeout. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179595557 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.background.test.ts | 12 +----------- models/article.lifecycle.test.ts | 12 +----------- test/wait.ts | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) create mode 100644 test/wait.ts diff --git a/models/article.background.test.ts b/models/article.background.test.ts index 80cd783cd..8433a23a5 100644 --- a/models/article.background.test.ts +++ b/models/article.background.test.ts @@ -11,19 +11,9 @@ import { insertAccountWithActor, withRollback, } from "../test/postgres.ts"; +import { waitFor } from "../test/wait.ts"; import { generateUuidV7 } from "./uuid.ts"; -async function waitFor(predicate: () => Promise, timeoutMs = 10000) { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (await predicate()) return; - await new Promise((resolve) => setTimeout(resolve, 50)); - } - throw new Error( - `Timed out waiting for async background state after ${timeoutMs}ms`, - ); -} - test("startArticleContentSummary() resets summaryStarted when summarization fails", async () => { await withRollback(async (tx) => { const author = await insertAccountWithActor(tx, { diff --git a/models/article.lifecycle.test.ts b/models/article.lifecycle.test.ts index df38cc3fc..985e35b09 100644 --- a/models/article.lifecycle.test.ts +++ b/models/article.lifecycle.test.ts @@ -7,23 +7,13 @@ import { insertAccountWithActor, withRollback, } from "../test/postgres.ts"; +import { waitFor } from "../test/wait.ts"; const fakeModels = { summarizer: {} as never, translator: {} as never, }; -async function waitFor(predicate: () => Promise, timeoutMs = 10000) { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (await predicate()) return; - await new Promise((resolve) => setTimeout(resolve, 50)); - } - throw new Error( - `Timed out waiting for async background state after ${timeoutMs}ms`, - ); -} - test("createArticle() creates a post and timeline entry for the author", async () => { await withRollback(async (tx) => { const fedCtx = createFedCtx(tx); diff --git a/test/wait.ts b/test/wait.ts new file mode 100644 index 000000000..69ddd48af --- /dev/null +++ b/test/wait.ts @@ -0,0 +1,20 @@ +/** + * Polls `predicate` every 50 ms until it resolves to `true` or + * `timeoutMs` elapses. Used by tests that observe state which a + * background promise eventually mutates (translation completion, + * placeholder reset, summary cleanup, etc.) without exposing that + * promise to the test. + */ +export async function waitFor( + predicate: () => Promise, + timeoutMs = 10000, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (await predicate()) return; + await new Promise((resolve) => setTimeout(resolve, 50)); + } + throw new Error( + `Timed out waiting for async background state after ${timeoutMs}ms`, + ); +} From 59503306de09a2c9a4b37dbac38c78a64b3bb1fa Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:16:22 +0900 Subject: [PATCH 09/18] Batch translation-row resets into a single UPDATE restartArticleContentTranslations() previously listed every translation row for the article via a relational findMany and then issued a per-row UPDATE inside the transaction. Each row gets the same placeholder values, so the loop is N+1 round trips against the same row set. Replace the findMany + per-row UPDATE loop with a single UPDATE filtered on (sourceId = articleSource.id AND originalLanguage IS NOT NULL). The filter targets exactly the same rows the previous implementation iterated; the schema check article_content_being_translated_check requires originalLanguage IS NOT NULL whenever beingTranslated=true so the filter doubles as the schema-safety guard the per-iteration comment used to explain. RETURNING gives us the reset rows for the post-commit fan-out into runArticleContentTranslation, the same shape callers depended on. Behavior is otherwise unchanged: still inside the same transaction that holds the article_source FOR UPDATE row lock, still preserves originalLanguage and translationRequesterId by omitting them from set, still bumps updated to CURRENT_TIMESTAMP. The "no rows to restart" early return is now implicit in the UPDATE returning an empty array; the surrounding logger calls are kept and reworded slightly for the new shape. All 30 article-domain tests still pass. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179658355 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 76 ++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/models/article.ts b/models/article.ts index 89cb48ac2..fd243c5eb 100644 --- a/models/article.ts +++ b/models/article.ts @@ -10,7 +10,7 @@ import { sendTagsPubRelayActivity } from "@hackerspub/federation/tags-pub"; import { getLogger } from "@logtape/logtape"; import { minBy } from "@std/collections/min-by"; import type { LanguageModel } from "ai"; -import { and, eq, isNull, lt, or, sql } from "drizzle-orm"; +import { and, eq, isNotNull, isNull, lt, or, sql } from "drizzle-orm"; import postgres from "postgres"; import type { ContextData, Models } from "./context.ts"; import type { Database } from "./db.ts"; @@ -853,46 +853,40 @@ export async function restartArticleContentTranslations( ); return []; } - const translations = await tx.query.articleContentTable.findMany({ - where: { - sourceId: articleSource.id, - originalLanguage: { isNotNull: true }, - }, - }); - if (translations.length === 0) return []; - logger.debug( - "Restarting {count} translation(s) for {sourceId}.", - { count: translations.length, sourceId: articleSource.id }, - ); - const reset: ArticleContent[] = []; - for (const translation of translations) { - // Reset the row to placeholder state mirroring the shape an - // initial `startArticleContentTranslation` would have produced. - // Keep `originalLanguage` and `translationRequesterId` as-is so - // the row's audit trail (who first asked for this translation) - // is preserved. The schema check - // `article_content_being_translated_check` requires - // `originalLanguage IS NOT NULL` whenever `beingTranslated=true`, - // which we already satisfy by selecting only rows with - // `originalLanguage IS NOT NULL` above. - const updated = await tx.update(articleContentTable) - .set({ - title: original.title, - content: original.content, - beingTranslated: true, - summary: null, - summaryStarted: null, - summaryUnnecessary: false, - updated: sql`CURRENT_TIMESTAMP`, - }) - .where( - and( - eq(articleContentTable.sourceId, translation.sourceId), - eq(articleContentTable.language, translation.language), - ), - ) - .returning(); - if (updated.length > 0) reset.push(updated[0]); + // Reset every translation row to placeholder state in a single + // statement, mirroring the shape an initial + // `startArticleContentTranslation` would have produced. The + // `originalLanguage IS NOT NULL` filter targets exactly the + // translation rows for this article (the same set the previous + // implementation listed via `findMany` and then iterated over); + // the schema check `article_content_being_translated_check` + // requires `originalLanguage IS NOT NULL` whenever + // `beingTranslated=true`, which the filter already satisfies. + // `originalLanguage` and `translationRequesterId` are not in + // `set`, so each row's audit trail (who first asked for this + // translation) is preserved. + const reset = await tx.update(articleContentTable) + .set({ + title: original.title, + content: original.content, + beingTranslated: true, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + updated: sql`CURRENT_TIMESTAMP`, + }) + .where( + and( + eq(articleContentTable.sourceId, articleSource.id), + isNotNull(articleContentTable.originalLanguage), + ), + ) + .returning(); + if (reset.length > 0) { + logger.debug( + "Restarted {count} translation(s) for {sourceId}.", + { count: reset.length, sourceId: articleSource.id }, + ); } return reset; }); From 90a14397f35250bb1750ae0f9e97a8798af69e3d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:32:27 +0900 Subject: [PATCH 10/18] Clear cached OG image on translation body change article_content.ogImageKey caches the rendered OG image for a row. The image is rendered from the row's title and content, so any write that replaces those fields makes the cached image stale: the crawler that follows the cached URL will see an image whose heading or excerpt no longer matches the live body. Add ogImageKey: null to the two write paths that replace a translation row's title/content: - restartArticleContentTranslations()'s reset UPDATE, which copies the new original title/content into every translation placeholder on a body edit. - runArticleContentTranslation()'s success write, which lands the newly translated title/content for one row. Both paths already clear the matching summary state (summary, summaryStarted, summaryUnnecessary) for the same reason; the OG image is the same kind of derived asset and gets cleared next to them. Lazy regeneration on the next OG-image request rebuilds from the freshly written body. The original-language UPDATE in updateArticleSource() is left alone for now; that staleness predates this PR and is the same question for the original row, not specifically a retranslation concern. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179693940 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/models/article.ts b/models/article.ts index fd243c5eb..67244b867 100644 --- a/models/article.ts +++ b/models/article.ts @@ -873,6 +873,11 @@ export async function restartArticleContentTranslations( summary: null, summaryStarted: null, summaryUnnecessary: false, + // Clear the cached OG image too: it was rendered from the + // previous title/body and is now stale. Lazy regeneration + // on the next OG-image request will rebuild it from the + // freshly translated content. + ogImageKey: null, updated: sql`CURRENT_TIMESTAMP`, }) .where( @@ -1038,6 +1043,11 @@ async function runArticleContentTranslation( summary: null, summaryStarted: null, summaryUnnecessary: false, + // The cached OG image was rendered from the placeholder + // (or from a prior translation of an older body) and is + // now stale; clear it for the same reason as `summary` so + // the next request regenerates it from the translated text. + ogImageKey: null, }) .where( and( From 394ddce72cbbfc4c884a73a267d5ee4d33252dd1 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:34:37 +0900 Subject: [PATCH 11/18] Gate translation claim on the queue stamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runArticleContentTranslation()'s claim UPDATE only required beingTranslated=true on the row, so two background runs scheduled for the same (sourceId, language) by back-to-back edits would both pass the claim check and both call translate(). The success / failure CAS at the end still ensured only one write landed, but the second translate() round trip was wasted LLM cost. A related concern is that JS Date is millisecond-precision, so two claim values created in the same millisecond compare equal; the existing CAS happens to remain correct because the first success bumps `updated` to a new value that the second can no longer match, but the protocol felt easier to reason about with a stronger uniqueness story up front. Add `updated = queued.updated` to the claim WHERE so each run only claims the placeholder it was queued for. When a later restart re-stamps the row, the older run's claim turns into a no-op instead of going through to translate(). For this CAS to be lossless, every writer that produces a row the helper later treats as `queued` now stamps `updated` with a JS Date (which round-trips through the postgres driver at the same ms precision the JS side stores) instead of letting it default to PG's CURRENT_TIMESTAMP (which keeps µs and would lose precision on the round trip): - startArticleContentTranslation()'s INSERT: explicit updated: queueStamp. - startArticleContentTranslation()'s stuck-row recovery: an explicit re-stamp UPDATE before handing the row off to the helper, so the helper's CAS has a value to match. Returns early without scheduling translate() if a concurrent writer took the row out from under us between the SELECT and the re-stamp. - restartArticleContentTranslations()'s reset UPDATE: explicit updated: restartStamp instead of sql`CURRENT_TIMESTAMP`. The claim WHERE in runArticleContentTranslation() then matches this stamp exactly. All 30 article-domain tests still pass; the existing fixtures already insert with JS Date stamps, so no test changes are needed. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179693946 https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179699592 Assisted-by: Codex:gpt-5.5 Assisted-by: Gemini Code Assist:gemini-2.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 60 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/models/article.ts b/models/article.ts index 67244b867..89cf4ed1b 100644 --- a/models/article.ts +++ b/models/article.ts @@ -770,6 +770,15 @@ export async function startArticleContentTranslation( { content, targetLanguage, requester }: ArticleContentTranslationOptions, ): Promise { const { db } = fedCtx.data; + // Stamp `updated` with a JS-side Date rather than letting it + // default to PG's `CURRENT_TIMESTAMP`. See the long comment on + // the CAS in `runArticleContentTranslation` for why this matters: + // the helper's claim WHERE compares the row's stored `updated` + // against `queued.updated`, and the comparison is only reliable + // when both sides round-trip through the same precision (the + // `postgres` driver hands JS `Date` values back at ms precision + // while `timestamptz` keeps µs). + const queueStamp = new Date(); const inserted = await db.insert(articleContentTable).values({ sourceId: content.sourceId, language: targetLanguage, @@ -778,6 +787,7 @@ export async function startArticleContentTranslation( originalLanguage: content.language, translationRequesterId: requester.id, beingTranslated: true, + updated: queueStamp, }).onConflictDoNothing().returning(); let queued: ArticleContent; if (inserted.length < 1) { @@ -795,7 +805,30 @@ export async function startArticleContentTranslation( logger.debug("Translation already started or not needed."); return translated!; } - queued = translated; + // The placeholder is stale (older than 30 min, presumably from + // a crashed previous run). Re-stamp it with a fresh JS Date + // before handing off to the helper so the helper's claim CAS + // has a value it can match — without this, the CAS would be + // comparing against the row's possibly-µs-precision DB + // timestamp via a ms-truncated round-trip and never hit. + const reclaim = new Date(); + const reclaimed = await db.update(articleContentTable) + .set({ updated: reclaim }) + .where( + and( + eq(articleContentTable.sourceId, content.sourceId), + eq(articleContentTable.language, targetLanguage), + eq(articleContentTable.beingTranslated, true), + ), + ) + .returning(); + if (reclaimed.length < 1) { + // Lost the race to another writer that just completed (or + // restarted) this row between our SELECT and our UPDATE. + // Return the row we observed; nothing further to do. + return translated; + } + queued = reclaimed[0]; } else { queued = inserted[0]; } @@ -865,6 +898,12 @@ export async function restartArticleContentTranslations( // `originalLanguage` and `translationRequesterId` are not in // `set`, so each row's audit trail (who first asked for this // translation) is preserved. + // Stamp `updated` with a JS-side Date rather than PG + // `CURRENT_TIMESTAMP` so it round-trips losslessly through the + // driver and the per-row claim CAS in + // `runArticleContentTranslation` can match it; see the long + // comment on that claim for the µs/ms precision rationale. + const restartStamp = new Date(); const reset = await tx.update(articleContentTable) .set({ title: original.title, @@ -878,7 +917,7 @@ export async function restartArticleContentTranslations( // on the next OG-image request will rebuild it from the // freshly translated content. ogImageKey: null, - updated: sql`CURRENT_TIMESTAMP`, + updated: restartStamp, }) .where( and( @@ -967,10 +1006,24 @@ async function runArticleContentTranslation( // the round-tripped value of a `CURRENT_TIMESTAMP` write would // never match. // - // Two further guards live here: + // Three further guards live here: // - `beingTranslated=true` on the WHERE bails out silently if the // row has already been completed (or deleted) by another writer // between the caller queueing this run and the claim landing. + // - `updated = queued.updated` makes the claim itself + // conditional on the row not having been re-stamped under us + // by a concurrent `restartArticleContentTranslations` (or a + // parallel run for the same row). Without this, two runs + // triggered by back-to-back edits both pass the + // `beingTranslated` check and both end up calling `translate()`, + // wasting an LLM round trip even though the success/failure + // CAS below would still ensure only one write lands. All + // writers that produce a `queued` for this helper + // (`startArticleContentTranslation`'s INSERT, its stuck-row + // re-stamp branch, and `restartArticleContentTranslations`'s + // reset UPDATE) explicitly stamp `updated` with a JS `Date` + // for the same round-trip-precision reason as the claim above; + // the comparison is lossless. // - The translate input below is built from `claimed.title` / // `claimed.content` rather than the caller's `queued` snapshot. // When two `restartArticleContentTranslations` calls race, the @@ -985,6 +1038,7 @@ async function runArticleContentTranslation( eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), eq(articleContentTable.beingTranslated, true), + eq(articleContentTable.updated, queued.updated), ), ) .returning(); From f59757a238887f3b70022d57e68d7a71c12258ac Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 15:46:17 +0900 Subject: [PATCH 12/18] Tolerate model preamble in translation parsing The success path of runArticleContentTranslation() split the LLM output into title and body with two strict regexes anchored at ^\s*#\s+, so any model output that didn't begin exactly with a `# ` H1 left title="" and dumped the entire raw output into content. The most likely cause in production was a model that prefaced the translation with a brief commentary line (a translator's note, hedging about preserved terms, etc.) before the H1. The previous commit's fetchWebPage switch made this much less common, but the parser was still fragile. Replace the inline regexes with a small splitTranslationTitleAndContent helper that: - Accepts the first H1 line anywhere in the output as the title, not just at offset 0. Any text before that H1 is dropped on the assumption it is model commentary, since the prompt did not contain anything before `# Title`. - Falls back to the first non-empty line as the title (with the rest as the body) if there is no H1 at all. - Restricts H1 detection to a single `#` followed by whitespace, so later `## Section` headings inside the body are not mis-promoted to the title. - Trims surrounding whitespace from both halves. The helper is exported and covered by a dedicated unit test file exercising clean H1 + body, H1 with preamble, no H1 at all, H2-only input, H1-only input, empty / whitespace-only input, and title-line whitespace trimming. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179756048 Assisted-by: Gemini Code Assist:gemini-2.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.translation-split.test.ts | 122 +++++++++++++++++++++++ models/article.ts | 44 +++++++- 2 files changed, 163 insertions(+), 3 deletions(-) create mode 100644 models/article.translation-split.test.ts diff --git a/models/article.translation-split.test.ts b/models/article.translation-split.test.ts new file mode 100644 index 000000000..1a67ed640 --- /dev/null +++ b/models/article.translation-split.test.ts @@ -0,0 +1,122 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { splitTranslationTitleAndContent } from "./article.ts"; + +test("splitTranslationTitleAndContent() splits clean # Title + body", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n\nTranslated body line 1.\n\nTranslated body line 2.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Translated body line 1.\n\nTranslated body line 2.", + ); +}); + +test("splitTranslationTitleAndContent() drops preamble before the H1", () => { + // The translator is prompted with `# Title\n\nbody`, so any text + // before the matching H1 in the output is model commentary + // (apologies, hedging, "translator's note" preambles, etc.) and + // shouldn't end up in the persisted body. + const { title, content } = splitTranslationTitleAndContent( + "Note: I have preserved technical terms in their original form.\n" + + "\n" + + "# Translated title\n" + + "\n" + + "Translated body.", + ); + assert.equal(title, "Translated title"); + assert.equal(content, "Translated body."); +}); + +test("splitTranslationTitleAndContent() falls back to first non-empty line when no H1", () => { + // Some models occasionally drop the H1 framing entirely. Without + // a fallback the old strict-regex parser left `title=""` and put + // the entire output into `content`, which renders as a + // titleless article in the UI. + const { title, content } = splitTranslationTitleAndContent( + "\n" + + "Translated title\n" + + "\n" + + "Translated body line 1.\n" + + "Translated body line 2.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Translated body line 1.\nTranslated body line 2.", + ); +}); + +test("splitTranslationTitleAndContent() ignores ## H2 headings as title candidates", () => { + // The regex must only match `# ` (single hash followed by + // whitespace), not `## `, so a body whose first line is a + // section heading rather than a document title doesn't have its + // first H2 mis-promoted to the article title. + const { title, content } = splitTranslationTitleAndContent( + "## Section\n\nBody under section.", + ); + // No H1 anywhere: fall back to first non-empty line. The first + // non-empty line is the H2, which gets used as the title. Not + // ideal but matches the no-H1 fallback behavior; the caller + // upstream of this helper is supposed to prompt with `# Title` + // and this is just a defensive tail. + assert.equal(title, "## Section"); + assert.equal(content, "Body under section."); +}); + +test("splitTranslationTitleAndContent() preserves later ## H2 headings inside the body", () => { + // A document with a real H1 followed by H2 sections must keep + // the H2 sections in the body untouched. + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n" + + "\n" + + "Intro paragraph.\n" + + "\n" + + "## Section A\n" + + "\n" + + "Body of section A.\n" + + "\n" + + "## Section B\n" + + "\n" + + "Body of section B.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Intro paragraph.\n" + + "\n" + + "## Section A\n" + + "\n" + + "Body of section A.\n" + + "\n" + + "## Section B\n" + + "\n" + + "Body of section B.", + ); +}); + +test("splitTranslationTitleAndContent() handles entirely empty output", () => { + const { title, content } = splitTranslationTitleAndContent(""); + assert.equal(title, ""); + assert.equal(content, ""); + const ws = splitTranslationTitleAndContent(" \n\n \n"); + assert.equal(ws.title, ""); + assert.equal(ws.content, ""); +}); + +test("splitTranslationTitleAndContent() handles H1-only output (no body)", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n", + ); + assert.equal(title, "Translated title"); + assert.equal(content, ""); +}); + +test("splitTranslationTitleAndContent() trims surrounding whitespace from the title", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title with extra spaces \n\nBody.", + ); + assert.equal(title, "Translated title with extra spaces"); + assert.equal(content, "Body."); +}); diff --git a/models/article.ts b/models/article.ts index 89cf4ed1b..c31564ba0 100644 --- a/models/article.ts +++ b/models/article.ts @@ -956,6 +956,46 @@ export async function restartArticleContentTranslations( } } +/** + * Splits an LLM translation output into its title and body halves. + * + * The translator is prompted with `# {title}\n\n{body}` and is + * expected to return the same shape with both halves translated. + * In practice models usually do; when they don't (e.g., they + * preface the output with a brief commentary line, or drop the H1 + * heading altogether), a strict regex parser leaves `title` empty + * and stuffs the entire output, including the broken framing, + * into `content`. Be more forgiving: + * + * - The first H1 line anywhere in the output is taken as the + * title. Any text before it is dropped on the assumption it is + * model commentary, since the prompt did not contain anything + * before `# Title`. + * - If the output has no H1 line at all, the first non-empty line + * becomes the title and the rest becomes the body. + * - The H1 detection only matches lines that begin with exactly + * `# ` (a single `#` followed by whitespace), so subsequent + * `## Section` headings inside the body are not mis-extracted. + */ +export function splitTranslationTitleAndContent( + translation: string, +): { title: string; content: string } { + const trimmed = translation.trim(); + const h1Match = trimmed.match(/(?:^|\n)\s*#\s+([^\n]+)/); + if (h1Match != null) { + const title = h1Match[1].trim(); + const matchEnd = trimmed.indexOf(h1Match[0]) + h1Match[0].length; + const content = trimmed.slice(matchEnd).trim(); + return { title, content }; + } + const lines = trimmed.split(/\r?\n/); + const firstNonEmpty = lines.findIndex((l) => l.trim().length > 0); + if (firstNonEmpty < 0) return { title: "", content: "" }; + const title = lines[firstNonEmpty].trim(); + const content = lines.slice(firstNonEmpty + 1).join("\n").trim(); + return { title, content }; +} + /** * Runs the actual LLM translation for an `article_content` row that is * already in the placeholder / `beingTranslated` state. Awaits the @@ -1081,9 +1121,7 @@ async function runArticleContentTranslation( ...queued, translation, }); - // Split the translation into title and content. - const title = translation.match(/^\s*#\s+([^\n]*)/)?.[1] ?? ""; - const content = translation.replace(/^\s*#\s+[^\n]*\s*/, "").trim(); + const { title, content } = splitTranslationTitleAndContent(translation); const updated = await db.update(articleContentTable) .set({ title, From f0dc3f4304bd1c4936f1bb25a5f6490cbd4a43ce Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:04:23 +0900 Subject: [PATCH 13/18] Skip human-curated translations in restart restartArticleContentTranslations()'s reset filter selected every row with `originalLanguage IS NOT NULL`, which catches both LLM-requested translation rows (carrying `translationRequesterId`) and human-curated translation rows (carrying `translatorId`). On a body edit, a human translation would therefore be reset to the new source-language placeholder and re-run through the LLM, silently destroying the translator's work and leaving incorrect attribution metadata on machine-generated output. Add `translatorId IS NULL` to the WHERE so the restart only touches LLM-requested rows. The schema check `article_content_translator_translation_requester_id_check` already makes `translatorId` and `translationRequesterId` mutually exclusive, so the new predicate cleanly partitions human vs LLM translations without needing to also test `translationRequesterId IS NOT NULL`. Adds a regression test that creates one human translation and one LLM translation alongside it on the same article, edits the body, and asserts the human row is untouched while the LLM row is picked up by the restart. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179825969 Assisted-by: Codex:gpt-5.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.lifecycle.test.ts | 101 +++++++++++++++++++++++++++++++ models/article.ts | 10 +++ 2 files changed, 111 insertions(+) diff --git a/models/article.lifecycle.test.ts b/models/article.lifecycle.test.ts index 985e35b09..067a31eb2 100644 --- a/models/article.lifecycle.test.ts +++ b/models/article.lifecycle.test.ts @@ -292,3 +292,104 @@ test("updateArticle() does not retranslate when allowLlmTranslation is false", a assert.equal(ko.summary, "Existing summary."); }); }); + +test("updateArticle() does not retranslate human-curated translation rows", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "humantranslateauthor", + name: "Human Translate Author", + email: "humantranslateauthor@example.com", + }); + const humanTranslator = await insertAccountWithActor(tx, { + username: "humantranslator", + name: "Human Translator", + email: "humantranslator@example.com", + }); + const llmRequester = await insertAccountWithActor(tx, { + username: "humantranslatellmrequester", + name: "LLM Requester", + email: "humantranslatellmrequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "human-translate-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // A human-curated translation row: `translatorId` is set + // (and `translationRequesterId` is null per the schema check + // `article_content_translator_translation_requester_id_check`, + // which makes the two columns mutually exclusive). This row + // must survive a body edit untouched: resetting it to a + // source-language placeholder for the LLM to re-do would + // silently destroy the translator's work. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Human-curated translated title", + content: "Human-curated translated body", + summary: "Human-curated summary.", + originalLanguage: "en", + translatorId: humanTranslator.account.id, + translationRequesterId: null, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + // An LLM-requested translation row alongside it, to confirm + // the gate is selective rather than blanket-skipping the + // restart for the whole article. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ja", + title: "LLM translated title", + content: "LLM translated body", + summary: "LLM summary.", + originalLanguage: "en", + translatorId: null, + translationRequesterId: llmRequester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // Human row stays exactly as it was. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Human-curated translated title"); + assert.equal(ko.content, "Human-curated translated body"); + assert.equal(ko.summary, "Human-curated summary."); + assert.equal(ko.translatorId, humanTranslator.account.id); + + // The LLM row, in contrast, IS picked up by the restart and + // ends up either as a placeholder (which the failing stub + // translator then deletes) or already deleted by the + // failure-cleanup path. + await waitFor(async () => { + const ja = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ja" }, + }); + if (ja == null) return true; + return ja.beingTranslated === true && + ja.title === "Original article" && + ja.content === "Edited body"; + }); + }); +}); diff --git a/models/article.ts b/models/article.ts index c31564ba0..db571b2cb 100644 --- a/models/article.ts +++ b/models/article.ts @@ -923,6 +923,16 @@ export async function restartArticleContentTranslations( and( eq(articleContentTable.sourceId, articleSource.id), isNotNull(articleContentTable.originalLanguage), + // Only LLM-requested translations. Human translations + // carry `translatorId` instead of `translationRequesterId` + // (the schema check + // `article_content_translator_translation_requester_id_check` + // makes the two columns mutually exclusive), and resetting + // a curated human translation back to a source-language + // placeholder so the LLM can re-do it would silently + // destroy that contributor's work and mis-attribute the + // result. + isNull(articleContentTable.translatorId), ), ) .returning(); From ff61bc24db64378b8e9ca6e9b5d2bf1477c8713d Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:05:31 +0900 Subject: [PATCH 14/18] Drop redundant originalLanguage fallback The translate() input in runArticleContentTranslation() passed sourceLanguage as `claimed.originalLanguage ?? originalLanguage`, where the right-hand side was the destructured local from the caller-provided `queued`. The fallback was dead code: the claim WHERE required `beingTranslated=true`, and the schema check `article_content_being_translated_check` makes that imply `originalLanguage IS NOT NULL`, so `claimed.originalLanguage` is always set on a row that came back from the claim's RETURNING. Drop the fallback and use a non-null assertion on `claimed.originalLanguage` (drizzle types the column as nullable because it is nullable in general, but in this position the schema guarantees otherwise). A short comment captures the chain of guarantees so a future reader doesn't reach for the same defensive `??` again. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179808632 Assisted-by: Gemini Code Assist:gemini-2.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/models/article.ts b/models/article.ts index db571b2cb..c6d6264b3 100644 --- a/models/article.ts +++ b/models/article.ts @@ -1117,9 +1117,14 @@ async function runArticleContentTranslation( // Combine title and content for translation, using the freshest // values read back from the claim above. const text = `# ${claimed.title}\n\n${claimed.content}`; + // `claimed.originalLanguage` is non-null in practice: the claim + // WHERE required `beingTranslated=true`, and the schema check + // `article_content_being_translated_check` makes that imply + // `originalLanguage IS NOT NULL`. Drizzle types it as nullable + // because the column is nullable in general, so assert. translate({ model, - sourceLanguage: claimed.originalLanguage ?? originalLanguage, + sourceLanguage: claimed.originalLanguage!, targetLanguage, text, // Pass context for better translation quality. From 4a6fbaf36e2ad7ae7f92c5ee2f7ba5430873ef0f Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:25:07 +0900 Subject: [PATCH 15/18] Make stuck-row reclaim CAS-safe via staleness gate The stuck-row reclaim branch in startArticleContentTranslation() SELECTed the row, decided it was stale (>30 min old and still beingTranslated), then UPDATEd it with only beingTranslated=true in the WHERE. A concurrent worker that landed its own reclaim in the gap between our SELECT and our UPDATE would have its fresh claim overwritten by ours, and both workers would then go on to call runArticleContentTranslation(), wasting one LLM round trip. Repeat the same staleness check inside the UPDATE itself (`updated < CURRENT_TIMESTAMP - INTERVAL '30 minutes'`). Under PG read-committed isolation, the second UPDATE waits for the first's row lock, then re-evaluates its WHERE on the now-fresh row state and drops the row from the candidate set, so the second worker's reclaim returns 0 rows and bails cleanly. Same predicate as the SELECT, so it stays in lockstep with the visibility logic above. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179884732 Assisted-by: CodeRabbit:cr-3 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/models/article.ts b/models/article.ts index c6d6264b3..cad516e6a 100644 --- a/models/article.ts +++ b/models/article.ts @@ -819,12 +819,25 @@ export async function startArticleContentTranslation( eq(articleContentTable.sourceId, content.sourceId), eq(articleContentTable.language, targetLanguage), eq(articleContentTable.beingTranslated, true), + // Repeat the staleness check inside the UPDATE itself so + // the reclaim is CAS-safe. If a concurrent worker just + // reclaimed the same stale placeholder between our SELECT + // above and this UPDATE, their reclaim wrote a fresh + // `updated` past the threshold and PG's UPDATE re-evals + // this WHERE on the new row state, which makes our + // UPDATE drop the row from the candidate set and return + // 0 rows. That keeps us from stomping on the other + // worker's claim and double-firing `translate()`. + lt( + articleContentTable.updated, + sql`CURRENT_TIMESTAMP - INTERVAL '30 minutes'`, + ), ), ) .returning(); if (reclaimed.length < 1) { - // Lost the race to another writer that just completed (or - // restarted) this row between our SELECT and our UPDATE. + // Lost the race to another writer that just reclaimed this + // row, or completed it, between our SELECT and our UPDATE. // Return the row we observed; nothing further to do. return translated; } From 0211c609b4d6d80d2fb808134ebf3ec4dfebc11b Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:26:10 +0900 Subject: [PATCH 16/18] Use Drizzle .for('update') for restart row lock restartArticleContentTranslations() acquired the article_source row lock with raw SQL via tx.execute(sql\`SELECT 1 FROM ... FOR UPDATE\`). Drizzle exposes the same primitive idiomatically as .for("update") on a select query, which keeps column references and parameter binding consistent with the rest of this file's query style. Same lock, same semantics, no behavior change. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179886716 Assisted-by: Gemini Code Assist:gemini-2.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/models/article.ts b/models/article.ts index cad516e6a..592abb553 100644 --- a/models/article.ts +++ b/models/article.ts @@ -886,11 +886,10 @@ export async function restartArticleContentTranslations( // The translate() calls themselves run after the transaction // commits so the LLM round-trip doesn't extend the lock window. const resetRows = await db.transaction(async (tx) => { - await tx.execute(sql` - SELECT 1 FROM ${articleSourceTable} - WHERE ${articleSourceTable.id} = ${articleSource.id} - FOR UPDATE - `); + await tx.select({ id: articleSourceTable.id }) + .from(articleSourceTable) + .where(eq(articleSourceTable.id, articleSource.id)) + .for("update"); const original = await getOriginalArticleContent(tx, articleSource); if (original == null) { logger.debug( From 5fd961d6e96ce94530360fb9a5cad6984dc002d0 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:28:47 +0900 Subject: [PATCH 17/18] Anchor translation title parsing to the first line splitTranslationTitleAndContent() previously took the first H1 line anywhere in the output as the title, dropping any text before it on the assumption that it was model commentary. That handled a "preamble before # Title" case nicely but introduced a data-loss path: when the model omitted the article-title H1 and the body happened to contain its own H1 section heading, the parser silently truncated the article (mis-promoted body H1 to title, dropped the intro paragraphs above it). Silent truncation is a worse failure mode than a visible-but-ugly title. Anchor parsing to the first line of the trimmed output: - The first line becomes the title. - If it begins with `# `, the marker is stripped; otherwise the whole line is the title verbatim. - Everything after the first line becomes the body, untouched. The strict implementation also folds in the related cleanup of the prior fallback branch. Because `trimmed` is the result of `String#trim()`, it is guaranteed to either be empty or begin with a non-whitespace character, so the previous `findIndex` for the first non-empty line was redundant and is now removed. The H1-marker regex is restricted to a single `#` followed by whitespace so a first-line `## Section` is taken verbatim instead of being mis-stripped. Tests in models/article.translation-split.test.ts are updated to match the new behavior: the preamble case now keeps the preamble visible as the title with the body's `# Title` framing preserved, and the rest of the cases (clean H1+body, no-H1 fallback, first-line H2, body H2 sections, empty input, H1-only input, title trimming) continue to pass unchanged. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179891802 https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3179886723 Assisted-by: Codex:gpt-5.5 Assisted-by: Gemini Code Assist:gemini-2.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.translation-split.test.ts | 46 ++++++++++---------- models/article.ts | 53 ++++++++++++------------ 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/models/article.translation-split.test.ts b/models/article.translation-split.test.ts index 1a67ed640..981da9387 100644 --- a/models/article.translation-split.test.ts +++ b/models/article.translation-split.test.ts @@ -13,11 +13,16 @@ test("splitTranslationTitleAndContent() splits clean # Title + body", () => { ); }); -test("splitTranslationTitleAndContent() drops preamble before the H1", () => { - // The translator is prompted with `# Title\n\nbody`, so any text - // before the matching H1 in the output is model commentary - // (apologies, hedging, "translator's note" preambles, etc.) and - // shouldn't end up in the persisted body. +test("splitTranslationTitleAndContent() keeps preamble as the title rather than scanning deeper for an H1", () => { + // The previous implementation took the first H1 anywhere in the + // output, which handled this preamble case nicely but at the + // cost of silently truncating content when the model omitted + // the article-title H1 entirely and the body happened to + // contain its own H1 section heading. We deliberately accept + // an uglier title here in exchange for never dropping body + // content; the preamble stays visible (as the title) instead + // of disappearing onto the floor, and the body keeps its `# ` + // framing intact. const { title, content } = splitTranslationTitleAndContent( "Note: I have preserved technical terms in their original form.\n" + "\n" + @@ -25,15 +30,17 @@ test("splitTranslationTitleAndContent() drops preamble before the H1", () => { "\n" + "Translated body.", ); - assert.equal(title, "Translated title"); - assert.equal(content, "Translated body."); + assert.equal( + title, + "Note: I have preserved technical terms in their original form.", + ); + assert.equal(content, "# Translated title\n\nTranslated body."); }); -test("splitTranslationTitleAndContent() falls back to first non-empty line when no H1", () => { - // Some models occasionally drop the H1 framing entirely. Without - // a fallback the old strict-regex parser left `title=""` and put - // the entire output into `content`, which renders as a - // titleless article in the UI. +test("splitTranslationTitleAndContent() uses the first line as the title when there is no H1 marker", () => { + // Some models occasionally drop the H1 framing entirely. The + // first line stands in as the title so the article isn't + // persisted titleless. const { title, content } = splitTranslationTitleAndContent( "\n" + "Translated title\n" + @@ -48,19 +55,14 @@ test("splitTranslationTitleAndContent() falls back to first non-empty line when ); }); -test("splitTranslationTitleAndContent() ignores ## H2 headings as title candidates", () => { - // The regex must only match `# ` (single hash followed by - // whitespace), not `## `, so a body whose first line is a - // section heading rather than a document title doesn't have its - // first H2 mis-promoted to the article title. +test("splitTranslationTitleAndContent() does not strip a leading ## H2 marker from the first line", () => { + // The H1-marker regex only matches `# ` (single hash followed by + // whitespace), not `## `, so a first-line H2 is taken verbatim + // as the title. Mis-stripping would have produced a title + // starting with `# Section`, which would render very wrong. const { title, content } = splitTranslationTitleAndContent( "## Section\n\nBody under section.", ); - // No H1 anywhere: fall back to first non-empty line. The first - // non-empty line is the H2, which gets used as the title. Not - // ideal but matches the no-H1 fallback behavior; the caller - // upstream of this helper is supposed to prompt with `# Title` - // and this is just a defensive tail. assert.equal(title, "## Section"); assert.equal(content, "Body under section."); }); diff --git a/models/article.ts b/models/article.ts index 592abb553..1f37a7060 100644 --- a/models/article.ts +++ b/models/article.ts @@ -983,39 +983,40 @@ export async function restartArticleContentTranslations( * * The translator is prompted with `# {title}\n\n{body}` and is * expected to return the same shape with both halves translated. - * In practice models usually do; when they don't (e.g., they - * preface the output with a brief commentary line, or drop the H1 - * heading altogether), a strict regex parser leaves `title` empty - * and stuffs the entire output, including the broken framing, - * into `content`. Be more forgiving: + * In practice models usually do. When they don't (e.g., they drop + * the H1 framing entirely), the strict behavior here is: * - * - The first H1 line anywhere in the output is taken as the - * title. Any text before it is dropped on the assumption it is - * model commentary, since the prompt did not contain anything - * before `# Title`. - * - If the output has no H1 line at all, the first non-empty line - * becomes the title and the rest becomes the body. - * - The H1 detection only matches lines that begin with exactly - * `# ` (a single `#` followed by whitespace), so subsequent - * `## Section` headings inside the body are not mis-extracted. + * - The first line is taken as the title. If it begins with `# `, + * the marker is stripped; otherwise the whole line becomes the + * title verbatim. + * - Everything after the first line becomes the body. + * + * Scanning deeper for an H1 elsewhere in the output is *not* done + * on purpose: it would handle a "model put a preamble before the + * # Title" case nicely but at the cost of silently truncating + * content if the model omits the article-title H1 and the body + * happens to contain its own H1 section heading; that body H1 + * would be mis-promoted to the title and the intro paragraphs + * would be dropped. Leaving a preamble visible as the title is + * the lesser of those two failures. The H1-marker detection on + * the first line is restricted to a single `#` followed by + * whitespace, so a first-line `## Section` is not mis-stripped. */ export function splitTranslationTitleAndContent( translation: string, ): { title: string; content: string } { const trimmed = translation.trim(); - const h1Match = trimmed.match(/(?:^|\n)\s*#\s+([^\n]+)/); - if (h1Match != null) { - const title = h1Match[1].trim(); - const matchEnd = trimmed.indexOf(h1Match[0]) + h1Match[0].length; - const content = trimmed.slice(matchEnd).trim(); - return { title, content }; - } + if (trimmed === "") return { title: "", content: "" }; + // `trimmed` is guaranteed non-empty and starts with a non- + // whitespace character, so `lines[0]` is the first non-empty + // line as text and there's no need to scan past it. const lines = trimmed.split(/\r?\n/); - const firstNonEmpty = lines.findIndex((l) => l.trim().length > 0); - if (firstNonEmpty < 0) return { title: "", content: "" }; - const title = lines[firstNonEmpty].trim(); - const content = lines.slice(firstNonEmpty + 1).join("\n").trim(); - return { title, content }; + const firstLine = lines[0].trim(); + const h1AtStart = firstLine.match(/^#\s+(.+)$/); + return { + title: (h1AtStart?.[1] ?? firstLine).trim(), + content: lines.slice(1).join("\n").trim(), + }; } /** From 2dd4480e670b156306e87f31785bbe778371dfca Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 4 May 2026 16:42:00 +0900 Subject: [PATCH 18/18] Refresh stuck placeholder body before reclaim The stuck-row reclaim branch in startArticleContentTranslation() only bumped `updated` and left the row's `title` / `content` alone. That was fine when the helper translated from the caller-provided original content, but runArticleContentTranslation() now translates from `claimed.title` / `claimed.content` (the row's own state read back via RETURNING), so a placeholder that had been stuck since before a body edit would be retranslated using the OLD source text and publish a translation that no longer matches the article. Concrete failure path: an editor disables allowLlmTranslation, edits the body (no restart fires because of the gate), waits out the 30-min stale window, re-enables allowLlmTranslation, and a later requestArticleTranslation hits the stuck-recovery branch. Without this fix, the resulting translation reflects the body from before the edit. Extend the reclaim UPDATE to copy the caller-provided original title / content (and the matching language) into the placeholder, and clear the `summary` / `summaryStarted` / `summaryUnnecessary` / `ogImageKey` derived state for the same reason as restartArticleContentTranslations() does. The caller of startArticleContentTranslation() always passes the article's current original-language content row, which is the right source of truth at request time. https://github.com/hackers-pub/hackerspub/pull/283#discussion_r3180010017 Assisted-by: Codex:gpt-5.5 Assisted-by: Claude Code:claude-opus-4-7 --- models/article.ts | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/models/article.ts b/models/article.ts index 1f37a7060..31992f85e 100644 --- a/models/article.ts +++ b/models/article.ts @@ -806,14 +806,32 @@ export async function startArticleContentTranslation( return translated!; } // The placeholder is stale (older than 30 min, presumably from - // a crashed previous run). Re-stamp it with a fresh JS Date - // before handing off to the helper so the helper's claim CAS - // has a value it can match — without this, the CAS would be - // comparing against the row's possibly-µs-precision DB - // timestamp via a ms-truncated round-trip and never hit. + // a crashed previous run). Refresh it before handing off to + // the helper: + // + // - Re-stamp `updated` with a fresh JS Date so the helper's + // claim CAS has a value it can match (without this, the CAS + // would be comparing against the row's possibly-µs-precision + // DB timestamp via a ms-truncated round-trip and never hit). + // - Copy the caller-provided original title/content into the + // placeholder. The helper translates from `claimed.title` / + // `claimed.content`, so without this refresh a placeholder + // stuck since before a body edit would be retranslated from + // the OLD body and publish a translation that no longer + // matches the article. Clear the matching summary / OG + // image state for the same reason. const reclaim = new Date(); const reclaimed = await db.update(articleContentTable) - .set({ updated: reclaim }) + .set({ + updated: reclaim, + title: content.title, + content: content.content, + originalLanguage: content.language, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + ogImageKey: null, + }) .where( and( eq(articleContentTable.sourceId, content.sourceId),