diff --git a/ai/translate.ts b/ai/translate.ts index af487062e..35cece6bf 100644 --- a/ai/translate.ts +++ b/ai/translate.ts @@ -1,5 +1,5 @@ -import { fetchLinkedPages } from "@vertana/context-web"; -import type { RequiredContextSource } from "@vertana/core"; +import { fetchWebPage } from "@vertana/context-web"; +import type { ContextSource, RequiredContextSource } from "@vertana/core"; import { translate as vertanaTranslate } from "@vertana/facade"; import type { LanguageModel } from "ai"; @@ -64,7 +64,7 @@ function createTagsContextSource( export async function translate(options: TranslationOptions): Promise { // Build context sources - const contextSources: RequiredContextSource[] = []; + const contextSources: ContextSource[] = []; const authorSource = createAuthorContextSource( options.authorName, @@ -75,12 +75,11 @@ export async function translate(options: TranslationOptions): Promise { const tagsSource = createTagsContextSource(options.tags); if (tagsSource) contextSources.push(tagsSource); - // Add web context to fetch linked pages - const webContext = fetchLinkedPages({ - text: options.text, - mediaType: "text/markdown", - }); - contextSources.push(webContext); + // Expose linked-page fetching as a passive tool the model can call + // when it actually needs context, instead of dumping every linked + // page's full body into the system prompt up front (which made the + // translator confuse the context for the text to translate). + contextSources.push(fetchWebPage); const result = await vertanaTranslate( options.model, diff --git a/models/article.background.test.ts b/models/article.background.test.ts index 3044fca7c..8433a23a5 100644 --- a/models/article.background.test.ts +++ b/models/article.background.test.ts @@ -2,6 +2,7 @@ import assert from "node:assert/strict"; import test from "node:test"; import { articleContentTable, articleSourceTable } from "./schema.ts"; import { + restartArticleContentTranslations, startArticleContentSummary, startArticleContentTranslation, } from "./article.ts"; @@ -10,19 +11,9 @@ import { insertAccountWithActor, withRollback, } from "../test/postgres.ts"; +import { waitFor } from "../test/wait.ts"; import { generateUuidV7 } from "./uuid.ts"; -async function waitFor(predicate: () => Promise, timeoutMs = 10000) { - const deadline = Date.now() + timeoutMs; - while (Date.now() < deadline) { - if (await predicate()) return; - await new Promise((resolve) => setTimeout(resolve, 50)); - } - throw new Error( - `Timed out waiting for async background state after ${timeoutMs}ms`, - ); -} - test("startArticleContentSummary() resets summaryStarted when summarization fails", async () => { await withRollback(async (tx) => { const author = await insertAccountWithActor(tx, { @@ -134,3 +125,151 @@ test("startArticleContentTranslation() deletes queued rows when translation fail }); }); }); + +test( + "restartArticleContentTranslations() resets each translation row to placeholder state and re-runs the translator", + async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + // Use the same `{} as never` translator stub as the existing + // failure-path test: it lets us observe the row pass through + // the placeholder state and then be cleaned up by the failure + // branch, which is enough to confirm + // restartArticleContentTranslations actually re-fired the + // translation pipeline against the reset row. + fedCtx.data.models = { + summarizer: {} as never, + translator: {} as never, + } as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "restarttranslator", + name: "Restart Translator", + email: "restarttranslator@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "restartrequester", + name: "Restart Requester", + email: "restartrequester@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + const [articleSource] = await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "restart-translation", + tags: [], + allowLlmTranslation: true, + published, + updated: published, + }).returning(); + // The original row carries the *new* (post-edit) body — the + // shape updateArticleSource leaves behind for + // restartArticleContentTranslations to mirror into each + // translation placeholder. + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "New original title", + content: "New original body", + published, + updated: published, + }); + // A previously completed translation that is now stale relative + // to the freshly edited original. + await tx.insert(articleContentTable).values({ + sourceId, + language: "ko", + title: "Stale translated title", + content: "Stale translated body", + summary: "Stale summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + await restartArticleContentTranslations(fedCtx, articleSource); + + // The row must briefly pass through placeholder state before + // the failing stub model causes the failure branch to delete + // it; assert on either observable. + await waitFor(async () => { + const current = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "ko" }, + }); + if (current == null) return true; + // Placeholder reset: title/content mirror the new original + // and beingTranslated has flipped back true with summary + // state cleared. translationRequesterId is preserved. + return current.beingTranslated === true && + current.title === "New original title" && + current.content === "New original body" && + current.summary === null && + current.translationRequesterId === requester.account.id; + }); + + // Eventually the failing stub causes deletion via the + // run-translation failure branch. + await waitFor(async () => { + const current = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "ko" }, + }); + return current == null; + }); + }); + }, +); + +test( + "restartArticleContentTranslations() is a no-op when the article has no translations", + async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = { + summarizer: {} as never, + translator: {} as never, + } as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "restartnotrans", + name: "Restart No Translations", + email: "restartnotrans@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + const [articleSource] = await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "restart-no-translations", + tags: [], + allowLlmTranslation: false, + published, + updated: published, + }).returning(); + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "Original", + content: "Body", + published, + updated: published, + }); + + // Should return without throwing despite the deliberately bad + // translator stub. + await restartArticleContentTranslations(fedCtx, articleSource); + + // Original row still in place and unchanged. + const original = await tx.query.articleContentTable.findFirst({ + where: { sourceId, language: "en" }, + }); + assert.ok(original != null); + assert.equal(original.beingTranslated, false); + assert.equal(original.content, "Body"); + }); + }, +); diff --git a/models/article.lifecycle.test.ts b/models/article.lifecycle.test.ts index 77e447d15..067a31eb2 100644 --- a/models/article.lifecycle.test.ts +++ b/models/article.lifecycle.test.ts @@ -1,11 +1,13 @@ import assert from "node:assert/strict"; import test from "node:test"; import { createArticle, updateArticle } from "./article.ts"; +import { articleContentTable } from "./schema.ts"; import { createFedCtx, insertAccountWithActor, withRollback, } from "../test/postgres.ts"; +import { waitFor } from "../test/wait.ts"; const fakeModels = { summarizer: {} as never, @@ -100,3 +102,294 @@ test("updateArticle() rewrites the persisted article post", async () => { assert.match(storedPost.contentHtml, /body<\/strong>/); }); }); + +test("updateArticle() resets existing translation rows when the body changes", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "retranslateauthor", + name: "Retranslate Author", + email: "retranslateauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "retranslaterequester", + name: "Retranslate Requester", + email: "retranslaterequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "retranslate-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // Pre-existing completed translation row that the edit should + // invalidate. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Old translated title", + content: "Old translated body", + summary: "Old summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // The retranslation runs in the background. The placeholder + // reset is awaited synchronously, then the failing stub + // translator deletes the row via the failure-cleanup branch. + // Either observable is acceptable. + await waitFor(async () => { + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + if (ko == null) return true; + return ko.beingTranslated === true && + ko.title === "Original article" && + ko.content === "Edited body" && + ko.summary === null; + }); + }); +}); + +test("updateArticle() leaves existing translations alone on title-only edits", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "retranslatetitleauthor", + name: "Retranslate Title Author", + email: "retranslatetitleauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "retranslatetitlerequester", + name: "Retranslate Title Requester", + email: "retranslatetitlerequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "retranslate-title-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Existing translated title", + content: "Existing translated body", + summary: "Existing summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + title: "Renamed only", + }); + assert.ok(updated != null); + + // Title-only edits don't trigger retranslation; the ko row stays + // exactly as it was — no placeholder, no deletion, original + // summary preserved. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Existing translated title"); + assert.equal(ko.content, "Existing translated body"); + assert.equal(ko.summary, "Existing summary."); + }); +}); + +test("updateArticle() does not retranslate when allowLlmTranslation is false", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "noretranslateauthor", + name: "No Retranslate Author", + email: "noretranslateauthor@example.com", + }); + const requester = await insertAccountWithActor(tx, { + username: "noretranslaterequester", + name: "No Retranslate Requester", + email: "noretranslaterequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "no-retranslate-article", + tags: [], + // The author opted out of LLM translation: no placeholder + // resets, no background `translate()` calls. This guards + // against re-enqueueing translation work in the same edit + // that flips the switch off (or any later body edit while + // the switch stays off). + allowLlmTranslation: false, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // A pre-existing translation row from before the switch was + // turned off (or seeded by a prior allowLlmTranslation=true + // window) — must remain unchanged. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Existing translated title", + content: "Existing translated body", + summary: "Existing summary.", + originalLanguage: "en", + translationRequesterId: requester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // No placeholder reset, no `translate()` queued: the row + // stays exactly as it was. No async waiting needed because + // the gate skips the synchronous claim-and-reset entirely. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Existing translated title"); + assert.equal(ko.content, "Existing translated body"); + assert.equal(ko.summary, "Existing summary."); + }); +}); + +test("updateArticle() does not retranslate human-curated translation rows", async () => { + await withRollback(async (tx) => { + const fedCtx = createFedCtx(tx); + fedCtx.data.models = fakeModels as typeof fedCtx.data.models; + const author = await insertAccountWithActor(tx, { + username: "humantranslateauthor", + name: "Human Translate Author", + email: "humantranslateauthor@example.com", + }); + const humanTranslator = await insertAccountWithActor(tx, { + username: "humantranslator", + name: "Human Translator", + email: "humantranslator@example.com", + }); + const llmRequester = await insertAccountWithActor(tx, { + username: "humantranslatellmrequester", + name: "LLM Requester", + email: "humantranslatellmrequester@example.com", + }); + const article = await createArticle(fedCtx, { + accountId: author.account.id, + publishedYear: 2026, + slug: "human-translate-article", + tags: [], + allowLlmTranslation: true, + published: new Date("2026-04-15T00:00:00.000Z"), + updated: new Date("2026-04-15T00:00:00.000Z"), + title: "Original article", + content: "Original body", + language: "en", + }); + assert.ok(article != null); + // A human-curated translation row: `translatorId` is set + // (and `translationRequesterId` is null per the schema check + // `article_content_translator_translation_requester_id_check`, + // which makes the two columns mutually exclusive). This row + // must survive a body edit untouched: resetting it to a + // source-language placeholder for the LLM to re-do would + // silently destroy the translator's work. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ko", + title: "Human-curated translated title", + content: "Human-curated translated body", + summary: "Human-curated summary.", + originalLanguage: "en", + translatorId: humanTranslator.account.id, + translationRequesterId: null, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + // An LLM-requested translation row alongside it, to confirm + // the gate is selective rather than blanket-skipping the + // restart for the whole article. + await tx.insert(articleContentTable).values({ + sourceId: article.articleSource.id, + language: "ja", + title: "LLM translated title", + content: "LLM translated body", + summary: "LLM summary.", + originalLanguage: "en", + translatorId: null, + translationRequesterId: llmRequester.account.id, + beingTranslated: false, + published: new Date("2026-04-15T01:00:00.000Z"), + updated: new Date("2026-04-15T01:00:00.000Z"), + }); + + const updated = await updateArticle(fedCtx, article.articleSource.id, { + content: "Edited body", + }); + assert.ok(updated != null); + + // Human row stays exactly as it was. + const ko = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ko" }, + }); + assert.ok(ko != null); + assert.equal(ko.beingTranslated, false); + assert.equal(ko.title, "Human-curated translated title"); + assert.equal(ko.content, "Human-curated translated body"); + assert.equal(ko.summary, "Human-curated summary."); + assert.equal(ko.translatorId, humanTranslator.account.id); + + // The LLM row, in contrast, IS picked up by the restart and + // ends up either as a placeholder (which the failing stub + // translator then deletes) or already deleted by the + // failure-cleanup path. + await waitFor(async () => { + const ja = await tx.query.articleContentTable.findFirst({ + where: { sourceId: article.articleSource.id, language: "ja" }, + }); + if (ja == null) return true; + return ja.beingTranslated === true && + ja.title === "Original article" && + ja.content === "Edited body"; + }); + }); +}); diff --git a/models/article.source.test.ts b/models/article.source.test.ts index d7add745a..6929d73a5 100644 --- a/models/article.source.test.ts +++ b/models/article.source.test.ts @@ -107,7 +107,7 @@ test("getOriginalArticleContent() picks the earliest non-translation content", ( assert.deepEqual(selected, original); }); -test("updateArticleSource() updates the original content and preserves translations", async () => { +test("updateArticleSource() flags originalContentChanged when the body changes and preserves existing translation rows in place", async () => { await withRollback(async (tx) => { const author = await insertAccountWithActor(tx, { username: "updatearticlesource", @@ -149,29 +149,86 @@ test("updateArticleSource() updates the original content and preserves translati }, ]); - const updated = await updateArticleSource(tx, sourceId, { + const result = await updateArticleSource(tx, sourceId, { title: "Updated title", content: "Updated content", slug: "updated-source", }); - assert.ok(updated != null); - assert.equal(updated.slug, "updated-source"); - assert.equal(updated.contents.length, 2); + assert.ok(result != null); + assert.equal(result.originalContentChanged, true); + assert.equal(result.source.slug, "updated-source"); + assert.equal(result.source.contents.length, 2); - const originalContent = updated.contents.find((content) => + const originalContent = result.source.contents.find((content) => content.originalLanguage == null ); - const translatedContent = updated.contents.find((content) => + const translatedContent = result.source.contents.find((content) => content.originalLanguage === "en" ); assert.ok(originalContent != null); assert.equal(originalContent.title, "Updated title"); assert.equal(originalContent.content, "Updated content"); + // updateArticleSource itself does not touch translation rows; the + // retranslation step (restartArticleContentTranslations) lives in + // updateArticle so it has access to fedCtx for the federation + // Update side effects. Here we only check that the existing + // translation row passes through unchanged and that the + // originalContentChanged signal is set so an outer caller can + // decide to invalidate it. assert.ok(translatedContent != null); assert.equal(translatedContent.title, "Translated title"); assert.equal(translatedContent.content, "Translated content"); + assert.equal(translatedContent.beingTranslated, false); + }); +}); + +test("updateArticleSource() does not flag originalContentChanged on title-only edits", async () => { + await withRollback(async (tx) => { + const author = await insertAccountWithActor(tx, { + username: "updatearticlesourcetitle", + name: "Title Only Edit", + email: "updatearticlesourcetitle@example.com", + }); + const sourceId = generateUuidV7(); + const published = new Date("2026-04-15T00:00:00.000Z"); + + await tx.insert(articleSourceTable).values({ + id: sourceId, + accountId: author.account.id, + publishedYear: 2026, + slug: "title-only-edit", + tags: [], + allowLlmTranslation: false, + published, + updated: published, + }); + await tx.insert(articleContentTable).values({ + sourceId, + language: "en", + title: "Original title", + content: "Original content", + published, + updated: published, + }); + + const result = await updateArticleSource(tx, sourceId, { + title: "Renamed", + }); + + assert.ok(result != null); + // Title-only edits don't trigger retranslation, mirroring the + // existing summary-invalidation gate in updateArticleSource: the + // body is unchanged, so existing translations would still be + // accurate. + assert.equal(result.originalContentChanged, false); + const originalContent = result.source.contents.find((c) => + c.originalLanguage == null + ); + assert.ok(originalContent != null); + assert.equal(originalContent.title, "Renamed"); + assert.equal(originalContent.content, "Original content"); }); }); diff --git a/models/article.translation-split.test.ts b/models/article.translation-split.test.ts new file mode 100644 index 000000000..981da9387 --- /dev/null +++ b/models/article.translation-split.test.ts @@ -0,0 +1,124 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { splitTranslationTitleAndContent } from "./article.ts"; + +test("splitTranslationTitleAndContent() splits clean # Title + body", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n\nTranslated body line 1.\n\nTranslated body line 2.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Translated body line 1.\n\nTranslated body line 2.", + ); +}); + +test("splitTranslationTitleAndContent() keeps preamble as the title rather than scanning deeper for an H1", () => { + // The previous implementation took the first H1 anywhere in the + // output, which handled this preamble case nicely but at the + // cost of silently truncating content when the model omitted + // the article-title H1 entirely and the body happened to + // contain its own H1 section heading. We deliberately accept + // an uglier title here in exchange for never dropping body + // content; the preamble stays visible (as the title) instead + // of disappearing onto the floor, and the body keeps its `# ` + // framing intact. + const { title, content } = splitTranslationTitleAndContent( + "Note: I have preserved technical terms in their original form.\n" + + "\n" + + "# Translated title\n" + + "\n" + + "Translated body.", + ); + assert.equal( + title, + "Note: I have preserved technical terms in their original form.", + ); + assert.equal(content, "# Translated title\n\nTranslated body."); +}); + +test("splitTranslationTitleAndContent() uses the first line as the title when there is no H1 marker", () => { + // Some models occasionally drop the H1 framing entirely. The + // first line stands in as the title so the article isn't + // persisted titleless. + const { title, content } = splitTranslationTitleAndContent( + "\n" + + "Translated title\n" + + "\n" + + "Translated body line 1.\n" + + "Translated body line 2.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Translated body line 1.\nTranslated body line 2.", + ); +}); + +test("splitTranslationTitleAndContent() does not strip a leading ## H2 marker from the first line", () => { + // The H1-marker regex only matches `# ` (single hash followed by + // whitespace), not `## `, so a first-line H2 is taken verbatim + // as the title. Mis-stripping would have produced a title + // starting with `# Section`, which would render very wrong. + const { title, content } = splitTranslationTitleAndContent( + "## Section\n\nBody under section.", + ); + assert.equal(title, "## Section"); + assert.equal(content, "Body under section."); +}); + +test("splitTranslationTitleAndContent() preserves later ## H2 headings inside the body", () => { + // A document with a real H1 followed by H2 sections must keep + // the H2 sections in the body untouched. + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n" + + "\n" + + "Intro paragraph.\n" + + "\n" + + "## Section A\n" + + "\n" + + "Body of section A.\n" + + "\n" + + "## Section B\n" + + "\n" + + "Body of section B.", + ); + assert.equal(title, "Translated title"); + assert.equal( + content, + "Intro paragraph.\n" + + "\n" + + "## Section A\n" + + "\n" + + "Body of section A.\n" + + "\n" + + "## Section B\n" + + "\n" + + "Body of section B.", + ); +}); + +test("splitTranslationTitleAndContent() handles entirely empty output", () => { + const { title, content } = splitTranslationTitleAndContent(""); + assert.equal(title, ""); + assert.equal(content, ""); + const ws = splitTranslationTitleAndContent(" \n\n \n"); + assert.equal(ws.title, ""); + assert.equal(ws.content, ""); +}); + +test("splitTranslationTitleAndContent() handles H1-only output (no body)", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title\n", + ); + assert.equal(title, "Translated title"); + assert.equal(content, ""); +}); + +test("splitTranslationTitleAndContent() trims surrounding whitespace from the title", () => { + const { title, content } = splitTranslationTitleAndContent( + "# Translated title with extra spaces \n\nBody.", + ); + assert.equal(title, "Translated title with extra spaces"); + assert.equal(content, "Body."); +}); diff --git a/models/article.ts b/models/article.ts index 85c4c2bda..31992f85e 100644 --- a/models/article.ts +++ b/models/article.ts @@ -10,7 +10,7 @@ import { sendTagsPubRelayActivity } from "@hackerspub/federation/tags-pub"; import { getLogger } from "@logtape/logtape"; import { minBy } from "@std/collections/min-by"; import type { LanguageModel } from "ai"; -import { and, eq, isNull, lt, or, sql } from "drizzle-orm"; +import { and, eq, isNotNull, isNull, lt, or, sql } from "drizzle-orm"; import postgres from "postgres"; import type { ContextData, Models } from "./context.ts"; import type { Database } from "./db.ts"; @@ -300,6 +300,28 @@ export async function createArticle( return post; } +export interface UpdateArticleSourceResult { + source: ArticleSource & { contents: ArticleContent[] }; + /** + * `true` when the original-language `article_content` row's body + * actually changed during this update. The caller uses this to + * decide whether to invalidate existing translation rows. + * + * Title-only edits do not set this flag, matching the existing + * summary-invalidation gate below. + * + * Language changes never reach this branch when translations exist: + * the self-FK on `article_content` (`schema.ts:524-527`) is + * `ON DELETE CASCADE` only, so any `UPDATE … SET language = …` on + * the original row aborts with 23503 (rethrown as + * {@link LanguageChangeWithTranslationsError}) whenever a row's + * `originalLanguage` references the old language. A successful + * `languageChanged` therefore implies zero translations and there + * is nothing to retranslate. + */ + originalContentChanged: boolean; +} + export async function updateArticleSource( db: Database, id: Uuid, @@ -309,11 +331,12 @@ export async function updateArticleSource( language?: string; }, models?: Models, -): Promise { +): Promise { // Captured inside the transaction and used after it commits so we // can enqueue a fresh summarization for the row whose body or // language just changed. let resummarizeTarget: ArticleContent | undefined; + let originalContentChanged = false; const result = await db.transaction(async (tx) => { const sources = await tx.update(articleSourceTable) .set({ ...source, updated: sql`CURRENT_TIMESTAMP` }) @@ -371,6 +394,9 @@ export async function updateArticleSource( ) { resummarizeTarget = updatedRows[0]; } + if (contentChanged && updatedRows.length > 0) { + originalContentChanged = true; + } } catch (error) { if ( error instanceof postgres.PostgresError && error.code === "23503" @@ -386,13 +412,14 @@ export async function updateArticleSource( }); return { ...sources[0], contents }; }); + if (result == null) return undefined; // Queue a fresh summarization outside of the transaction so the // claim is visible to other workers as soon as it is acquired and // the deferred apply step does not try to use a closed transaction. if (resummarizeTarget != null && models != null) { await startArticleContentSummary(db, models.summarizer, resummarizeTarget); } - return result; + return { source: result, originalContentChanged }; } export async function updateArticle( @@ -418,13 +445,14 @@ export async function updateArticle( const previousPost = await db.query.postTable.findFirst({ where: { articleSourceId }, }); - const articleSource = await updateArticleSource( + const updateResult = await updateArticleSource( db, articleSourceId, source, models, ); - if (articleSource == null) return undefined; + if (updateResult == null) return undefined; + const { source: articleSource, originalContentChanged } = updateResult; const account = await db.query.accountTable.findFirst({ where: { id: articleSource.accountId }, with: { emails: true, links: true }, @@ -476,6 +504,25 @@ export async function updateArticle( post.relayedTags = [...relayedTags]; } // TODO: send Update(Article) to the mentioned actors too + // After federating the original-language Update, invalidate any + // existing translation rows so they retranslate against the new + // body. Each restarted translation will fire its own Update on + // completion (correct ActivityPub semantics — peers see the + // original change first, then each translation's refresh as it + // becomes available). We `await` the synchronous claim-and-reset + // step so the placeholders are visible by the time this function + // returns; the actual `translate()` calls run in the background. + // + // Gate on the article-level `allowLlmTranslation` switch so an + // edit that turns LLM translation off in the same update does + // not still enqueue background `translate()` runs against the + // author's just-expressed wish. Existing translation rows from + // before the switch was flipped are left alone (stale, not + // refreshed); re-enabling the switch and editing the body again + // brings them back into sync. + if (originalContentChanged && articleSource.allowLlmTranslation) { + await restartArticleContentTranslations(fedCtx, articleSource); + } return post; } @@ -722,7 +769,16 @@ export async function startArticleContentTranslation( fedCtx: Context, { content, targetLanguage, requester }: ArticleContentTranslationOptions, ): Promise { - const { db, models: { translator: model, summarizer } } = fedCtx.data; + const { db } = fedCtx.data; + // Stamp `updated` with a JS-side Date rather than letting it + // default to PG's `CURRENT_TIMESTAMP`. See the long comment on + // the CAS in `runArticleContentTranslation` for why this matters: + // the helper's claim WHERE compares the row's stored `updated` + // against `queued.updated`, and the comparison is only reliable + // when both sides round-trip through the same precision (the + // `postgres` driver hands JS `Date` values back at ms precision + // while `timestamptz` keeps µs). + const queueStamp = new Date(); const inserted = await db.insert(articleContentTable).values({ sourceId: content.sourceId, language: targetLanguage, @@ -731,6 +787,7 @@ export async function startArticleContentTranslation( originalLanguage: content.language, translationRequesterId: requester.id, beingTranslated: true, + updated: queueStamp, }).onConflictDoNothing().returning(); let queued: ArticleContent; if (inserted.length < 1) { @@ -748,18 +805,337 @@ export async function startArticleContentTranslation( logger.debug("Translation already started or not needed."); return translated!; } - queued = translated; + // The placeholder is stale (older than 30 min, presumably from + // a crashed previous run). Refresh it before handing off to + // the helper: + // + // - Re-stamp `updated` with a fresh JS Date so the helper's + // claim CAS has a value it can match (without this, the CAS + // would be comparing against the row's possibly-µs-precision + // DB timestamp via a ms-truncated round-trip and never hit). + // - Copy the caller-provided original title/content into the + // placeholder. The helper translates from `claimed.title` / + // `claimed.content`, so without this refresh a placeholder + // stuck since before a body edit would be retranslated from + // the OLD body and publish a translation that no longer + // matches the article. Clear the matching summary / OG + // image state for the same reason. + const reclaim = new Date(); + const reclaimed = await db.update(articleContentTable) + .set({ + updated: reclaim, + title: content.title, + content: content.content, + originalLanguage: content.language, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + ogImageKey: null, + }) + .where( + and( + eq(articleContentTable.sourceId, content.sourceId), + eq(articleContentTable.language, targetLanguage), + eq(articleContentTable.beingTranslated, true), + // Repeat the staleness check inside the UPDATE itself so + // the reclaim is CAS-safe. If a concurrent worker just + // reclaimed the same stale placeholder between our SELECT + // above and this UPDATE, their reclaim wrote a fresh + // `updated` past the threshold and PG's UPDATE re-evals + // this WHERE on the new row state, which makes our + // UPDATE drop the row from the candidate set and return + // 0 rows. That keeps us from stomping on the other + // worker's claim and double-firing `translate()`. + lt( + articleContentTable.updated, + sql`CURRENT_TIMESTAMP - INTERVAL '30 minutes'`, + ), + ), + ) + .returning(); + if (reclaimed.length < 1) { + // Lost the race to another writer that just reclaimed this + // row, or completed it, between our SELECT and our UPDATE. + // Return the row we observed; nothing further to do. + return translated; + } + queued = reclaimed[0]; } else { queued = inserted[0]; } + await runArticleContentTranslation(fedCtx, queued); + return queued; +} + +/** + * Invalidates and re-runs every existing translation row for an + * article whose original-language body has changed. For each + * translation row, atomically resets it to placeholder state + * (copying the new original title/content into it, flipping + * `beingTranslated` back to true, and clearing summary state), then + * fires {@link runArticleContentTranslation} against the freshly + * reset row to repopulate it from the model. The actual translation + * runs in the background; the synchronous claim-and-reset is + * awaited so callers can rely on placeholders being in place by + * return time. + * + * No-ops when the article has no original-language content (e.g., + * remote articles with no `articleSource.contents` row in the + * article's own language) or no translation rows at all. + * + * Used by {@link updateArticle} to satisfy + * . + */ +export async function restartArticleContentTranslations( + fedCtx: Context, + articleSource: ArticleSource, +): Promise { + const { db } = fedCtx.data; + // Serialize the read-original-then-reset-translations sequence + // against any other writer to this article's source row. Two + // concurrent restartArticleContentTranslations calls (driven by + // back-to-back edits to the same article) would otherwise read + // their own snapshot of the original and then overwrite each + // other's placeholder writes, leaving the translation rows + // pointing at whichever snapshot's UPDATE happened to land last. + // `SELECT … FOR UPDATE` on the article_source row holds the same + // row-level write lock that updateArticleSource takes during its + // own UPDATE, so concurrent edits and restarts queue up cleanly. + // The translate() calls themselves run after the transaction + // commits so the LLM round-trip doesn't extend the lock window. + const resetRows = await db.transaction(async (tx) => { + await tx.select({ id: articleSourceTable.id }) + .from(articleSourceTable) + .where(eq(articleSourceTable.id, articleSource.id)) + .for("update"); + const original = await getOriginalArticleContent(tx, articleSource); + if (original == null) { + logger.debug( + "No original-language content for {sourceId}; nothing to retranslate.", + { sourceId: articleSource.id }, + ); + return []; + } + // Reset every translation row to placeholder state in a single + // statement, mirroring the shape an initial + // `startArticleContentTranslation` would have produced. The + // `originalLanguage IS NOT NULL` filter targets exactly the + // translation rows for this article (the same set the previous + // implementation listed via `findMany` and then iterated over); + // the schema check `article_content_being_translated_check` + // requires `originalLanguage IS NOT NULL` whenever + // `beingTranslated=true`, which the filter already satisfies. + // `originalLanguage` and `translationRequesterId` are not in + // `set`, so each row's audit trail (who first asked for this + // translation) is preserved. + // Stamp `updated` with a JS-side Date rather than PG + // `CURRENT_TIMESTAMP` so it round-trips losslessly through the + // driver and the per-row claim CAS in + // `runArticleContentTranslation` can match it; see the long + // comment on that claim for the µs/ms precision rationale. + const restartStamp = new Date(); + const reset = await tx.update(articleContentTable) + .set({ + title: original.title, + content: original.content, + beingTranslated: true, + summary: null, + summaryStarted: null, + summaryUnnecessary: false, + // Clear the cached OG image too: it was rendered from the + // previous title/body and is now stale. Lazy regeneration + // on the next OG-image request will rebuild it from the + // freshly translated content. + ogImageKey: null, + updated: restartStamp, + }) + .where( + and( + eq(articleContentTable.sourceId, articleSource.id), + isNotNull(articleContentTable.originalLanguage), + // Only LLM-requested translations. Human translations + // carry `translatorId` instead of `translationRequesterId` + // (the schema check + // `article_content_translator_translation_requester_id_check` + // makes the two columns mutually exclusive), and resetting + // a curated human translation back to a source-language + // placeholder so the LLM can re-do it would silently + // destroy that contributor's work and mis-attribute the + // result. + isNull(articleContentTable.translatorId), + ), + ) + .returning(); + if (reset.length > 0) { + logger.debug( + "Restarted {count} translation(s) for {sourceId}.", + { count: reset.length, sourceId: articleSource.id }, + ); + } + return reset; + }); + for (const resetRow of resetRows) { + // Fire-and-forget: `runArticleContentTranslation` schedules the + // `translate()` chain on its own and the caller does not await + // the model call. Each translation runs concurrently. + // The `.catch()` is here because the synchronous setup before + // the chain is installed (the claim UPDATE, the article-source + // fetch) can itself throw on a transient DB error; without it + // those rejections would surface as unhandled promise + // rejections. + runArticleContentTranslation(fedCtx, resetRow).catch((error) => { + logger.error( + "Failed to start retranslation for {sourceId} {language}: {error}", + { + sourceId: resetRow.sourceId, + language: resetRow.language, + error, + }, + ); + }); + } +} + +/** + * Splits an LLM translation output into its title and body halves. + * + * The translator is prompted with `# {title}\n\n{body}` and is + * expected to return the same shape with both halves translated. + * In practice models usually do. When they don't (e.g., they drop + * the H1 framing entirely), the strict behavior here is: + * + * - The first line is taken as the title. If it begins with `# `, + * the marker is stripped; otherwise the whole line becomes the + * title verbatim. + * - Everything after the first line becomes the body. + * + * Scanning deeper for an H1 elsewhere in the output is *not* done + * on purpose: it would handle a "model put a preamble before the + * # Title" case nicely but at the cost of silently truncating + * content if the model omits the article-title H1 and the body + * happens to contain its own H1 section heading; that body H1 + * would be mis-promoted to the title and the intro paragraphs + * would be dropped. Leaving a preamble visible as the title is + * the lesser of those two failures. The H1-marker detection on + * the first line is restricted to a single `#` followed by + * whitespace, so a first-line `## Section` is not mis-stripped. + */ +export function splitTranslationTitleAndContent( + translation: string, +): { title: string; content: string } { + const trimmed = translation.trim(); + if (trimmed === "") return { title: "", content: "" }; + // `trimmed` is guaranteed non-empty and starts with a non- + // whitespace character, so `lines[0]` is the first non-empty + // line as text and there's no need to scan past it. + const lines = trimmed.split(/\r?\n/); + const firstLine = lines[0].trim(); + const h1AtStart = firstLine.match(/^#\s+(.+)$/); + return { + title: (h1AtStart?.[1] ?? firstLine).trim(), + content: lines.slice(1).join("\n").trim(), + }; +} + +/** + * Runs the actual LLM translation for an `article_content` row that is + * already in the placeholder / `beingTranslated` state. Awaits the + * synchronous setup (fetching author/tag context for the model), then + * schedules the `translate(...)` chain and returns; the caller does + * not await the translation itself. When the model resolves, the + * row is overwritten with the translated title/body, a federation + * `Update` activity is sent, and post-translation summarization is + * kicked off. On failure, the placeholder row is deleted so a future + * visit can re-queue. + * + * Should never be called with an original-language row + * (`originalLanguage IS NULL`); the caller is responsible for placing + * the row into the placeholder state first. + */ +async function runArticleContentTranslation( + fedCtx: Context, + queued: ArticleContent, +): Promise { + const { db, models: { translator: model, summarizer } } = fedCtx.data; logger.debug( "Starting translation for content: {sourceId} {language}", queued, ); + const { sourceId, language: targetLanguage, originalLanguage } = queued; + if (originalLanguage == null) { + // Defensive: a row without `originalLanguage` is the original- + // language content itself and should never be passed in here. + logger.error( + "runArticleContentTranslation called for an original-language row; " + + "skipping ({sourceId} {language}).", + queued, + ); + return; + } + + // Take ownership of the placeholder row by stamping it with a + // JS-side `Date` that becomes our claim id, and read the row's + // freshest title/content back via `RETURNING`. Subsequent + // success / failure writes from this run only land if the row's + // `updated` still equals this claim — a concurrent re-translation + // that resets the row out from under us bumps `updated` past this + // value, and our writes turn into no-ops instead of clobbering + // the fresher claim. Using a JS `Date` (rather than PG + // `CURRENT_TIMESTAMP`) is what makes this CAS reliable: PG + // `timestamptz` keeps µs precision while the `postgres` driver + // hands back JS `Date` values truncated to ms, so a CAS against + // the round-tripped value of a `CURRENT_TIMESTAMP` write would + // never match. + // + // Three further guards live here: + // - `beingTranslated=true` on the WHERE bails out silently if the + // row has already been completed (or deleted) by another writer + // between the caller queueing this run and the claim landing. + // - `updated = queued.updated` makes the claim itself + // conditional on the row not having been re-stamped under us + // by a concurrent `restartArticleContentTranslations` (or a + // parallel run for the same row). Without this, two runs + // triggered by back-to-back edits both pass the + // `beingTranslated` check and both end up calling `translate()`, + // wasting an LLM round trip even though the success/failure + // CAS below would still ensure only one write lands. All + // writers that produce a `queued` for this helper + // (`startArticleContentTranslation`'s INSERT, its stuck-row + // re-stamp branch, and `restartArticleContentTranslations`'s + // reset UPDATE) explicitly stamp `updated` with a JS `Date` + // for the same round-trip-precision reason as the claim above; + // the comparison is lossless. + // - The translate input below is built from `claimed.title` / + // `claimed.content` rather than the caller's `queued` snapshot. + // When two `restartArticleContentTranslations` calls race, the + // later one writes the freshest body into the placeholder; this + // helper then translates *that* body instead of the stale body + // from whichever caller it was queued for. + const claim = new Date(); + const claimedRows = await db.update(articleContentTable) + .set({ updated: claim }) + .where( + and( + eq(articleContentTable.sourceId, sourceId), + eq(articleContentTable.language, targetLanguage), + eq(articleContentTable.beingTranslated, true), + eq(articleContentTable.updated, queued.updated), + ), + ) + .returning(); + if (claimedRows.length < 1) { + logger.debug( + "Translation claim failed; row is not (or no longer) a " + + "placeholder ({sourceId} {language}).", + queued, + ); + return; + } + const claimed = claimedRows[0]; - // Fetch article source with author information for translation context + // Fetch article source with author information for translation context. const articleSource = await db.query.articleSourceTable.findFirst({ - where: { id: content.sourceId }, + where: { id: sourceId }, with: { account: { with: { @@ -769,14 +1145,20 @@ export async function startArticleContentTranslation( }, }); - // Combine title and content for translation - const text = `# ${content.title}\n\n${content.content}`; + // Combine title and content for translation, using the freshest + // values read back from the claim above. + const text = `# ${claimed.title}\n\n${claimed.content}`; + // `claimed.originalLanguage` is non-null in practice: the claim + // WHERE required `beingTranslated=true`, and the schema check + // `article_content_being_translated_check` makes that imply + // `originalLanguage IS NOT NULL`. Drizzle types it as nullable + // because the column is nullable in general, so assert. translate({ model, - sourceLanguage: content.language, + sourceLanguage: claimed.originalLanguage!, targetLanguage, text, - // Pass context for better translation quality + // Pass context for better translation quality. authorName: articleSource?.account?.actor?.name ?? undefined, authorBio: articleSource?.account?.actor?.bioHtml ?? undefined, tags: articleSource?.tags, @@ -785,9 +1167,7 @@ export async function startArticleContentTranslation( ...queued, translation, }); - // Split the translation into title and content - const title = translation.match(/^\s*#\s+([^\n]*)/)?.[1] ?? ""; - const content = translation.replace(/^\s*#\s+[^\n]*\s*/, "").trim(); + const { title, content } = splitTranslationTitleAndContent(translation); const updated = await db.update(articleContentTable) .set({ title, @@ -801,17 +1181,37 @@ export async function startArticleContentTranslation( summary: null, summaryStarted: null, summaryUnnecessary: false, + // The cached OG image was rendered from the placeholder + // (or from a prior translation of an older body) and is + // now stale; clear it for the same reason as `summary` so + // the next request regenerates it from the translated text. + ogImageKey: null, }) .where( and( - eq(articleContentTable.sourceId, queued.sourceId), + eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), + // CAS on the claim taken at the top of this function — see + // that comment for why a JS `Date` rather than the row's + // round-tripped `updated` is the safe reference. If a + // concurrent re-translation took its own claim under us + // the `updated` will no longer match `claim` and this + // write becomes a no-op so we don't clobber its fresher + // placeholder with our stale text. + eq(articleContentTable.updated, claim), ), ) .returning(); - if (updated.length < 1) return; + if (updated.length < 1) { + logger.debug( + "Stale translation claim, skipping federation/summary " + + "({sourceId} {language}).", + queued, + ); + return; + } const article = await db.query.articleSourceTable.findFirst({ - where: { id: queued.sourceId }, + where: { id: sourceId }, with: { account: true, contents: true, @@ -822,9 +1222,19 @@ export async function startArticleContentTranslation( where: { articleSourceId: article.id }, }); const articleObject = await getArticle(fedCtx, article); + // The id has to be unique across translation completions for + // this article — multiple locales can complete in close + // succession (especially after a body edit re-queues every + // existing translation), and they would all collide on + // `article.updated` since translation completions don't bump + // it. Including both the target language and the translated + // row's own `updated` (a fresh `CURRENT_TIMESTAMP` from the + // success UPDATE just above) keeps the id distinct from the + // original-language Update activity and from every other + // translation's Update for the same edit. const update = new vocab.Update({ id: new URL( - `#update/${article.updated.toISOString()}`, + `#update/${updated[0].updated.toISOString()}/${targetLanguage}`, articleObject.id ?? fedCtx.canonicalOrigin, ), actors: articleObject.attributionIds, @@ -879,10 +1289,13 @@ export async function startArticleContentTranslation( await db.delete(articleContentTable) .where( and( - eq(articleContentTable.sourceId, queued.sourceId), + eq(articleContentTable.sourceId, sourceId), eq(articleContentTable.language, targetLanguage), + // CAS on the same claim as the success path — a stale + // failure must not delete a row another caller has since + // re-claimed. + eq(articleContentTable.updated, claim), ), ); }); - return queued; } diff --git a/test/wait.ts b/test/wait.ts new file mode 100644 index 000000000..69ddd48af --- /dev/null +++ b/test/wait.ts @@ -0,0 +1,20 @@ +/** + * Polls `predicate` every 50 ms until it resolves to `true` or + * `timeoutMs` elapses. Used by tests that observe state which a + * background promise eventually mutates (translation completion, + * placeholder reset, summary cleanup, etc.) without exposing that + * promise to the test. + */ +export async function waitFor( + predicate: () => Promise, + timeoutMs = 10000, +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (await predicate()) return; + await new Promise((resolve) => setTimeout(resolve, 50)); + } + throw new Error( + `Timed out waiting for async background state after ${timeoutMs}ms`, + ); +}