From 9f353f5536399e99cb996a7752b2cf7a5e1dceb8 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 23 Jun 2026 16:05:22 -0700 Subject: [PATCH] record revision translation records --- .pre-commit-config.yaml | 1 + kitsune/celery_beat.py | 5 + kitsune/settings.py | 3 + kitsune/wiki/admin.py | 86 +++++++- .../0025_revisiontranslationrecord.py | 138 ++++++++++++ kitsune/wiki/models.py | 43 ++++ kitsune/wiki/strategies.py | 44 ++-- kitsune/wiki/tasks.py | 18 +- kitsune/wiki/tests/test_strategies.py | 197 ++++++++++++++++++ pyproject.toml | 1 + uv.lock | 2 + 11 files changed, 524 insertions(+), 14 deletions(-) create mode 100644 kitsune/wiki/migrations/0025_revisiontranslationrecord.py create mode 100644 kitsune/wiki/tests/test_strategies.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index edf71df5bc0..94aa893499a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,6 +47,7 @@ repos: - types-python-dateutil==2.9.0.20241206 - types-simplejson==3.20.0.20250326 - types-PyYAML==6.0.12.20250402 + - types-Markdown==3.10.2.20260518 files: "^kitsune/" exclude: "/migrations/" - repo: https://github.com/pre-commit/mirrors-eslint diff --git a/kitsune/celery_beat.py b/kitsune/celery_beat.py index 69f19c1c911..ae8226b1fc3 100644 --- a/kitsune/celery_beat.py +++ b/kitsune/celery_beat.py @@ -165,6 +165,11 @@ "task": "kitsune.wiki.tasks.cleanup_old_anchor_records", "schedule": crontab(hour="1", minute="0", day_of_week="0"), }, + # Every Sunday at 01:30. + "cleanup_old_translation_records": { + "task": "kitsune.wiki.tasks.cleanup_old_translation_records", + "schedule": crontab(hour="1", minute="30", day_of_week="0"), + }, } # Periodic tasks only for the prod environment. diff --git a/kitsune/settings.py b/kitsune/settings.py index 0568593ffa4..d9f3cb6180c 100644 --- a/kitsune/settings.py +++ b/kitsune/settings.py @@ -1401,6 +1401,9 @@ def setup_logger(name, formatter_str, level=logging.DEBUG): STALE_ANCHOR_RECORD_RETENTION_DAYS = config( "STALE_ANCHOR_RECORD_RETENTION_DAYS", default=90, cast=int ) +TRANSLATION_RECORD_RETENTION_DAYS = config( + "TRANSLATION_RECORD_RETENTION_DAYS", default=180, cast=int +) # Threshold for how long inactive users are allowed to remain in groups. INACTIVE_GROUP_MEMBER_RETENTION_DAYS = config( diff --git a/kitsune/wiki/admin.py b/kitsune/wiki/admin.py index c1164b048c0..97fdeb741b6 100644 --- a/kitsune/wiki/admin.py +++ b/kitsune/wiki/admin.py @@ -1,10 +1,19 @@ import itertools +import markdown from django.contrib import admin from django.urls import reverse from django.utils.html import format_html_join +from django.utils.safestring import mark_safe -from kitsune.wiki.models import Document, ImportantDate, Locale, PinnedArticleConfig +from kitsune.sumo.sanitize import RESTRICTED_HTML_ATTRIBUTES, RESTRICTED_HTML_TAGS, clean +from kitsune.wiki.models import ( + Document, + ImportantDate, + Locale, + PinnedArticleConfig, + RevisionTranslationRecord, +) class DocumentAdmin(admin.ModelAdmin): @@ -140,4 +149,79 @@ def used_by(self, obj): ) +class RevisionTranslationRecordAdmin(admin.ModelAdmin): + """Read-only view of the LLM explanations recorded for AI/hybrid translations.""" + + list_display = ("revision", "locale", "method", "trigger", "created") + list_filter = ("locale", "method", "trigger", "created") + search_fields = ("revision__document__title", "revision__document__slug") + fieldsets = ( + (None, {"fields": ("revision", "locale", "method", "trigger", "created")}), + ( + "LLM Explanations", + { + "fields": ( + "content_explanation", + "summary_explanation", + "keywords_explanation", + "title_explanation", + ) + }, + ), + ) + readonly_fields = ( + "revision", + "locale", + "method", + "trigger", + "created", + "content_explanation", + "summary_explanation", + "keywords_explanation", + "title_explanation", + ) + + @admin.display(description="Content") + def content_explanation(self, obj): + return self.render_explanation(obj, "content") + + @admin.display(description="Summary") + def summary_explanation(self, obj): + return self.render_explanation(obj, "summary") + + @admin.display(description="Keywords") + def keywords_explanation(self, obj): + return self.render_explanation(obj, "keywords") + + @admin.display(description="Title") + def title_explanation(self, obj): + return self.render_explanation(obj, "title") + + @staticmethod + def render_explanation(obj, key): + """Render a single attribute's explanation, or an em dash if absent.""" + text = (obj.explanation or {}).get(key) + if text: + # The LLM often uses markdown in its explanation. + html = markdown.markdown(text, extensions=["fenced_code"]) + return mark_safe( + clean(html, tags=RESTRICTED_HTML_TAGS, attributes=RESTRICTED_HTML_ATTRIBUTES) + ) + + return "—" + + def get_queryset(self, request): + return super().get_queryset(request).select_related("revision__document") + + def has_add_permission(self, request): + return False + + def has_change_permission(self, request, obj=None): + return False + + def has_delete_permission(self, request, obj=None): + return False + + admin.site.register(PinnedArticleConfig, PinnedArticleConfigAdmin) +admin.site.register(RevisionTranslationRecord, RevisionTranslationRecordAdmin) diff --git a/kitsune/wiki/migrations/0025_revisiontranslationrecord.py b/kitsune/wiki/migrations/0025_revisiontranslationrecord.py new file mode 100644 index 00000000000..69502bb49a3 --- /dev/null +++ b/kitsune/wiki/migrations/0025_revisiontranslationrecord.py @@ -0,0 +1,138 @@ +# Generated by Django 5.2.14 on 2026-06-24 17:40 + +import django.db.models.deletion +import kitsune.sumo.models +import kitsune.sumo.utils +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("wiki", "0024_alter_pinnedarticleconfig_pinned_articles"), + ] + + operations = [ + migrations.CreateModel( + name="RevisionTranslationRecord", + fields=[ + ( + "revision", + models.OneToOneField( + help_text="The translated revision this record belongs to.", + on_delete=django.db.models.deletion.CASCADE, + primary_key=True, + related_name="translation_record", + serialize=False, + to="wiki.revision", + ), + ), + ( + "locale", + kitsune.sumo.models.LocaleField( + choices=[ + ("af", "Afrikaans"), + ("ar", "عربي"), + ("az", "Azərbaycanca"), + ("bg", "Български"), + ("bm", "Bamanankan"), + ("bn", "বাংলা"), + ("bs", "Bosanski"), + ("ca", "català"), + ("cs", "Čeština"), + ("da", "Dansk"), + ("de", "Deutsch"), + ("ee", "Èʋegbe"), + ("el", "Ελληνικά"), + ("en-US", "English"), + ("es", "Español"), + ("et", "eesti keel"), + ("eu", "Euskara"), + ("fa", "فارسی"), + ("fi", "suomi"), + ("fr", "Français"), + ("fy-NL", "Frysk"), + ("ga-IE", "Gaeilge (Éire)"), + ("gl", "Galego"), + ("gn", "Avañe'ẽ"), + ("gu-IN", "ગુજરાતી"), + ("ha", "هَرْشَن هَوْسَ"), + ("he", "עברית"), + ("hi-IN", "हिन्दी (भारत)"), + ("hr", "Hrvatski"), + ("hu", "Magyar"), + ("dsb", "Dolnoserbšćina"), + ("hsb", "Hornjoserbsce"), + ("id", "Bahasa Indonesia"), + ("ig", "Asụsụ Igbo"), + ("it", "Italiano"), + ("ja", "日本語"), + ("ka", "ქართული"), + ("km", "ខ្មែរ"), + ("kn", "ಕನ್ನಡ"), + ("ko", "한국어"), + ("ln", "Lingála"), + ("lt", "lietuvių kalba"), + ("mg", "Malagasy"), + ("mk", "Македонски"), + ("ml", "മലയാളം"), + ("ms", "Bahasa Melayu"), + ("ne-NP", "नेपाली"), + ("nl", "Nederlands"), + ("no", "Norsk"), + ("pl", "Polski"), + ("pt-BR", "Português (do Brasil)"), + ("pt-PT", "Português (Europeu)"), + ("ro", "română"), + ("ru", "Русский"), + ("si", "සිංහල"), + ("sk", "slovenčina"), + ("sl", "slovenščina"), + ("sq", "Shqip"), + ("sr", "Српски"), + ("sw", "Kiswahili"), + ("sv", "Svenska"), + ("ta", "தமிழ்"), + ("ta-LK", "தமிழ் (இலங்கை)"), + ("te", "తెలుగు"), + ("th", "ไทย"), + ("tn", "Setswana"), + ("tr", "Türkçe"), + ("uk", "Українська"), + ("ur", "اُردو"), + ("vi", "Tiếng Việt"), + ("wo", "Wolof"), + ("xh", "isiXhosa"), + ("yo", "èdè Yorùbá"), + ("zh-CN", "中文 (简体)"), + ("zh-TW", "正體中文 (繁體)"), + ("zu", "isiZulu"), + ], + db_index=True, + default="en-US", + help_text="The locale of the translated revision.", + max_length=7, + ), + ), + ( + "explanation", + models.JSONField( + default=dict, + encoder=kitsune.sumo.utils.PrettyJSONEncoder, + help_text="Per-attribute LLM explanations (content, summary, keywords, title).", + ), + ), + ( + "trigger", + models.CharField(help_text="What triggered the translation.", max_length=32), + ), + ( + "method", + models.CharField(help_text="The translation method used.", max_length=10), + ), + ("created", models.DateTimeField(auto_now_add=True, db_index=True)), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/kitsune/wiki/models.py b/kitsune/wiki/models.py index 659a4b3683c..2306d0805e5 100644 --- a/kitsune/wiki/models.py +++ b/kitsune/wiki/models.py @@ -1106,6 +1106,49 @@ def __str__(self): return f"" +class RevisionTranslationRecord(ModelBase): + """ + Record of an LLM-generated AI or hybrid translation. + + Tied to the translated revision that was created. The translated content itself is + stored on the revision, so the explanation is the key piece here. The explanation is + useful for understanding how the LLM is interpreting and executing the translation + prompt, which can then inform prompt adjustments. + + Records are removed periodically by a Celery beat task once they are older than + settings.TRANSLATION_RECORD_RETENTION_DAYS. + """ + + revision = models.OneToOneField( + Revision, + on_delete=models.CASCADE, + related_name="translation_record", + primary_key=True, + help_text="The translated revision this record belongs to.", + ) + locale = LocaleField( + db_index=True, + help_text="The locale of the translated revision.", + ) + explanation = models.JSONField( + default=dict, + encoder=PrettyJSONEncoder, + help_text="Per-attribute LLM explanations (content, summary, keywords, title).", + ) + trigger = models.CharField( + max_length=32, + help_text="What triggered the translation.", + ) + method = models.CharField( + max_length=10, + help_text="The translation method used.", + ) + created = models.DateTimeField(auto_now_add=True, db_index=True) + + def __str__(self): + return f"Translation record for [{self.locale}] revision #{self.revision_id}" + + class HelpfulVote(ModelBase): """Helpful or Not Helpful vote on Revision.""" diff --git a/kitsune/wiki/strategies.py b/kitsune/wiki/strategies.py index cfe195070a2..267f81265fa 100644 --- a/kitsune/wiki/strategies.py +++ b/kitsune/wiki/strategies.py @@ -1,7 +1,7 @@ import json from abc import ABC, abstractmethod from dataclasses import dataclass, field -from enum import Enum +from enum import StrEnum from typing import Any from django.conf import settings @@ -16,10 +16,10 @@ ManualContentManager, WikiContentManager, ) -from kitsune.wiki.models import Revision +from kitsune.wiki.models import Revision, RevisionTranslationRecord -class TranslationTrigger(str, Enum): +class TranslationTrigger(StrEnum): """Available translation triggers.""" REVIEW_REVISION = "review_revision" @@ -144,10 +144,25 @@ def translate(self, l10n_request: TranslationRequest) -> TranslationResult: def _log_operation( self, l10n_request: TranslationRequest, translation_result: TranslationResult ) -> None: - # TODO: Future enhancement - log all operations to database - # This will track: revision_id, strategy_used, timestamps, outcomes - # self._log_operation(l10n_request, result) - pass + """Record the LLM's explanation for AI and hybrid translations. + + Only operations that produced a revision and carry an LLM explanation are + recorded (i.e. AI and hybrid). Manual operations have no explanation, so they + are skipped. + """ + explanation = translation_result.metadata.get("explanation") + if not (translation_result.success and translation_result.revision and explanation): + return + + RevisionTranslationRecord.objects.update_or_create( + revision=translation_result.revision, + defaults={ + "locale": l10n_request.target_locale, + "explanation": explanation, + "trigger": l10n_request.trigger, + "method": translation_result.method, + }, + ) @dataclass @@ -252,7 +267,9 @@ def translate( "translated_content": translated_content, } - rev = self.content_manager.create_revision(data, doc, send_notifications=send_notifications) + rev = self.content_manager.create_revision( + data, doc, send_notifications=send_notifications + ) if publish: rev = self.content_manager.publish_revision(rev, send_notifications=send_notifications) @@ -283,10 +300,13 @@ def __post_init__(self): self.content_manager = HybridContentManager() def translate(self, l10n_request: TranslationRequest) -> TranslationResult: - """Perform hybrid translation.""" - result = AITranslationStrategy().translate(l10n_request, publish=False) - result.method = TranslationMethod.HYBRID - return result + """Perform hybrid translation. + + The created revision is left unpublished, awaiting human review. The result's + method is the request's method (HYBRID), since AITranslationStrategy.translate + derives it from the request rather than hardcoding it. + """ + return AITranslationStrategy().translate(l10n_request, publish=False) class TranslationStrategyFactory: diff --git a/kitsune/wiki/tasks.py b/kitsune/wiki/tasks.py index 7b9863d9862..8a1e9fa0c8e 100644 --- a/kitsune/wiki/tasks.py +++ b/kitsune/wiki/tasks.py @@ -38,6 +38,7 @@ Locale, Revision, RevisionAnchorRecord, + RevisionTranslationRecord, SlugCollision, TitleCollision, resolves_to_document_view, @@ -389,7 +390,9 @@ def maybe_award_badge(badge_template: dict, year: int, user_id: int) -> bool: else: # l10n-badge qs = qs.exclude(document__locale=settings.WIKI_DEFAULT_LANGUAGE) - deleted_contributions_extra_kwargs.update(exclude_locale=settings.WIKI_DEFAULT_LANGUAGE) + deleted_contributions_extra_kwargs.update( + exclude_locale=settings.WIKI_DEFAULT_LANGUAGE + ) num_contributions = qs.count() + num_deleted_contributions( Revision, @@ -622,3 +625,16 @@ def cleanup_old_anchor_records() -> None: ) log.info(f"Deleted {num_deleted} stale anchor record(s).") + + +@shared_task +@skip_if_read_only_mode +def cleanup_old_translation_records() -> None: + """ + Deletes all RevisionTranslationRecord entries that were created more than + settings.TRANSLATION_RECORD_RETENTION_DAYS ago. + """ + cutoff = timezone.now() - timedelta(days=settings.TRANSLATION_RECORD_RETENTION_DAYS) + num_deleted, _ = RevisionTranslationRecord.objects.filter(created__lt=cutoff).delete() + + log.info(f"Deleted {num_deleted} old translation record(s).") diff --git a/kitsune/wiki/tests/test_strategies.py b/kitsune/wiki/tests/test_strategies.py new file mode 100644 index 00000000000..62bba569b77 --- /dev/null +++ b/kitsune/wiki/tests/test_strategies.py @@ -0,0 +1,197 @@ +from datetime import timedelta +from unittest import mock + +from django.test import TestCase, override_settings +from django.utils import timezone + +from kitsune.wiki.content_managers import AIContentManager +from kitsune.wiki.models import RevisionTranslationRecord +from kitsune.wiki.strategies import ( + AITranslationStrategy, + HybridTranslationStrategy, + TranslationMethod, + TranslationRequest, + TranslationResult, + TranslationStrategy, + TranslationTrigger, +) +from kitsune.wiki.tasks import cleanup_old_translation_records +from kitsune.wiki.tests import RevisionFactory + + +class LogOperationTests(TestCase): + """Tests for TranslationStrategy._log_operation (recording LLM explanations).""" + + def setUp(self): + self.strategy = TranslationStrategy() + self.source_revision = RevisionFactory() + self.target_revision = RevisionFactory() + self.explanation = { + "content": "Translated the body.", + "summary": "Translated the summary.", + "keywords": "Translated the keywords.", + "title": "Translated the title.", + } + + def _request(self, trigger=TranslationTrigger.TRANSLATE): + return TranslationRequest( + revision=self.source_revision, + trigger=trigger, + target_locale="fr", + method=TranslationMethod.AI, + ) + + def _result(self, **kwargs): + defaults = { + "success": True, + "method": TranslationMethod.AI, + "revision": self.target_revision, + "metadata": {"explanation": self.explanation}, + } + defaults.update(kwargs) + return TranslationResult(**defaults) + + def test_records_explanation_for_successful_ai_translation(self): + self.strategy._log_operation(self._request(), self._result()) + + record = RevisionTranslationRecord.objects.get(revision=self.target_revision) + self.assertEqual(record.locale, "fr") + self.assertEqual(record.explanation, self.explanation) + self.assertEqual(record.trigger, "translate") + self.assertEqual(record.method, "ai") + + def test_stores_enum_trigger_as_its_value(self): + # An enum member subclasses str, so it persists as its underlying value. + self.strategy._log_operation( + self._request(trigger=TranslationTrigger.STALE_TRANSLATION_UPDATE), self._result() + ) + + record = RevisionTranslationRecord.objects.get(revision=self.target_revision) + self.assertEqual(record.trigger, "stale_translation_update") + + def test_stores_string_trigger_unchanged(self): + # After an async round-trip through the celery task, the trigger arrives as a + # plain string rather than a TranslationTrigger member. + self.strategy._log_operation(self._request(trigger="initial_translation"), self._result()) + + record = RevisionTranslationRecord.objects.get(revision=self.target_revision) + self.assertEqual(record.trigger, "initial_translation") + + def test_no_record_without_explanation(self): + self.strategy._log_operation(self._request(), self._result(metadata={})) + + self.assertFalse(RevisionTranslationRecord.objects.exists()) + + def test_no_record_for_failed_operation(self): + self.strategy._log_operation(self._request(), self._result(success=False)) + + self.assertFalse(RevisionTranslationRecord.objects.exists()) + + def test_no_record_without_revision(self): + self.strategy._log_operation(self._request(), self._result(revision=None)) + + self.assertFalse(RevisionTranslationRecord.objects.exists()) + + def test_is_idempotent_and_updates_existing_record(self): + self.strategy._log_operation(self._request(), self._result()) + + new_explanation = {"content": "Revised explanation."} + self.strategy._log_operation( + self._request(), self._result(metadata={"explanation": new_explanation}) + ) + + records = RevisionTranslationRecord.objects.filter(revision=self.target_revision) + self.assertEqual(records.count(), 1) + self.assertEqual(records.get().explanation, new_explanation) + + +class AIAndHybridStrategyLogTests(TestCase): + """The AI and hybrid strategies record the LLM explanation for the translated revision.""" + + TRANSLATED_CONTENT = { + "content": {"translation": "Contenu", "explanation": "Explained content."}, + "summary": {"translation": "Résumé", "explanation": "Explained summary."}, + "keywords": {"translation": "mots", "explanation": "Explained keywords."}, + "title": {"translation": "Titre", "explanation": "Explained title."}, + } + + @mock.patch("kitsune.wiki.strategies.llm_translate") + def test_ai_translate_records_explanation(self, mock_translate): + mock_translate.return_value = self.TRANSLATED_CONTENT + target_revision = RevisionFactory() + strategy = AITranslationStrategy() + l10n_request = TranslationRequest( + revision=RevisionFactory(), + trigger=TranslationTrigger.TRANSLATE, + target_locale="fr", + method=TranslationMethod.AI, + ) + + with ( + mock.patch.object( + strategy.content_manager, "create_revision", return_value=target_revision + ), + mock.patch.object( + strategy.content_manager, "publish_revision", return_value=target_revision + ), + ): + result = strategy.translate(l10n_request) + + self.assertTrue(result.success) + record = RevisionTranslationRecord.objects.get(revision=target_revision) + self.assertEqual(record.locale, "fr") + self.assertEqual(record.method, "ai") + self.assertEqual(record.trigger, "translate") + self.assertEqual(record.explanation["content"], "Explained content.") + self.assertEqual(record.explanation["title"], "Explained title.") + + @mock.patch("kitsune.wiki.strategies.llm_translate") + def test_hybrid_translate_records_hybrid_method(self, mock_translate): + # The hybrid strategy delegates to AITranslationStrategy.translate, but the + # recorded method must reflect the request (hybrid), not the AI strategy that + # ran it. + mock_translate.return_value = self.TRANSLATED_CONTENT + target_revision = RevisionFactory() + l10n_request = TranslationRequest( + revision=RevisionFactory(), + trigger=TranslationTrigger.TRANSLATE, + target_locale="es", + method=TranslationMethod.HYBRID, + ) + + # The inner AITranslationStrategy builds its own AIContentManager, so patch at + # the class. publish_revision is never reached because hybrid passes publish=False. + with mock.patch.object(AIContentManager, "create_revision", return_value=target_revision): + result = HybridTranslationStrategy().translate(l10n_request) + + self.assertTrue(result.success) + record = RevisionTranslationRecord.objects.get(revision=target_revision) + self.assertEqual(record.locale, "es") + self.assertEqual(record.method, "hybrid") + + +class CleanupOldTranslationRecordsTests(TestCase): + """Tests for the cleanup_old_translation_records purge task.""" + + def _create_record(self, created): + record = RevisionTranslationRecord.objects.create( + revision=RevisionFactory(), + locale="fr", + explanation={"content": "explanation"}, + trigger="translate", + method="ai", + ) + # created is auto_now_add, so set it explicitly via a queryset update. + RevisionTranslationRecord.objects.filter(pk=record.pk).update(created=created) + return record + + @override_settings(TRANSLATION_RECORD_RETENTION_DAYS=180) + def test_deletes_only_records_older_than_retention(self): + now = timezone.now() + old = self._create_record(now - timedelta(days=181)) + recent = self._create_record(now - timedelta(days=179)) + + cleanup_old_translation_records() + + self.assertFalse(RevisionTranslationRecord.objects.filter(pk=old.pk).exists()) + self.assertTrue(RevisionTranslationRecord.objects.filter(pk=recent.pk).exists()) diff --git a/pyproject.toml b/pyproject.toml index bd7552a46d5..8421cb6f11c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ dependencies = [ "langchain-google-vertexai==3.2.3", "django-celery-beat>=2.8.1,<3", "django-treebeard>=5.2.2", + "markdown>=3.10.2", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index dd4eafc7c50..ff6e2188b85 100644 --- a/uv.lock +++ b/uv.lock @@ -1833,6 +1833,7 @@ dependencies = [ { name = "langchain" }, { name = "langchain-google-vertexai" }, { name = "lxml" }, + { name = "markdown" }, { name = "mkdocs" }, { name = "mkdocs-material" }, { name = "mozilla-django-oidc" }, @@ -1949,6 +1950,7 @@ requires-dist = [ { name = "langchain", specifier = "==1.3.9" }, { name = "langchain-google-vertexai", specifier = "==3.2.3" }, { name = "lxml", specifier = ">=6.0.1,<7" }, + { name = "markdown", specifier = ">=3.10.2" }, { name = "mkdocs", specifier = ">=1.5.3,<2" }, { name = "mkdocs-material", specifier = ">=9.5.3,<10" }, { name = "mozilla-django-oidc", specifier = "==4.0.0" },