commoncrawl · malteos · Jun 5, 2026 · May 22, 2026
diff --git a/scripts/backfill_supported_languages.py b/scripts/backfill_supported_languages.py
@@ -0,0 +1,206 @@
+"""Backfill the ``supported_languages`` field in existing ``summary.json`` files.
+
+Walks ``<source_dir>/<dataset>/<model>/summary.json``, groups paths by
+``model_id``, loads each model once via the registry, calls
+``discover_supported_languages()``, and writes the result (sorted ISO 639-3
+list, or JSON ``null`` for models whose support set is undefined) back into
+every matching summary.
+
+Run after ``pip install commonlid[<extras>]`` for whichever model classes
+you need to enumerate. Models whose extras are absent are reported as
+skipped and their summary files are left untouched (rather than clobbered
+with ``null``), so a partial-env run doesn't lose previously-correct entries.
+
+Skips files that already have a ``supported_languages`` key — even when its
+value is JSON ``null`` (that's a real answer for LLM-style models, not a
+placeholder). Pass ``--overwrite`` to refresh anyway.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger("backfill_supported_languages")
+
+
+def _discover_one(model_id: str) -> list[str] | None:
+    """Look up ``model_id`` in the registry and enumerate its supported languages.
+
+    The value of ``discover_supported_languages()`` comes back as a sorted
+    list. ``None`` is passed through unchanged: it is the "support set
+    undefined" sentinel and has to reach the summary file as JSON ``null``.
+    No exception is caught here, so lookup, import and load failures reach
+    the caller, which decides whether to skip the model's files.
+    """
+    from commonlid.core.registry import get_model
+
+    languages = get_model(model_id).discover_supported_languages()
+    # ``None`` and ``[]`` are different answers, so the sentinel is returned
+    # as-is rather than being coerced into a list.
+    return None if languages is None else sorted(languages)
+
+
+def _update_summary(
+    path: Path, value: list[str] | None, *, overwrite: bool, dry_run: bool
+) -> str:
+    """Set ``supported_languages`` in the summary at ``path``; return the outcome.
+
+    Outcomes: ``skipped-has-key``, ``skipped-dry-run``, ``written``. The file is
+    replaced via temp file + rename, so a failed write cannot truncate it.
+    """
+    summary = json.loads(path.read_text(encoding="utf-8"))
+    if "supported_languages" in summary and not overwrite:
+        return "skipped-has-key"
+    summary["supported_languages"] = value
+    if dry_run:
+        return "skipped-dry-run"
+    tmp_path = path.with_name(path.name + ".tmp")
+    payload = json.dumps(summary, indent=2, sort_keys=True) + "\n"
+    tmp_path.write_text(payload, encoding="utf-8")
+    tmp_path.replace(path)
+    return "written"
+
+
+def _collect_summaries(
+    source_dir: Path, only_models: list[str] | None
+) -> dict[str, list[Path]]:
+    """Map each model directory name to its sorted ``summary.json`` paths.
+
+    Only ``<dataset>/<model>/summary.json`` matches; falsy ``only_models`` keeps all.
+    """
+    by_model: dict[str, list[Path]] = defaultdict(list)
+    for summary_path in sorted(source_dir.glob("*/*/summary.json")):
+        if not only_models or summary_path.parent.name in only_models:
+            by_model[summary_path.parent.name].append(summary_path)
+    return by_model
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse CLI arguments, backfill every matching summary, return an exit code.
+
+    Returns 2 if ``--source-dir`` is not a directory, 1 if no summary matched,
+    and 0 otherwise; skipped models and per-file failures are only logged.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--source-dir",
+        type=Path,
+        default=Path("data/results"),
+        help="Root directory holding <dataset>/<model>/summary.json files.",
+    )
+    parser.add_argument(
+        "--model",
+        action="append",
+        default=None,
+        dest="models",
+        help="Restrict to this model id (repeatable). Default: every model_id found.",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Refresh files even if they already have a supported_languages key.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Discover support sets but do not modify any files.",
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Enable DEBUG-level logging."
+    )
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s %(levelname)s %(message)s",
+    )
+
+    source_dir: Path = args.source_dir
+    if not source_dir.is_dir():
+        logger.error("source dir %s not found", source_dir)
+        return 2
+
+    grouped = _collect_summaries(source_dir, args.models)
+    if not grouped:
+        logger.error(
+            "no summary.json files matched under %s (filters: --model=%s)",
+            source_dir,
+            args.models,
+        )
+        return 1
+
+    # Import the models package so every model_id registers itself with the
+    # registry before we look any of them up.
+    import commonlid.models  # noqa: F401
+
+    counts: dict[str, int] = defaultdict(int)
+    skipped_models: list[tuple[str, str]] = []
+    for model_id, paths in grouped.items():
+        try:
+            supported = _discover_one(model_id)
+            reason = "undefined support set" if supported is None else None
+        except KeyError:
+            # Unknown model_id: usually legacy imports of LLM runs (e.g. GPT-4o).
+            # There is no class we can ask, so persist the explicit "undefined"
+            # sentinel rather than skipping; the data layer treats it the same
+            # as a registered model whose discover() returned None.
+            # NOTE(review): a KeyError raised while loading or enumerating a
+            # registered model is indistinguishable here -- confirm that is OK.
+            supported = None
+            reason = "unknown model_id (legacy LLM import?)"
+        except Exception as exc:
+            # Load / import failures: skip rather than clobber. The user can
+            # rerun from an env that has the missing extras.
+            err = f"{type(exc).__name__}: {exc}"
+            logger.warning("skipping %s (%d file(s)) -- %s", model_id, len(paths), err)
+            skipped_models.append((model_id, err))
+            counts["skipped-model-error"] += len(paths)
+            continue
+
+        if supported is None:
+            logger.info(
+                "%s: %s -- writing JSON null to %d file(s)",
+                model_id,
+                reason,
+                len(paths),
+            )
+        else:
+            logger.info(
+                "%s: %d languages -- writing to %d file(s)",
+                model_id,
+                len(supported),
+                len(paths),
+            )
+
+        for path in paths:
+            try:
+                outcome = _update_summary(
+                    path, supported, overwrite=args.overwrite, dry_run=args.dry_run
+                )
+            except (OSError, ValueError, TypeError) as exc:
+                # ValueError: invalid JSON/UTF-8. TypeError: top level is not an object.
+                logger.warning("failed to update %s: %s", path, exc)
+                counts["error"] += 1
+                continue
+            counts[outcome] += 1
+            logger.debug("%s %s", outcome, path)
+
+    summary: dict[str, Any] = {
+        "models_processed": len(grouped) - len(skipped_models),
+        "models_skipped": len(skipped_models),
+        **counts,
+    }
+    logger.info("done: %s", json.dumps(summary))
+    for model_id, reason in skipped_models:
+        logger.info("  skipped model %s: %s", model_id, reason)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/commonlid/evaluation/evaluator.py b/src/commonlid/evaluation/evaluator.py
@@ -159,6 +159,21 @@ def _run_one(
         )
         n_with_gold = sum(1 for g in ytrue if g is not None)
         samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
+        # `None` here is meaningful: it tells downstream consumers that the
+        # model's support set is undefined (e.g. LLMs), distinct from a model
+        # that declared an empty set. Errors during discovery downgrade to
+        # the same "unknown" sentinel rather than crashing the run.
+        try:
+            supported = model.discover_supported_languages()
+        except Exception as exc:
+            logger.warning(
+                "%s   discover_supported_languages() raised %s: %s -- recording as None",
+                prefix,
+                type(exc).__name__,
+                exc,
+            )
+            supported = None
+        supported_languages = sorted(supported) if supported is not None else None
         result = Result(
             model_id=model.model_id,
             dataset_id=dataset.dataset_id,
@@ -170,6 +185,7 @@ def _run_one(
             limit=self.config.limit,
             timestamp=datetime.now(timezone.utc).isoformat(),
             commonlid_version=__version__,
+            supported_languages=supported_languages,
         )
 
         run_dir = self.config.output_dir / dataset.dataset_id / model.model_id

diff --git a/src/commonlid/evaluation/results.py b/src/commonlid/evaluation/results.py
@@ -13,12 +13,20 @@
 from commonlid.metrics.aggregate import macro_average, micro_average
 from commonlid.metrics.core import LanguageMetrics
 
-SCHEMA_VERSION = 2
+SCHEMA_VERSION = 3
 
 
 @dataclass(slots=True)
 class Result:
-    """Aggregate outcome of one model evaluated on one dataset."""
+    """Aggregate outcome of one model evaluated on one dataset.
+
+    ``supported_languages`` follows a tri-state convention shared with
+    :meth:`LIDModel.discover_supported_languages`: ``None`` means the
+    model's support set is undefined (e.g. LLM-based models that can be
+    prompted for any language), a list of ISO 639-3 codes is the closed
+    set the model declares, and an empty list is the degenerate "supports
+    zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
+    """
 
     model_id: str
     dataset_id: str
@@ -32,6 +40,7 @@ class Result:
     commonlid_version: str = ""
     python_version: str = field(default_factory=lambda: sys.version.split()[0])
     platform: str = field(default_factory=platform.platform)
+    supported_languages: list[str] | None = None
     extra: dict[str, Any] = field(default_factory=dict)
 
     def summary(self) -> dict[str, Any]:
@@ -52,6 +61,7 @@ def summary(self) -> dict[str, Any]:
             "macro": macro_average(self.per_language),
             "micro": micro_average(self.per_language),
             "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
+            "supported_languages": self.supported_languages,
             "extra": self.extra,
         }