Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions scripts/backfill_supported_languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
"""Backfill the ``supported_languages`` field in existing ``summary.json`` files.

Walks ``<source_dir>/<dataset>/<model>/summary.json``, groups paths by
``model_id``, loads each model once via the registry, calls
``discover_supported_languages()``, and writes the result (sorted ISO 639-3
list, or JSON ``null`` for models whose support set is undefined) back into
every matching summary.

Run after ``pip install commonlid[<extras>]`` for whichever model classes
you need to enumerate. Models whose extras are absent are reported as
skipped and their summary files are left untouched (rather than clobbered
with ``null``), so a partial-env run doesn't lose previously-correct entries.
Model ids the registry does not know at all (typically legacy LLM imports)
are the exception: their summaries receive an explicit JSON ``null``.

Skips files that already have a ``supported_languages`` key — even when its
value is JSON ``null`` (that's a real answer for LLM-style models, not a
placeholder). Pass ``--overwrite`` to refresh anyway.
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any

logger = logging.getLogger("backfill_supported_languages")


def _discover_one(model_id: str) -> list[str] | None:
    """Look up ``model_id`` in the registry and ask it for its support set.

    The result of ``discover_supported_languages()`` is returned as a sorted
    list. A ``None`` answer is passed through unchanged: it is the "support
    set undefined" sentinel and has to survive as JSON ``null``.

    Anything raised while importing the registry, resolving the model, or
    running discovery propagates to the caller, which decides whether the
    affected files should be skipped.
    """
    # Imported lazily so merely importing this script does not pull in the
    # registry.
    from commonlid.core.registry import get_model

    languages = get_model(model_id).discover_supported_languages()
    return None if languages is None else sorted(languages)


def _update_summary(
path: Path,
value: list[str] | None,
*,
overwrite: bool,
dry_run: bool,
) -> str:
"""Return one of ``written|skipped-has-key|skipped-dry-run``."""
with path.open(encoding="utf-8") as f:
summary = json.load(f)
if "supported_languages" in summary and not overwrite:
return "skipped-has-key"
summary["supported_languages"] = value
if dry_run:
return "skipped-dry-run"
with path.open("w", encoding="utf-8") as f:
json.dump(summary, f, indent=2, sort_keys=True)
f.write("\n")
return "written"


def _collect_summaries(
source_dir: Path,
only_models: list[str] | None,
) -> dict[str, list[Path]]:
"""Group ``<dataset>/<model>/summary.json`` paths under ``source_dir`` by model_id."""
grouped: dict[str, list[Path]] = defaultdict(list)
for path in sorted(source_dir.glob("*/*/summary.json")):
model_id = path.parent.name
if only_models and model_id not in only_models:
continue
grouped[model_id].append(path)
return grouped


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the backfill script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=Path("data/results"),
        help="Root directory holding <dataset>/<model>/summary.json files.",
    )
    parser.add_argument(
        "--model",
        action="append",
        default=None,
        dest="models",
        help="Restrict to this model id (repeatable). Default: every model_id found.",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Refresh files even if they already have a supported_languages key.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Discover support sets but do not modify any files.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable DEBUG-level logging.",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Backfill ``supported_languages`` into every matching ``summary.json``.

    Summary paths are grouped by model id, each model's support set is
    discovered once, and the result is written to all of that model's files.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``2`` if ``--source-dir`` is not a directory, ``1`` if no summary file
        matched, and ``0`` otherwise. A ``0`` exit does not mean every file
        was updated: skipped models and per-file failures are reported in the
        log and in the final counts only.
    """
    args = _build_parser().parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    source_dir: Path = args.source_dir
    if not source_dir.is_dir():
        logger.error("source dir %s not found", source_dir)
        return 2

    grouped = _collect_summaries(source_dir, args.models)
    if not grouped:
        logger.error(
            "no summary.json files matched under %s (filters: --model=%s)",
            source_dir,
            args.models,
        )
        return 1

    # Import the models package so every model_id registers itself with the
    # registry before we look any of them up.
    import commonlid.models  # noqa: F401

    counts: dict[str, int] = defaultdict(int)
    skipped_models: list[tuple[str, str]] = []
    for model_id, paths in grouped.items():
        try:
            supported = _discover_one(model_id)
            reason = "undefined support set" if supported is None else None
        except KeyError:
            # Unknown model_id: usually legacy imports of LLM runs (e.g. GPT-4o).
            # There is no class we can ask, so persist the explicit "undefined"
            # sentinel rather than skipping; the data layer treats it the same
            # as a registered model whose discover() returned None.
            # NOTE(review): a KeyError raised from inside model loading or
            # discovery would also land here and write null -- confirm the
            # registry is the only source of KeyError on this path.
            supported = None
            reason = "unknown model_id (legacy LLM import?)"
        except Exception as exc:
            # Load / import failures: skip rather than clobber. The user can
            # rerun from an env that has the missing extras.
            logger.warning(
                "skipping %s (%d file(s)) -- %s: %s",
                model_id,
                len(paths),
                type(exc).__name__,
                exc,
            )
            skipped_models.append((model_id, f"{type(exc).__name__}: {exc}"))
            counts["skipped-model-error"] += len(paths)
            continue

        if supported is None:
            logger.info(
                "%s: %s -- writing JSON null to %d file(s)",
                model_id,
                reason,
                len(paths),
            )
        else:
            logger.info(
                "%s: %d languages -- writing to %d file(s)",
                model_id,
                len(supported),
                len(paths),
            )

        for path in paths:
            try:
                outcome = _update_summary(
                    path, supported, overwrite=args.overwrite, dry_run=args.dry_run
                )
            except (OSError, ValueError, TypeError) as exc:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError
                # (invalid UTF-8); TypeError covers a summary whose top-level
                # JSON value is not an object. One bad file must not abort the
                # rest of the backfill.
                logger.warning("failed to update %s: %s", path, exc)
                counts["error"] += 1
                continue
            counts[outcome] += 1
            logger.debug("%s %s", outcome, path)

    summary: dict[str, Any] = {
        "models_processed": len(grouped) - len(skipped_models),
        "models_skipped": len(skipped_models),
        **counts,
    }
    logger.info("done: %s", json.dumps(summary))
    for model_id, reason in skipped_models:
        logger.info("  skipped model %s: %s", model_id, reason)
    return 0


# Script entry point: run main() and hand its return value to sys.exit() as
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
16 changes: 16 additions & 0 deletions src/commonlid/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,21 @@ def _run_one(
)
n_with_gold = sum(1 for g in ytrue if g is not None)
samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
# `None` here is meaningful: it tells downstream consumers that the
# model's support set is undefined (e.g. LLMs), distinct from a model
# that declared an empty set. Errors during discovery downgrade to
# the same "unknown" sentinel rather than crashing the run.
try:
supported = model.discover_supported_languages()
except Exception as exc:
logger.warning(
"%s discover_supported_languages() raised %s: %s -- recording as None",
prefix,
type(exc).__name__,
exc,
)
supported = None
supported_languages = sorted(supported) if supported is not None else None
result = Result(
model_id=model.model_id,
dataset_id=dataset.dataset_id,
Expand All @@ -170,6 +185,7 @@ def _run_one(
limit=self.config.limit,
timestamp=datetime.now(timezone.utc).isoformat(),
commonlid_version=__version__,
supported_languages=supported_languages,
)

run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
Expand Down
14 changes: 12 additions & 2 deletions src/commonlid/evaluation/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@
from commonlid.metrics.aggregate import macro_average, micro_average
from commonlid.metrics.core import LanguageMetrics

SCHEMA_VERSION = 2
SCHEMA_VERSION = 3


@dataclass(slots=True)
class Result:
"""Aggregate outcome of one model evaluated on one dataset."""
"""Aggregate outcome of one model evaluated on one dataset.

``supported_languages`` follows a tri-state convention shared with
:meth:`LIDModel.discover_supported_languages`: ``None`` means the
model's support set is undefined (e.g. LLM-based models that can be
prompted for any language), a list of ISO 639-3 codes is the closed
set the model declares, and an empty list is the degenerate "supports
zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
"""

model_id: str
dataset_id: str
Expand All @@ -32,6 +40,7 @@ class Result:
commonlid_version: str = ""
python_version: str = field(default_factory=lambda: sys.version.split()[0])
platform: str = field(default_factory=platform.platform)
supported_languages: list[str] | None = None
extra: dict[str, Any] = field(default_factory=dict)

def summary(self) -> dict[str, Any]:
Expand All @@ -52,6 +61,7 @@ def summary(self) -> dict[str, Any]:
"macro": macro_average(self.per_language),
"micro": micro_average(self.per_language),
"per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
"supported_languages": self.supported_languages,
"extra": self.extra,
}

Expand Down
Loading
Loading