From 5911b8bcf124ca807b00a0f00036bb4bb436ff7e Mon Sep 17 00:00:00 2001 From: malteos Date: Fri, 22 May 2026 13:33:17 +0200 Subject: [PATCH] feat(leaderboard): add (cov.) scoring-scope toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a radio between the dataset metadata and the results table per tab. "All samples" (default) preserves the current paper-headline view; "(cov.)" restricts the macro/micro/FPR/Languages columns to gold samples whose language is in the model's declared support set, matching the CommonLID paper's `(cov.)` column. Mechanics: persist `supported_languages` in `summary.json` (schema v3) — sorted ISO 639-3 list when the model can enumerate, JSON `null` for LLM-style models whose support set is undefined. The leaderboard data layer reuses `mean_stats_with_coverage` + `mean_false_positive_rate` to compute the cov fields per row; rows without a support set render em-dashes and sort to the bottom of the cov view. Backfill existing summary files via `scripts/backfill_supported_languages.py` (standalone, argparse) and publish them to the HF dataset to keep the deployed Space in sync. --- scripts/backfill_supported_languages.py | 206 ++++++++++++++++++++++++ src/commonlid/evaluation/evaluator.py | 16 ++ src/commonlid/evaluation/results.py | 14 +- src/commonlid/leaderboard/app.py | 162 ++++++++++++++++--- src/commonlid/leaderboard/data.py | 127 ++++++++++++++- tests/unit/test_leaderboard_data.py | 185 ++++++++++++++++++++- tests/unit/test_results_io.py | 30 ++++ 7 files changed, 714 insertions(+), 26 deletions(-) create mode 100644 scripts/backfill_supported_languages.py diff --git a/scripts/backfill_supported_languages.py b/scripts/backfill_supported_languages.py new file mode 100644 index 0000000..0a1c13d --- /dev/null +++ b/scripts/backfill_supported_languages.py @@ -0,0 +1,206 @@ +"""Backfill the ``supported_languages`` field in existing ``summary.json`` files. 
+ +Walks ``<source-dir>/<dataset_id>/<model_id>/summary.json``, groups paths by +``model_id``, loads each model once via the registry, calls +``discover_supported_languages()``, and writes the result (sorted ISO 639-3 +list, or JSON ``null`` for models whose support set is undefined) back into +every matching summary. + +Run after ``pip install commonlid[<extra>]`` for whichever model classes +you need to enumerate. Models whose extras are absent are reported as +skipped and their summary files are left untouched (rather than clobbered +with ``null``), so a partial-env run doesn't lose previously-correct entries. + +Skips files that already have a ``supported_languages`` key — even when its +value is JSON ``null`` (that's a real answer for LLM-style models, not a +placeholder). Pass ``--overwrite`` to refresh anyway. +""" + +from __future__ import annotations + +import argparse +import json +import logging +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +logger = logging.getLogger("backfill_supported_languages") + + +def _discover_one(model_id: str) -> list[str] | None: + """Instantiate ``model_id`` from the registry and call ``discover_supported_languages()``. + + Returns the sorted list, or ``None`` if the model declines to enumerate + (the canonical "support set undefined" sentinel — must round-trip as + JSON ``null``). Raises on import or load errors so callers can decide + whether to skip the file. 
+ """ + from commonlid.core.registry import get_model + + model = get_model(model_id) + supported = model.discover_supported_languages() + if supported is None: + return None + return sorted(supported) + + +def _update_summary( + path: Path, + value: list[str] | None, + *, + overwrite: bool, + dry_run: bool, +) -> str: + """Return one of ``written|skipped-has-key|skipped-dry-run``.""" + with path.open(encoding="utf-8") as f: + summary = json.load(f) + if "supported_languages" in summary and not overwrite: + return "skipped-has-key" + summary["supported_languages"] = value + if dry_run: + return "skipped-dry-run" + with path.open("w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, sort_keys=True) + f.write("\n") + return "written" + + +def _collect_summaries( + source_dir: Path, + only_models: list[str] | None, +) -> dict[str, list[Path]]: + """Group ``<dataset_id>/<model_id>/summary.json`` paths under ``source_dir`` by model_id.""" + grouped: dict[str, list[Path]] = defaultdict(list) + for path in sorted(source_dir.glob("*/*/summary.json")): + model_id = path.parent.name + if only_models and model_id not in only_models: + continue + grouped[model_id].append(path) + return grouped + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--source-dir", + type=Path, + default=Path("data/results"), + help="Root directory holding <dataset_id>/<model_id>/summary.json files.", + ) + parser.add_argument( + "--model", + action="append", + default=None, + dest="models", + help="Restrict to this model id (repeatable). 
Default: every model_id found.", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Refresh files even if they already have a supported_languages key.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Discover support sets but do not modify any files.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable DEBUG-level logging.", + ) + args = parser.parse_args(argv) + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", + ) + + source_dir: Path = args.source_dir + if not source_dir.is_dir(): + logger.error("source dir %s not found", source_dir) + return 2 + + grouped = _collect_summaries(source_dir, args.models) + if not grouped: + logger.error( + "no summary.json files matched under %s (filters: --model=%s)", + source_dir, + args.models, + ) + return 1 + + # Import the models package so every model_id registers itself with the + # registry before we look any of them up. + import commonlid.models # noqa: F401 + + counts: dict[str, int] = defaultdict(int) + skipped_models: list[tuple[str, str]] = [] + for model_id, paths in grouped.items(): + try: + supported = _discover_one(model_id) + reason = "undefined support set" if supported is None else None + except KeyError: + # Unknown model_id: usually legacy imports of LLM runs (e.g. GPT-4o). + # There is no class we can ask, so persist the explicit "undefined" + # sentinel rather than skipping; the data layer treats it the same + # as a registered model whose discover() returned None. + supported = None + reason = "unknown model_id (legacy LLM import?)" + except Exception as exc: + # Load / import failures: skip rather than clobber. The user can + # rerun from an env that has the missing extras. 
+ logger.warning( + "skipping %s (%d file(s)) -- %s: %s", + model_id, + len(paths), + type(exc).__name__, + exc, + ) + skipped_models.append((model_id, f"{type(exc).__name__}: {exc}")) + counts["skipped-model-error"] += len(paths) + continue + + if supported is None: + logger.info( + "%s: %s -- writing JSON null to %d file(s)", + model_id, + reason, + len(paths), + ) + else: + logger.info( + "%s: %d languages -- writing to %d file(s)", + model_id, + len(supported), + len(paths), + ) + + for path in paths: + try: + outcome = _update_summary( + path, supported, overwrite=args.overwrite, dry_run=args.dry_run + ) + except (OSError, json.JSONDecodeError) as exc: + logger.warning("failed to update %s: %s", path, exc) + counts["error"] += 1 + continue + counts[outcome] += 1 + logger.debug("%s %s", outcome, path) + + summary: dict[str, Any] = { + "models_processed": len(grouped) - len(skipped_models), + "models_skipped": len(skipped_models), + **counts, + } + logger.info("done: %s", json.dumps(summary)) + for model_id, reason in skipped_models: + logger.info(" skipped model %s: %s", model_id, reason) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/commonlid/evaluation/evaluator.py b/src/commonlid/evaluation/evaluator.py index 3474361..60cb9a4 100644 --- a/src/commonlid/evaluation/evaluator.py +++ b/src/commonlid/evaluation/evaluator.py @@ -159,6 +159,21 @@ def _run_one( ) n_with_gold = sum(1 for g in ytrue if g is not None) samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0 + # `None` here is meaningful: it tells downstream consumers that the + # model's support set is undefined (e.g. LLMs), distinct from a model + # that declared an empty set. Errors during discovery downgrade to + # the same "unknown" sentinel rather than crashing the run. 
+ try: + supported = model.discover_supported_languages() + except Exception as exc: + logger.warning( + "%s discover_supported_languages() raised %s: %s -- recording as None", + prefix, + type(exc).__name__, + exc, + ) + supported = None + supported_languages = sorted(supported) if supported is not None else None result = Result( model_id=model.model_id, dataset_id=dataset.dataset_id, @@ -170,6 +185,7 @@ def _run_one( limit=self.config.limit, timestamp=datetime.now(timezone.utc).isoformat(), commonlid_version=__version__, + supported_languages=supported_languages, ) run_dir = self.config.output_dir / dataset.dataset_id / model.model_id diff --git a/src/commonlid/evaluation/results.py b/src/commonlid/evaluation/results.py index 1b70a04..e81b390 100644 --- a/src/commonlid/evaluation/results.py +++ b/src/commonlid/evaluation/results.py @@ -13,12 +13,20 @@ from commonlid.metrics.aggregate import macro_average, micro_average from commonlid.metrics.core import LanguageMetrics -SCHEMA_VERSION = 2 +SCHEMA_VERSION = 3 @dataclass(slots=True) class Result: - """Aggregate outcome of one model evaluated on one dataset.""" + """Aggregate outcome of one model evaluated on one dataset. + + ``supported_languages`` follows a tri-state convention shared with + :meth:`LIDModel.discover_supported_languages`: ``None`` means the + model's support set is undefined (e.g. LLM-based models that can be + prompted for any language), a list of ISO 639-3 codes is the closed + set the model declares, and an empty list is the degenerate "supports + zero languages" case. The leaderboard's ``(cov.)`` view consumes this. 
+ """ model_id: str dataset_id: str @@ -32,6 +40,7 @@ class Result: commonlid_version: str = "" python_version: str = field(default_factory=lambda: sys.version.split()[0]) platform: str = field(default_factory=platform.platform) + supported_languages: list[str] | None = None extra: dict[str, Any] = field(default_factory=dict) def summary(self) -> dict[str, Any]: @@ -52,6 +61,7 @@ def summary(self) -> dict[str, Any]: "macro": macro_average(self.per_language), "micro": micro_average(self.per_language), "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())}, + "supported_languages": self.supported_languages, "extra": self.extra, } diff --git a/src/commonlid/leaderboard/app.py b/src/commonlid/leaderboard/app.py index 3d73b59..9c3f658 100644 --- a/src/commonlid/leaderboard/app.py +++ b/src/commonlid/leaderboard/app.py @@ -42,6 +42,20 @@ ) PAPER_URL = "https://arxiv.org/abs/2601.18026" +Scope = Literal["all", "cov"] + +#: Radio choices shown above each dataset's results table. +SCOPE_CHOICES: list[tuple[str, Scope]] = [ + ("Scores are calculated over the whole dataset.", "all"), + ( + "Scores are calculated on the subset of language varieties covered by the model. (cov.)", + "cov", + ), +] + +#: Sentinel string used when a row has no cov data (rendered as em-dash). +_NA_DISPLAY = "—" + #: Display columns in the headline table (in order). Macro F1 is the headline metric. _HEADLINE_COLUMNS: list[tuple[str, str]] = [ ("model_id", "Model"), @@ -51,6 +65,19 @@ ("n_languages", "Languages"), ("samples_per_second", "Samples/s"), ] + +#: Same columns, projected from the ``*_cov`` source fields. Display +#: labels stay identical so the table layout doesn't shift when the +#: scope radio is toggled. 
+_HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [ + ("model_id", "Model"), + ("macro_f1_cov", "Macro F1"), + ("micro_f1_cov", "Micro F1"), + ("mean_fpr_cov", "Mean FPR (%)"), + ("n_languages_cov", "Languages"), + ("samples_per_second", "Samples/s"), +] + #: Right-aligned numeric columns get the ``number`` Gradio datatype which #: pushes values to the right edge of the cell. _GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"] @@ -134,6 +161,46 @@ ] +#: Per-column human descriptions for the **(cov.)** view — same metrics, +#: but restricted to the model's declared support set. +_HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [ + ("Model", "Identifier of the language identification model."), + ( + "Macro F1", + "Unweighted mean of per-language F1 (x100) **restricted to languages the " + "model declares it supports** (paper `(cov.)` definition). Languages outside " + "the model's support set are excluded from the average — a model that covers " + "a small but accurate subset of the benchmark is no longer penalised for the " + "long tail of languages it never claimed to handle. **Higher is better.** " + f"Models without a declared support set show `{_NA_DISPLAY}`.", + ), + ( + "Micro F1", + "Sample-weighted F1 (x100) pooled over the **model-supported subset** of " + "gold samples only. **Higher is better.** " + f"`{_NA_DISPLAY}` when no support set is declared.", + ), + ( + "Mean FPR (%)", + "Mean per-language false-positive rate computed only on samples whose gold " + "language is in the model's support set; TN counts confusion across other " + "supported languages, not the long tail. **Lower is better.** " + f"`{_NA_DISPLAY}` when no support set is declared.", + ), + ( + "Languages", + "Number of model-supported languages that have at least one gold sample in " + "this dataset (`|supported ∩ gold|`). 
This is the size of the slice every " + "other `(cov.)` metric is averaged over.", + ), + ( + "Samples/s", + "Throughput during evaluation (samples processed per second). Unaffected by " + "the scope toggle — it is a model-property, not a metric.", + ), +] + + def _columns_help_markdown(items: list[tuple[str, str]]) -> str: """Render a (column, description) list as a Markdown bullet block.""" return "\n".join(f"- **{label}** — {desc}" for label, desc in items) @@ -157,30 +224,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]: return {"data": data, "headers": headers, "metadata": {"styling": styling}} -def _format_table(df: Any) -> Any: +def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str: + """Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``.""" + import pandas as pd + + if value is None or (isinstance(value, float) and pd.isna(value)): + return _NA_DISPLAY + return f"{float(value) * scale:.{decimals}f}" + + +def _format_table(df: Any, scope: Scope = "all") -> Any: """Project + format a results DataFrame for one Gradio tab. Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00`` not ``0``) so the rendered cells line up vertically; sort ordering is - preserved by sorting on the raw ``macro_f1`` *before* formatting. + preserved by sorting on the raw float *before* formatting. - Macro F1 / Micro F1 / Samples/s use **1 decimal**. - Mean FPR (%) uses **2 decimals**. + - In ``scope="cov"``, rows without ``supported_languages`` data render + em-dashes for every cov metric and sort to the bottom. 
""" import pandas as pd + columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS + display_labels = [label for _, label in columns] if df.empty: - return pd.DataFrame(columns=[label for _, label in _HEADLINE_COLUMNS]) + return pd.DataFrame(columns=display_labels) + out = df.copy() - # Sort on the raw float so the resulting order is correct; format only - # afterwards (string sort would order "10" before "9"). - out = out.sort_values("macro_f1", ascending=False, kind="stable").reset_index(drop=True) - out["macro_f1"] = (out["macro_f1"] * 100).map(lambda x: f"{x:.1f}") - out["micro_f1"] = (out["micro_f1"] * 100).map(lambda x: f"{x:.1f}") - out["mean_fpr"] = (out["mean_fpr"] * 100).map(lambda x: f"{x:.2f}") - out["samples_per_second"] = out["samples_per_second"].map(lambda x: f"{x:.1f}") - out = out[[k for k, _ in _HEADLINE_COLUMNS]] - out.columns = [label for _, label in _HEADLINE_COLUMNS] + source = {key: key for key, _ in columns} + sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"] + # ``na_position="last"`` sinks rows without cov data to the bottom of + # the (cov.) view; the "all" view has no NaNs in this column. 
+ out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last") + out = out.reset_index(drop=True) + + macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"] + micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"] + fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"] + langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"] + + out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100)) + out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100)) + out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100)) + out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0)) + out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1)) + + out = out[[k for k, _ in columns]] + out.columns = display_labels return out @@ -314,23 +406,30 @@ def _format_license(license_name: str, license_url: str | None) -> str: def _make_select_handler( dataset_id: str, - table: Any, snapshot_root: Path, ) -> Any: """Build the row-select callback as a closure over the captured state. + The callback looks up the clicked row in the *current* table value + (passed in via Gradio's event arg) so that switching the scope radio + and then clicking a row drills down the row at its post-toggle + position, not the row that would have been there before the swap. + Gradio inspects ``__defaults__`` when registering events, and comparing a DataFrame default against a type annotation hits an unimplemented arrow dtype path. A closure keeps the state out of the function signature. 
""" - def _on_select(evt: gr.SelectData) -> tuple[str, Any]: + def _on_select(table_value: Any, evt: gr.SelectData) -> tuple[str, Any]: if evt.index is None: return ("_Click a row to load per-language metrics._", None) row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index try: - model_id = table.iloc[row_idx]["Model"] - except (IndexError, KeyError): + data = table_value.get("data") if isinstance(table_value, dict) else None + if data is None: + return ("_Click a row to load per-language metrics._", None) + model_id = data[row_idx][0] + except (IndexError, KeyError, TypeError): return ("_Could not resolve clicked row._", None) per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id) return ( @@ -341,6 +440,19 @@ def _on_select(evt: gr.SelectData) -> tuple[str, Any]: return _on_select +def _make_scope_handler(sub_df: Any) -> Any: + """Build the scope-radio change callback: swap the table data + legend in lockstep.""" + + def _on_change(scope: Scope) -> tuple[Any, str]: + help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP + return ( + _styled_value(_format_table(sub_df, scope=scope)), + _columns_help_markdown(help_items), + ) + + return _on_change + + def build_app( *, repo_id: str = DEFAULT_REPO_ID, @@ -384,7 +496,7 @@ def build_app( with gr.Tab(label=tab_label): gr.Markdown(_dataset_metadata_markdown(dataset_id)) sub = df[df["dataset_id"] == dataset_id] - table = _format_table(sub) + table = _format_table(sub, scope="all") if table.empty: gr.Markdown( f"_No results for `{dataset_id}` in `{repo_id}` yet." 
@@ -394,6 +506,12 @@ def build_app( ) continue + scope_radio = gr.Radio( + choices=SCOPE_CHOICES, + value="all", + label="Scoring scope", + interactive=True, + ) leaderboard = gr.Dataframe( value=_styled_value(table), datatype=_HEADLINE_DATATYPES, @@ -402,7 +520,7 @@ def build_app( label=f"{dataset_id} — sorted by Macro F1", ) with gr.Accordion("What do these columns mean?", open=False): - gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP)) + legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP)) drilldown_label = gr.Markdown("_Click a row to load per-language metrics._") # Seed the drilldown grid with an empty DataFrame so the Component # has stable column headers before the first row click. @@ -415,8 +533,14 @@ def build_app( with gr.Accordion("What do these per-language columns mean?", open=False): gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP)) + scope_radio.change( + _make_scope_handler(sub), + inputs=[scope_radio], + outputs=[leaderboard, legend], + ) leaderboard.select( - _make_select_handler(dataset_id, table, snapshot_root), + _make_select_handler(dataset_id, snapshot_root), + inputs=[leaderboard], outputs=[drilldown_label, drilldown], ) gr.Markdown(footer) diff --git a/src/commonlid/leaderboard/data.py b/src/commonlid/leaderboard/data.py index 20b21a9..b59844f 100644 --- a/src/commonlid/leaderboard/data.py +++ b/src/commonlid/leaderboard/data.py @@ -14,11 +14,15 @@ import json import logging -from collections.abc import Iterable +import math +from collections.abc import Iterable, Mapping from dataclasses import asdict, dataclass from pathlib import Path from typing import Any +from commonlid.metrics.core import LanguageMetrics +from commonlid.metrics.fpr import mean_false_positive_rate, mean_stats_with_coverage + logger = logging.getLogger(__name__) DEFAULT_REPO_ID = "commoncrawl/commonlid-results" @@ -40,6 +44,14 @@ class LeaderboardRow: gold set. 
That's a model-property number, not a paper headline, and it stays consistent across rows: every model is reported on the same "what languages did you actually output here" basis. + + The ``*_cov`` mirror fields are the same metrics restricted to gold + samples whose language is in the model's declared support set + (``supported_languages``). They are ``None`` when no support set is + available — either the field is missing from ``summary.json`` (legacy + file), the field is JSON ``null`` (LLM-style models whose support set + is undefined), or the field is an empty list (degenerate "supports + zero languages"). All three render as em-dashes in the cov view. """ dataset_id: str @@ -57,6 +69,13 @@ class LeaderboardRow: commonlid_version: str timestamp: str is_imported: bool + supported_languages: list[str] | None + macro_f1_cov: float | None + macro_precision_cov: float | None + macro_recall_cov: float | None + micro_f1_cov: float | None + mean_fpr_cov: float | None + n_languages_cov: int | None def to_dict(self) -> dict[str, Any]: return asdict(self) @@ -68,10 +87,107 @@ def _safe_mean_fpr(per_language: dict[str, dict[str, Any]]) -> float: return sum(vals) / len(vals) if vals else 0.0 +def _hydrate_per_language( + per_language: Mapping[str, Mapping[str, Any]], +) -> dict[str, LanguageMetrics]: + """Reconstruct :class:`LanguageMetrics` objects from the serialised dict form.""" + out: dict[str, LanguageMetrics] = {} + for lang, m in per_language.items(): + out[lang] = LanguageMetrics( + gt_count=int(m.get("gt_count", 0)), + predictions=int(m.get("predictions", 0)), + correct=int(m.get("correct", 0)), + precision=float(m.get("precision", 0.0) or 0.0), + recall=float(m.get("recall", 0.0) or 0.0), + f1=float(m.get("f1", 0.0) or 0.0), + fpr=None if m.get("fpr") is None else float(m["fpr"]), + ) + return out + + +def _micro_average_over(rows: Mapping[str, LanguageMetrics]) -> tuple[float, float, float]: + """Pooled precision/recall/F1 over a (filtered) per-language slice. 
+ + Mirrors :func:`commonlid.metrics.aggregate.micro_average`'s + ``*_gold_only`` math but accepts a pre-filtered subset, which the + public helper does not. + """ + total_correct = sum(m.correct for m in rows.values()) + total_predictions = sum(m.predictions for m in rows.values()) + total_gold = sum(m.gt_count for m in rows.values()) + precision = total_correct / total_predictions if total_predictions > 0 else 0.0 + recall = total_correct / total_gold if total_gold > 0 else 0.0 + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) > 0 and not math.isclose(precision + recall, 0.0) + else 0.0 + ) + return precision, recall, f1 + + +def _compute_cov_fields( + per_language_raw: Mapping[str, Mapping[str, Any]], + supported: list[str] | None, +) -> dict[str, float | int | None]: + """Return the six cov-variant fields, or all ``None`` when no cov data. + + ``supported`` semantics: + + - ``None`` — model's support set is undefined (e.g. LLM); no cov data. + - ``[]`` — model declared zero supported languages; every cov metric + would divide by zero, so render as no-data. + - non-empty list — compute the cov metrics. + """ + none_result: dict[str, float | int | None] = { + "macro_f1_cov": None, + "macro_precision_cov": None, + "macro_recall_cov": None, + "micro_f1_cov": None, + "mean_fpr_cov": None, + "n_languages_cov": None, + } + if not supported: + return none_result + + supported_set = set(supported) + per_language = _hydrate_per_language(per_language_raw) + stats = mean_stats_with_coverage(per_language, model_supported_languages=supported_set) + cov = stats["cov"] + n_languages_cov = int(cov.get("cov_count", 0)) + if n_languages_cov == 0: + # Supported set has no overlap with the dataset's gold; nothing + # meaningful to report. 
+ return none_result + + cov_rows = { + lang: m for lang, m in per_language.items() if m.gt_count > 0 and lang in supported_set + } + _micro_precision, _micro_recall, micro_f1 = _micro_average_over(cov_rows) + mean_fpr_cov = mean_false_positive_rate(per_language, language_whitelist=supported_set) + + return { + "macro_f1_cov": float(cov["f1"]), + "macro_precision_cov": float(cov["precision"]), + "macro_recall_cov": float(cov["recall"]), + "micro_f1_cov": float(micro_f1), + "mean_fpr_cov": float(mean_fpr_cov), + "n_languages_cov": n_languages_cov, + } + + def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -> LeaderboardRow: macro = summary.get("macro", {}) micro = summary.get("micro", {}) extra = summary.get("extra", {}) or {} + per_language = summary.get("per_language", {}) or {} + + # Tri-state: missing key, JSON null, or list. Anything else (e.g. an + # accidentally-serialised set) collapses to "unknown". + raw_supported = summary.get("supported_languages") + supported: list[str] | None = list(raw_supported) if isinstance(raw_supported, list) else None + cov = _compute_cov_fields(per_language, supported) + + n_languages_cov = cov["n_languages_cov"] return LeaderboardRow( dataset_id=dataset_id, model_id=model_id, @@ -79,7 +195,7 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) - macro_precision=float(macro.get("precision_gold_only", 0.0)), macro_recall=float(macro.get("recall_gold_only", 0.0)), micro_f1=float(micro.get("f1_gold_only", 0.0)), - mean_fpr=_safe_mean_fpr(summary.get("per_language", {})), + mean_fpr=_safe_mean_fpr(per_language), n_languages=int(macro.get("n_languages_observed", 0)), n_samples=int(summary.get("n_samples", 0)), n_samples_with_gold=int(summary.get("n_samples_with_gold", 0)), @@ -88,6 +204,13 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) - commonlid_version=str(summary.get("commonlid_version", "")), timestamp=str(summary.get("timestamp", "")), 
is_imported=("imported_from" in extra), + supported_languages=supported, + macro_f1_cov=cov["macro_f1_cov"], + macro_precision_cov=cov["macro_precision_cov"], + macro_recall_cov=cov["macro_recall_cov"], + micro_f1_cov=cov["micro_f1_cov"], + mean_fpr_cov=cov["mean_fpr_cov"], + n_languages_cov=int(n_languages_cov) if n_languages_cov is not None else None, ) diff --git a/tests/unit/test_leaderboard_data.py b/tests/unit/test_leaderboard_data.py index 6f426f2..4e9be30 100644 --- a/tests/unit/test_leaderboard_data.py +++ b/tests/unit/test_leaderboard_data.py @@ -4,12 +4,16 @@ import json from pathlib import Path +from typing import Any import pytest pd = pytest.importorskip("pandas") +_UNSET = object() + + def _write_summary( root: Path, dataset_id: str, @@ -17,12 +21,14 @@ def _write_summary( *, macro_f1: float = 0.5, fpr_per_lang: dict[str, float | None] | None = None, + per_language: dict[str, dict[str, Any]] | None = None, + supported_languages: object = _UNSET, ) -> None: out = root / dataset_id / model_id out.mkdir(parents=True, exist_ok=True) fpr_per_lang = fpr_per_lang or {"eng": 0.01, "deu": 0.02} - summary = { - "schema_version": 2, + summary: dict[str, Any] = { + "schema_version": 3, "model_id": model_id, "dataset_id": dataset_id, "dataset_revision": "abc123", @@ -54,7 +60,8 @@ def _write_summary( "n_correct_observed": 65, "n_predictions_observed": 95, }, - "per_language": { + "per_language": per_language + or { lang: { "f1": 0.5, "precision": 0.5, @@ -68,6 +75,8 @@ def _write_summary( }, "extra": {}, } + if supported_languages is not _UNSET: + summary["supported_languages"] = supported_languages (out / "summary.json").write_text(json.dumps(summary), encoding="utf-8") @@ -312,3 +321,173 @@ def test_format_table_pads_zero_values_with_decimals(tmp_path: Path) -> None: row = _format_table(df).iloc[0] assert row["Macro F1"] == "0.0" assert row["Mean FPR (%)"] == "0.00" + + +# ----- (cov.) 
variant ---------------------------------------------------------
+
+
+def _per_language_block(
+    spec: dict[str, tuple[int, int, int, float]],
+) -> dict[str, dict[str, Any]]:
+    """Build a per_language dict from a compact (gt, pred, correct, fpr) spec."""
+    return {
+        lang: {
+            "gt_count": gt,
+            "predictions": pred,
+            "correct": correct,
+            "precision": (correct / pred) if pred else 0.0,
+            "recall": (correct / gt) if gt else 0.0,
+            "f1": (
+                2 * (correct / pred) * (correct / gt) / ((correct / pred) + (correct / gt))
+                if pred and gt and correct
+                else 0.0
+            ),
+            "fpr": fpr,
+        }
+        for lang, (gt, pred, correct, fpr) in spec.items()
+    }
+
+
+def test_row_cov_fields_filter_to_supported_languages(tmp_path: Path) -> None:
+    """With supported={eng,fra}, cov metrics exclude deu (an unsupported gold lang)."""
+    from commonlid.leaderboard.data import load_results
+
+    per_lang = _per_language_block({
+        "eng": (10, 10, 10, 0.01),  # perfect
+        "fra": (10, 10, 5, 0.02),  # P=0.5 R=0.5 F1=0.5
+        "deu": (10, 10, 0, 0.03),  # zero (excluded from cov)
+    })
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=per_lang,
+        supported_languages=["eng", "fra"],
+    )
+    df = load_results(local_dir=tmp_path)
+    row = df.iloc[0]
+    # Macro F1 cov = mean(F1 over {eng, fra}) = (1.0 + 0.5) / 2 = 0.75; "all" includes deu and drops.
+    assert row["macro_f1_cov"] == pytest.approx(0.75)
+    assert row["macro_f1_cov"] > row["macro_f1"]
+    # Micro F1 cov pools eng+fra: P = 15/20 = 0.75, R = 15/20 = 0.75, F1 = 0.75.
+    assert row["micro_f1_cov"] == pytest.approx(0.75)
+    # n_languages_cov is the supported-and-have-gold intersection size.
+    assert row["n_languages_cov"] == 2
+    # supported_languages round-trips as a sorted list.
+    assert row["supported_languages"] == ["eng", "fra"]
+
+
+@pytest.mark.parametrize(
+    ("kind", "supported"),
+    [
+        ("missing", _UNSET),  # legacy file, no key
+        ("null", None),  # LLM row, undefined support set
+        ("empty_list", []),  # degenerate "supports zero languages"
+    ],
+)
+def test_row_cov_fields_none_when_no_support_data(
+    tmp_path: Path, kind: str, supported: object
+) -> None:
+    from commonlid.leaderboard.data import load_results
+
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        f"M_{kind}",
+        supported_languages=supported,
+    )
+    df = load_results(local_dir=tmp_path)
+    row = df.iloc[0]
+    assert row["macro_f1_cov"] is None
+    assert row["macro_precision_cov"] is None
+    assert row["macro_recall_cov"] is None
+    assert row["micro_f1_cov"] is None
+    assert row["mean_fpr_cov"] is None
+    assert row["n_languages_cov"] is None
+    # supported_languages is preserved as-is in the row (None for both
+    # missing and null) so the UI can choose to show a tooltip later.
+    expected = None if supported is _UNSET or supported is None else supported
+    assert row["supported_languages"] == expected
+
+
+def test_row_cov_fields_none_when_supported_set_misses_all_gold(tmp_path: Path) -> None:
+    """If the model's support set has no overlap with the dataset's gold, cov collapses to None."""
+    from commonlid.leaderboard.data import load_results
+
+    per_lang = _per_language_block({"eng": (10, 10, 8, 0.01)})
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=per_lang,
+        supported_languages=["xyz", "qqq"],  # no overlap
+    )
+    df = load_results(local_dir=tmp_path)
+    row = df.iloc[0]
+    assert row["macro_f1_cov"] is None
+    assert row["n_languages_cov"] is None
+
+
+def test_format_table_cov_scope_renders_em_dashes(tmp_path: Path) -> None:
+    """Rows without ``supported_languages`` show em-dashes and sort to the bottom."""
+    pytest.importorskip("gradio")
+    from commonlid.leaderboard.app import _format_table
+    from commonlid.leaderboard.data import load_results
+
+    per_lang = _per_language_block({
+        "eng": (10, 10, 10, 0.01),
+        "fra": (10, 10, 5, 0.02),
+    })
+    # Model with a declared support set -> real cov numbers.
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "WITH_SUPPORT",
+        per_language=per_lang,
+        supported_languages=["eng", "fra"],
+    )
+    # LLM-style row -> JSON null -> em-dashes in cov view.
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "NO_SUPPORT",
+        per_language=per_lang,
+        supported_languages=None,
+    )
+    df = load_results(local_dir=tmp_path)
+    cov_table = _format_table(df, scope="cov")
+    # Real-cov row should sort above the em-dash row.
+    assert list(cov_table["Model"]) == ["WITH_SUPPORT", "NO_SUPPORT"]
+    assert cov_table.iloc[0]["Macro F1"] == "75.0"
+    assert cov_table.iloc[1]["Macro F1"] == "—"
+    assert cov_table.iloc[1]["Languages"] == "—"
+    # Samples/s is unaffected by the toggle (it's a model property).
+    assert cov_table.iloc[0]["Samples/s"] == "1234.5"
+    assert cov_table.iloc[1]["Samples/s"] == "1234.5"
+
+
+def test_scope_radio_change_swaps_table_and_legend(tmp_path: Path) -> None:
+    """The scope-change handler returns a fresh styled table + legend Markdown."""
+    pytest.importorskip("gradio")
+    from commonlid.leaderboard.app import (
+        _HEADLINE_COLUMN_HELP_COV,
+        _columns_help_markdown,
+        _make_scope_handler,
+    )
+    from commonlid.leaderboard.data import load_results
+
+    per_lang = _per_language_block({"eng": (10, 10, 10, 0.01), "fra": (10, 10, 5, 0.02)})
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=per_lang,
+        supported_languages=["eng", "fra"],
+    )
+    df = load_results(local_dir=tmp_path)
+    handler = _make_scope_handler(df)
+    table_payload, legend = handler("cov")
+    assert legend == _columns_help_markdown(_HEADLINE_COLUMN_HELP_COV)
+    assert table_payload["headers"][1] == "Macro F1"
+    # Row data is the cov computation, not the "all" one.
+    assert table_payload["data"][0][1] == "75.0"
diff --git a/tests/unit/test_results_io.py b/tests/unit/test_results_io.py
index 6fff8ca..d83e446 100644
--- a/tests/unit/test_results_io.py
+++ b/tests/unit/test_results_io.py
@@ -61,3 +61,33 @@ def test_write_predictions_roundtrip(tmp_path) -> None:
     write_predictions(rows, path)
     lines = [json.loads(line) for line in path.read_text().splitlines()]
     assert lines == rows
+
+
+def test_summary_supported_languages_roundtrips_list() -> None:
+    """A declared support set ships through ``summary()`` as the sorted list."""
+    r = _result()
+    r.supported_languages = ["eng", "fra"]
+    assert _result().summary()["supported_languages"] is None  # default still None
+    assert r.summary()["supported_languages"] == ["eng", "fra"]
+
+
+def test_summary_supported_languages_roundtrips_none(tmp_path) -> None:
+    """``None`` means *undefined* (e.g. LLM rows) and must serialize as JSON ``null``.
+
+    Round-trip via write_summary -> load_summary guards against an accidental
+    coercion to ``[]``, which has a distinct semantic meaning (a model that
+    declared zero supported languages).
+    """
+    path = tmp_path / "sum.json"
+    write_summary(_result(), path)
+    raw = path.read_text(encoding="utf-8")
+    # Parse the raw file, not load_summary(): ``[]`` would be wrong but type-compatible.
+    assert json.loads(raw)["supported_languages"] is None
+    assert load_summary(path)["supported_languages"] is None
+
+
+def test_summary_supported_languages_preserves_empty_list() -> None:
+    """``[]`` is degenerate but real -- distinct from ``None``."""
+    r = _result()
+    r.supported_languages = []
+    assert r.summary()["supported_languages"] == []