From 5911b8bcf124ca807b00a0f00036bb4bb436ff7e Mon Sep 17 00:00:00 2001
From: malteos <git@i.mieo.de>
Date: Fri, 22 May 2026 13:33:17 +0200
Subject: [PATCH] feat(leaderboard): add (cov.) scoring-scope toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a scoring-scope radio between the dataset metadata and the results
table on each tab. The default option (scores over the whole dataset)
preserves the current paper-headline view; the "(cov.)" option restricts
the macro/micro/FPR/Languages columns to gold samples whose language is
in the model's declared support set, matching the CommonLID paper's
`(cov.)` column.

Mechanics: persist `supported_languages` in `summary.json` (schema v3) —
sorted ISO 639-3 list when the model can enumerate, JSON `null` for
LLM-style models whose support set is undefined. The leaderboard data
layer reuses `mean_stats_with_coverage` + `mean_false_positive_rate` to
compute the cov fields per row; rows without a support set render
em-dashes and sort to the bottom of the cov view.

Backfill existing summary files via `scripts/backfill_supported_languages.py`
(standalone, argparse) and publish them to the HF dataset to keep the
deployed Space in sync.
---
 scripts/backfill_supported_languages.py | 206 ++++++++++++++++++++++++
 src/commonlid/evaluation/evaluator.py   |  16 ++
 src/commonlid/evaluation/results.py     |  14 +-
 src/commonlid/leaderboard/app.py        | 162 ++++++++++++++++---
 src/commonlid/leaderboard/data.py       | 127 ++++++++++++++-
 tests/unit/test_leaderboard_data.py     | 185 ++++++++++++++++++++-
 tests/unit/test_results_io.py           |  30 ++++
 7 files changed, 714 insertions(+), 26 deletions(-)
 create mode 100644 scripts/backfill_supported_languages.py
diff --git a/scripts/backfill_supported_languages.py b/scripts/backfill_supported_languages.py
new file mode 100644
index 0000000..0a1c13d
--- /dev/null
+++ b/scripts/backfill_supported_languages.py
@@ -0,0 +1,206 @@
+"""Backfill the ``supported_languages`` field in existing ``summary.json`` files.
+
+Walks ``<source_dir>/<dataset>/<model>/summary.json``, groups paths by
+``model_id``, loads each model once via the registry, calls
+``discover_supported_languages()``, and writes the result (sorted ISO 639-3
+list, or JSON ``null`` for models whose support set is undefined) back into
+every matching summary.
+
+Run after ``pip install commonlid[<extras>]`` for whichever model classes
+you need to enumerate. Models whose extras are absent are reported as
+skipped and their summary files are left untouched (rather than clobbered
+with ``null``), so a partial-env run doesn't lose previously-correct entries.
+
+Skips files that already have a ``supported_languages`` key — even when its
+value is JSON ``null`` (that's a real answer for LLM-style models, not a
+placeholder). Pass ``--overwrite`` to refresh anyway.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger("backfill_supported_languages")
+
+
+def _discover_one(model_id: str) -> list[str] | None:
+    """Look up ``model_id`` in the registry and ask it for its supported languages.
+
+    A model that declines to enumerate yields ``None`` -- the canonical
+    "support set undefined" sentinel, which must round-trip as JSON
+    ``null``; otherwise the declared codes are returned sorted. Nothing is
+    caught here: import and load errors propagate so the caller can decide
+    whether to skip the affected files.
+    """
+    from commonlid.core.registry import get_model
+
+    declared = get_model(model_id).discover_supported_languages()
+    if declared is not None:
+        return sorted(declared)
+    return None
+
+
+def _update_summary(
+    path: Path,
+    value: list[str] | None,
+    *,
+    overwrite: bool,
+    dry_run: bool,
+) -> str:
+    """Set ``supported_languages``; return one of ``written|skipped-has-key|skipped-dry-run``."""
+    summary = json.loads(path.read_text(encoding="utf-8"))
+    if "supported_languages" in summary and not overwrite:
+        return "skipped-has-key"
+    if dry_run:
+        return "skipped-dry-run"
+    summary["supported_languages"] = value
+    # Build the full JSON text before opening the file for writing: opening
+    # with "w" truncates immediately, so a failure while serialising would
+    # otherwise leave a half-written summary behind.
+    path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    return "written"
+
+
+def _collect_summaries(
+    source_dir: Path,
+    only_models: list[str] | None,
+) -> dict[str, list[Path]]:
+    """Index every ``*/*/summary.json`` below ``source_dir`` by its model directory name."""
+    wanted = set(only_models or ())
+    by_model: dict[str, list[Path]] = defaultdict(list)
+    for summary_path in sorted(source_dir.glob("*/*/summary.json")):
+        model_dir = summary_path.parent.name
+        if not wanted or model_dir in wanted:
+            by_model[model_dir].append(summary_path)
+    return by_model
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: returns 2 if the source dir is missing, 1 if nothing matched, else 0."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--source-dir",
+        type=Path,
+        default=Path("data/results"),
+        help="Root directory holding <dataset>/<model>/summary.json files.",
+    )
+    parser.add_argument(
+        "--model",
+        action="append",
+        default=None,
+        dest="models",
+        help="Restrict to this model id (repeatable). Default: every model_id found.",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Refresh files even if they already have a supported_languages key.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Discover support sets but do not modify any files.",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable DEBUG-level logging.",
+    )
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s %(levelname)s %(message)s",
+    )
+
+    source_dir: Path = args.source_dir
+    if not source_dir.is_dir():
+        logger.error("source dir %s not found", source_dir)
+        return 2
+
+    grouped = _collect_summaries(source_dir, args.models)
+    if not grouped:
+        logger.error(
+            "no summary.json files matched under %s (filters: --model=%s)",
+            source_dir,
+            args.models,
+        )
+        return 1
+
+    # Importing the models package registers every model_id before any registry lookup.
+    import commonlid.models  # noqa: F401
+
+    counts: dict[str, int] = defaultdict(int)
+    skipped_models: list[tuple[str, str]] = []
+    for model_id, paths in grouped.items():
+        try:
+            supported = _discover_one(model_id)
+            reason = "undefined support set" if supported is None else None
+        except KeyError:
+            # Unknown model_id: usually legacy imports of LLM runs (e.g. GPT-4o).
+            # There is no class to ask, so persist the explicit "undefined" sentinel
+            # rather than skipping; the data layer treats it like discover() -> None.
+            # NOTE(review): a KeyError raised while loading/enumerating a registered
+            # model also lands here and is written as null -- confirm acceptable.
+            supported = None
+            reason = "unknown model_id (legacy LLM import?)"
+        except Exception as exc:
+            # Load / import failures: skip rather than clobber; rerun with the missing extras.
+            logger.warning(
+                "skipping %s (%d file(s)) -- %s: %s",
+                model_id,
+                len(paths),
+                type(exc).__name__,
+                exc,
+            )
+            skipped_models.append((model_id, f"{type(exc).__name__}: {exc}"))
+            counts["skipped-model-error"] += len(paths)
+            continue
+
+        if supported is None:
+            logger.info(
+                "%s: %s -- writing JSON null to %d file(s)",
+                model_id,
+                reason,
+                len(paths),
+            )
+        else:
+            logger.info(
+                "%s: %d languages -- writing to %d file(s)",
+                model_id,
+                len(supported),
+                len(paths),
+            )
+
+        for path in paths:
+            try:
+                outcome = _update_summary(
+                    path, supported, overwrite=args.overwrite, dry_run=args.dry_run
+                )
+            except (OSError, json.JSONDecodeError) as exc:
+                logger.warning("failed to update %s: %s", path, exc)
+                counts["error"] += 1
+                continue
+            counts[outcome] += 1
+            logger.debug("%s %s", outcome, path)
+
+    summary: dict[str, Any] = {
+        "models_processed": len(grouped) - len(skipped_models),
+        "models_skipped": len(skipped_models),
+        **counts,
+    }
+    logger.info("done: %s", json.dumps(summary))
+    for model_id, reason in skipped_models:
+        logger.info("  skipped model %s: %s", model_id, reason)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/commonlid/evaluation/evaluator.py b/src/commonlid/evaluation/evaluator.py
index 3474361..60cb9a4 100644
--- a/src/commonlid/evaluation/evaluator.py
+++ b/src/commonlid/evaluation/evaluator.py
@@ -159,6 +159,21 @@ def _run_one(
         )
         n_with_gold = sum(1 for g in ytrue if g is not None)
         samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
+        # `None` here is meaningful: it tells downstream consumers that the
+        # model's support set is undefined (e.g. LLMs), distinct from a model
+        # that declared an empty set. Errors during discovery downgrade to
+        # the same "unknown" sentinel rather than crashing the run.
+        try:
+            supported = model.discover_supported_languages()
+        except Exception as exc:
+            logger.warning(
+                "%s   discover_supported_languages() raised %s: %s -- recording as None",
+                prefix,
+                type(exc).__name__,
+                exc,
+            )
+            supported = None
+        supported_languages = sorted(supported) if supported is not None else None
         result = Result(
             model_id=model.model_id,
             dataset_id=dataset.dataset_id,
@@ -170,6 +185,7 @@ def _run_one(
             limit=self.config.limit,
             timestamp=datetime.now(timezone.utc).isoformat(),
             commonlid_version=__version__,
+            supported_languages=supported_languages,
         )
 
         run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
diff --git a/src/commonlid/evaluation/results.py b/src/commonlid/evaluation/results.py
index 1b70a04..e81b390 100644
--- a/src/commonlid/evaluation/results.py
+++ b/src/commonlid/evaluation/results.py
@@ -13,12 +13,20 @@
 from commonlid.metrics.aggregate import macro_average, micro_average
 from commonlid.metrics.core import LanguageMetrics
 
-SCHEMA_VERSION = 2
+SCHEMA_VERSION = 3
 
 
 @dataclass(slots=True)
 class Result:
-    """Aggregate outcome of one model evaluated on one dataset."""
+    """Aggregate outcome of one model evaluated on one dataset.
+
+    ``supported_languages`` follows a tri-state convention shared with
+    :meth:`LIDModel.discover_supported_languages`: ``None`` means the
+    model's support set is undefined (e.g. LLM-based models that can be
+    prompted for any language), a list of ISO 639-3 codes is the closed
+    set the model declares, and an empty list is the degenerate "supports
+    zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
+    """
 
     model_id: str
     dataset_id: str
@@ -32,6 +40,7 @@ class Result:
     commonlid_version: str = ""
     python_version: str = field(default_factory=lambda: sys.version.split()[0])
     platform: str = field(default_factory=platform.platform)
+    supported_languages: list[str] | None = None
     extra: dict[str, Any] = field(default_factory=dict)
 
     def summary(self) -> dict[str, Any]:
@@ -52,6 +61,7 @@ def summary(self) -> dict[str, Any]:
             "macro": macro_average(self.per_language),
             "micro": micro_average(self.per_language),
             "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
+            "supported_languages": self.supported_languages,
             "extra": self.extra,
         }
 
diff --git a/src/commonlid/leaderboard/app.py b/src/commonlid/leaderboard/app.py
index 3d73b59..9c3f658 100644
--- a/src/commonlid/leaderboard/app.py
+++ b/src/commonlid/leaderboard/app.py
@@ -42,6 +42,20 @@
 )
 PAPER_URL = "https://arxiv.org/abs/2601.18026"
 
+Scope = Literal["all", "cov"]
+
+#: Radio choices shown above each dataset's results table.
+SCOPE_CHOICES: list[tuple[str, Scope]] = [
+    ("Scores are calculated over the whole dataset.", "all"),
+    (
+        "Scores are calculated on the subset of language varieties covered by the model. (cov.)",
+        "cov",
+    ),
+]
+
+#: Sentinel string used when a row has no cov data (rendered as em-dash).
+_NA_DISPLAY = "—"
+
 #: Display columns in the headline table (in order). Macro F1 is the headline metric.
 _HEADLINE_COLUMNS: list[tuple[str, str]] = [
     ("model_id", "Model"),
@@ -51,6 +65,19 @@
     ("n_languages", "Languages"),
     ("samples_per_second", "Samples/s"),
 ]
+
+#: Same columns, projected from the ``*_cov`` source fields. Display
+#: labels stay identical so the table layout doesn't shift when the
+#: scope radio is toggled.
+_HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
+    ("model_id", "Model"),
+    ("macro_f1_cov", "Macro F1"),
+    ("micro_f1_cov", "Micro F1"),
+    ("mean_fpr_cov", "Mean FPR (%)"),
+    ("n_languages_cov", "Languages"),
+    ("samples_per_second", "Samples/s"),
+]
+
 #: Right-aligned numeric columns get the ``number`` Gradio datatype which
 #: pushes values to the right edge of the cell.
 _GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
@@ -134,6 +161,46 @@
 ]
 
 
+#: Per-column human descriptions for the **(cov.)** view — same metrics,
+#: but restricted to the model's declared support set.
+_HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
+    ("Model", "Identifier of the language identification model."),
+    (
+        "Macro F1",
+        "Unweighted mean of per-language F1 (x100) **restricted to languages the "
+        "model declares it supports** (paper `(cov.)` definition). Languages outside "
+        "the model's support set are excluded from the average — a model that covers "
+        "a small but accurate subset of the benchmark is no longer penalised for the "
+        "long tail of languages it never claimed to handle. **Higher is better.** "
+        f"Models without a declared support set show `{_NA_DISPLAY}`.",
+    ),
+    (
+        "Micro F1",
+        "Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
+        "gold samples only. **Higher is better.** "
+        f"`{_NA_DISPLAY}` when no support set is declared.",
+    ),
+    (
+        "Mean FPR (%)",
+        "Mean per-language false-positive rate computed only on samples whose gold "
+        "language is in the model's support set; TN counts confusion across other "
+        "supported languages, not the long tail. **Lower is better.** "
+        f"`{_NA_DISPLAY}` when no support set is declared.",
+    ),
+    (
+        "Languages",
+        "Number of model-supported languages that have at least one gold sample in "
+        "this dataset (`|supported ∩ gold|`). This is the size of the slice every "
+        "other `(cov.)` metric is averaged over.",
+    ),
+    (
+        "Samples/s",
+        "Throughput during evaluation (samples processed per second). Unaffected by "
+        "the scope toggle — it is a model-property, not a metric.",
+    ),
+]
+
+
 def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
     """Render a (column, description) list as a Markdown bullet block."""
     return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
@@ -157,30 +224,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
     return {"data": data, "headers": headers, "metadata": {"styling": styling}}
 
 
-def _format_table(df: Any) -> Any:
+def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
+    """Format ``value * scale`` to ``decimals`` places; em-dash for ``None``/``NaN``."""
+    import pandas as pd
+
+    if value is not None and not (isinstance(value, float) and pd.isna(value)):
+        return f"{float(value) * scale:.{decimals}f}"
+    return _NA_DISPLAY
+
+
+def _format_table(df: Any, scope: Scope = "all") -> Any:
     """Project + format a results DataFrame for one Gradio tab.
 
     Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
     not ``0``) so the rendered cells line up vertically; sort ordering is
-    preserved by sorting on the raw ``macro_f1`` *before* formatting.
+    preserved by sorting on the raw float *before* formatting.
 
     - Macro F1 / Micro F1 / Samples/s use **1 decimal**.
     - Mean FPR (%) uses **2 decimals**.
+    - In ``scope="cov"``, rows without ``supported_languages`` data render
+      em-dashes for every cov metric and sort to the bottom.
     """
     import pandas as pd
 
+    columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
+    display_labels = [label for _, label in columns]
     if df.empty:
-        return pd.DataFrame(columns=[label for _, label in _HEADLINE_COLUMNS])
+        return pd.DataFrame(columns=display_labels)
+
     out = df.copy()
-    # Sort on the raw float so the resulting order is correct; format only
-    # afterwards (string sort would order "10" before "9").
-    out = out.sort_values("macro_f1", ascending=False, kind="stable").reset_index(drop=True)
-    out["macro_f1"] = (out["macro_f1"] * 100).map(lambda x: f"{x:.1f}")
-    out["micro_f1"] = (out["micro_f1"] * 100).map(lambda x: f"{x:.1f}")
-    out["mean_fpr"] = (out["mean_fpr"] * 100).map(lambda x: f"{x:.2f}")
-    out["samples_per_second"] = out["samples_per_second"].map(lambda x: f"{x:.1f}")
-    out = out[[k for k, _ in _HEADLINE_COLUMNS]]
-    out.columns = [label for _, label in _HEADLINE_COLUMNS]
+    source = {key: key for key, _ in columns}
+    sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
+    # ``na_position="last"`` sinks rows without cov data to the bottom of
+    # the (cov.) view; the "all" view has no NaNs in this column.
+    out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
+    out = out.reset_index(drop=True)
+
+    macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
+    micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
+    fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
+    langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
+
+    out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
+    out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
+    out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
+    out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
+    out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
+
+    out = out[[k for k, _ in columns]]
+    out.columns = display_labels
     return out
 
 
@@ -314,23 +406,30 @@ def _format_license(license_name: str, license_url: str | None) -> str:
 
 def _make_select_handler(
     dataset_id: str,
-    table: Any,
     snapshot_root: Path,
 ) -> Any:
     """Build the row-select callback as a closure over the captured state.
 
+    The callback looks up the clicked row in the *current* table value
+    (passed in via Gradio's event arg) so that switching the scope radio
+    and then clicking a row drills down the row at its post-toggle
+    position, not the row that would have been there before the swap.
+
     Gradio inspects ``__defaults__`` when registering events, and comparing a
     DataFrame default against a type annotation hits an unimplemented arrow
     dtype path. A closure keeps the state out of the function signature.
     """
 
-    def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
+    def _on_select(table_value: Any, evt: gr.SelectData) -> tuple[str, Any]:
         if evt.index is None:
             return ("_Click a row to load per-language metrics._", None)
         row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
         try:
-            model_id = table.iloc[row_idx]["Model"]
-        except (IndexError, KeyError):
+            data = table_value.get("data") if isinstance(table_value, dict) else None
+            if data is None:
+                return ("_Click a row to load per-language metrics._", None)
+            model_id = data[row_idx][0]
+        except (IndexError, KeyError, TypeError):
             return ("_Could not resolve clicked row._", None)
         per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
         return (
@@ -341,6 +440,19 @@ def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
     return _on_select
 
 
+def _make_scope_handler(sub_df: Any) -> Any:
+    """Return the scope-radio callback that re-renders the table and its legend together."""
+
+    def _on_change(scope: Scope) -> tuple[Any, str]:
+        # Rebuild the table for the requested scope, then pair it with the matching legend.
+        table_value = _styled_value(_format_table(sub_df, scope=scope))
+        if scope == "cov":
+            return table_value, _columns_help_markdown(_HEADLINE_COLUMN_HELP_COV)
+        return table_value, _columns_help_markdown(_HEADLINE_COLUMN_HELP)
+
+    return _on_change
+
+
 def build_app(
     *,
     repo_id: str = DEFAULT_REPO_ID,
@@ -384,7 +496,7 @@ def build_app(
                 with gr.Tab(label=tab_label):
                     gr.Markdown(_dataset_metadata_markdown(dataset_id))
                     sub = df[df["dataset_id"] == dataset_id]
-                    table = _format_table(sub)
+                    table = _format_table(sub, scope="all")
                     if table.empty:
                         gr.Markdown(
                             f"_No results for `{dataset_id}` in `{repo_id}` yet."
@@ -394,6 +506,12 @@ def build_app(
                         )
                         continue
 
+                    scope_radio = gr.Radio(
+                        choices=SCOPE_CHOICES,
+                        value="all",
+                        label="Scoring scope",
+                        interactive=True,
+                    )
                     leaderboard = gr.Dataframe(
                         value=_styled_value(table),
                         datatype=_HEADLINE_DATATYPES,
@@ -402,7 +520,7 @@ def build_app(
                         label=f"{dataset_id} — sorted by Macro F1",
                     )
                     with gr.Accordion("What do these columns mean?", open=False):
-                        gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
+                        legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
                     drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
                     # Seed the drilldown grid with an empty DataFrame so the Component
                     # has stable column headers before the first row click.
@@ -415,8 +533,14 @@ def build_app(
                     with gr.Accordion("What do these per-language columns mean?", open=False):
                         gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
 
+                    scope_radio.change(
+                        _make_scope_handler(sub),
+                        inputs=[scope_radio],
+                        outputs=[leaderboard, legend],
+                    )
                     leaderboard.select(
-                        _make_select_handler(dataset_id, table, snapshot_root),
+                        _make_select_handler(dataset_id, snapshot_root),
+                        inputs=[leaderboard],
                         outputs=[drilldown_label, drilldown],
                     )
         gr.Markdown(footer)
diff --git a/src/commonlid/leaderboard/data.py b/src/commonlid/leaderboard/data.py
index 20b21a9..b59844f 100644
--- a/src/commonlid/leaderboard/data.py
+++ b/src/commonlid/leaderboard/data.py
@@ -14,11 +14,15 @@
 
 import json
 import logging
-from collections.abc import Iterable
+import math
+from collections.abc import Iterable, Mapping
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any
 
+from commonlid.metrics.core import LanguageMetrics
+from commonlid.metrics.fpr import mean_false_positive_rate, mean_stats_with_coverage
+
 logger = logging.getLogger(__name__)
 
 DEFAULT_REPO_ID = "commoncrawl/commonlid-results"
@@ -40,6 +44,14 @@ class LeaderboardRow:
     gold set. That's a model-property number, not a paper headline, and
     it stays consistent across rows: every model is reported on the same
     "what languages did you actually output here" basis.
+
+    The ``*_cov`` mirror fields are the same metrics restricted to gold
+    samples whose language is in the model's declared support set
+    (``supported_languages``). They are ``None`` when no support set is
+    available — either the field is missing from ``summary.json`` (legacy
+    file), the field is JSON ``null`` (LLM-style models whose support set
+    is undefined), or the field is an empty list (degenerate "supports
+    zero languages"). All three render as em-dashes in the cov view.
     """
 
     dataset_id: str
@@ -57,6 +69,13 @@ class LeaderboardRow:
     commonlid_version: str
     timestamp: str
     is_imported: bool
+    supported_languages: list[str] | None
+    macro_f1_cov: float | None
+    macro_precision_cov: float | None
+    macro_recall_cov: float | None
+    micro_f1_cov: float | None
+    mean_fpr_cov: float | None
+    n_languages_cov: int | None
 
     def to_dict(self) -> dict[str, Any]:
         return asdict(self)
@@ -68,10 +87,107 @@ def _safe_mean_fpr(per_language: dict[str, dict[str, Any]]) -> float:
     return sum(vals) / len(vals) if vals else 0.0
 
 
+def _hydrate_per_language(
+    per_language: Mapping[str, Mapping[str, Any]],
+) -> dict[str, LanguageMetrics]:
+    """Rebuild :class:`LanguageMetrics` rows; missing or ``null`` numbers become zero."""
+    return {
+        lang: LanguageMetrics(
+            gt_count=int(m.get("gt_count") or 0),
+            predictions=int(m.get("predictions") or 0),
+            correct=int(m.get("correct") or 0),
+            precision=float(m.get("precision") or 0.0),
+            recall=float(m.get("recall") or 0.0),
+            f1=float(m.get("f1") or 0.0),
+            fpr=None if m.get("fpr") is None else float(m["fpr"]),
+        )
+        for lang, m in per_language.items()
+    }
+
+
+def _micro_average_over(rows: Mapping[str, LanguageMetrics]) -> tuple[float, float, float]:
+    """Return pooled ``(precision, recall, f1)`` for an already-filtered per-language slice.
+
+    Mirrors the ``*_gold_only`` math of
+    :func:`commonlid.metrics.aggregate.micro_average`, but works on a
+    subset chosen by the caller, which the public helper does not accept.
+    """
+    correct = predicted = gold = 0
+    for metrics in rows.values():
+        correct += metrics.correct
+        predicted += metrics.predictions
+        gold += metrics.gt_count
+    precision = correct / predicted if predicted > 0 else 0.0
+    recall = correct / gold if gold > 0 else 0.0
+    denominator = precision + recall
+    if denominator > 0 and not math.isclose(denominator, 0.0):
+        return precision, recall, 2 * precision * recall / denominator
+    return precision, recall, 0.0
+
+
+def _compute_cov_fields(
+    per_language_raw: Mapping[str, Mapping[str, Any]],
+    supported: list[str] | None,
+) -> dict[str, float | int | None]:
+    """Compute the six ``*_cov`` fields, or return them all as ``None`` without cov data.
+
+    How ``supported`` is interpreted:
+
+    - ``None`` -- the support set is undefined (e.g. LLM); no cov data.
+    - ``[]`` -- zero declared languages; every cov metric would divide
+      by zero, so the row is reported as no-data.
+    - non-empty list -- the cov metrics are computed.
+    """
+    no_data: dict[str, float | int | None] = {
+        "macro_f1_cov": None,
+        "macro_precision_cov": None,
+        "macro_recall_cov": None,
+        "micro_f1_cov": None,
+        "mean_fpr_cov": None,
+        "n_languages_cov": None,
+    }
+    if not supported:
+        return no_data
+
+    whitelist = set(supported)
+    by_lang = _hydrate_per_language(per_language_raw)
+    stats = mean_stats_with_coverage(by_lang, model_supported_languages=whitelist)
+    coverage = stats["cov"]
+    covered_count = int(coverage.get("cov_count", 0))
+    if covered_count == 0:
+        # The helper reported no covered languages for this dataset, so
+        # there is nothing meaningful to show in the cov view.
+        return no_data
+
+    in_scope = {
+        code: row for code, row in by_lang.items() if code in whitelist and row.gt_count > 0
+    }
+    pooled_f1 = _micro_average_over(in_scope)[2]
+    fpr_in_scope = mean_false_positive_rate(by_lang, language_whitelist=whitelist)
+
+    return {
+        "macro_f1_cov": float(coverage["f1"]),
+        "macro_precision_cov": float(coverage["precision"]),
+        "macro_recall_cov": float(coverage["recall"]),
+        "micro_f1_cov": float(pooled_f1),
+        "mean_fpr_cov": float(fpr_in_scope),
+        "n_languages_cov": covered_count,
+    }
+
+
 def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -> LeaderboardRow:
     macro = summary.get("macro", {})
     micro = summary.get("micro", {})
     extra = summary.get("extra", {}) or {}
+    per_language = summary.get("per_language", {}) or {}
+
+    # Tri-state: missing key, JSON null, or list. Anything else (e.g. an
+    # accidentally-serialised set) collapses to "unknown".
+    raw_supported = summary.get("supported_languages")
+    supported: list[str] | None = list(raw_supported) if isinstance(raw_supported, list) else None
+    cov = _compute_cov_fields(per_language, supported)
+
+    n_languages_cov = cov["n_languages_cov"]
     return LeaderboardRow(
         dataset_id=dataset_id,
         model_id=model_id,
@@ -79,7 +195,7 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
         macro_precision=float(macro.get("precision_gold_only", 0.0)),
         macro_recall=float(macro.get("recall_gold_only", 0.0)),
         micro_f1=float(micro.get("f1_gold_only", 0.0)),
-        mean_fpr=_safe_mean_fpr(summary.get("per_language", {})),
+        mean_fpr=_safe_mean_fpr(per_language),
         n_languages=int(macro.get("n_languages_observed", 0)),
         n_samples=int(summary.get("n_samples", 0)),
         n_samples_with_gold=int(summary.get("n_samples_with_gold", 0)),
@@ -88,6 +204,13 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
         commonlid_version=str(summary.get("commonlid_version", "")),
         timestamp=str(summary.get("timestamp", "")),
         is_imported=("imported_from" in extra),
+        supported_languages=supported,
+        macro_f1_cov=cov["macro_f1_cov"],
+        macro_precision_cov=cov["macro_precision_cov"],
+        macro_recall_cov=cov["macro_recall_cov"],
+        micro_f1_cov=cov["micro_f1_cov"],
+        mean_fpr_cov=cov["mean_fpr_cov"],
+        n_languages_cov=int(n_languages_cov) if n_languages_cov is not None else None,
     )
 
 
diff --git a/tests/unit/test_leaderboard_data.py b/tests/unit/test_leaderboard_data.py
index 6f426f2..4e9be30 100644
--- a/tests/unit/test_leaderboard_data.py
+++ b/tests/unit/test_leaderboard_data.py
@@ -4,12 +4,16 @@
 
 import json
 from pathlib import Path
+from typing import Any
 
 import pytest
 
 pd = pytest.importorskip("pandas")
 
 
+_UNSET = object()  # sentinel: leave "supported_languages" out of the file (None is written as null)
+
+
 def _write_summary(
     root: Path,
     dataset_id: str,
@@ -17,12 +21,14 @@ def _write_summary(
     *,
     macro_f1: float = 0.5,
     fpr_per_lang: dict[str, float | None] | None = None,
+    per_language: dict[str, dict[str, Any]] | None = None,
+    supported_languages: object = _UNSET,
 ) -> None:
     out = root / dataset_id / model_id
     out.mkdir(parents=True, exist_ok=True)
     fpr_per_lang = fpr_per_lang or {"eng": 0.01, "deu": 0.02}
-    summary = {
-        "schema_version": 2,
+    summary: dict[str, Any] = {
+        "schema_version": 3,
         "model_id": model_id,
         "dataset_id": dataset_id,
         "dataset_revision": "abc123",
@@ -54,7 +60,8 @@ def _write_summary(
             "n_correct_observed": 65,
             "n_predictions_observed": 95,
         },
-        "per_language": {
+        "per_language": per_language
+        or {
             lang: {
                 "f1": 0.5,
                 "precision": 0.5,
@@ -68,6 +75,8 @@ def _write_summary(
         },
         "extra": {},
     }
+    if supported_languages is not _UNSET:
+        summary["supported_languages"] = supported_languages
     (out / "summary.json").write_text(json.dumps(summary), encoding="utf-8")
 
 
@@ -312,3 +321,173 @@ def test_format_table_pads_zero_values_with_decimals(tmp_path: Path) -> None:
     row = _format_table(df).iloc[0]
     assert row["Macro F1"] == "0.0"
     assert row["Mean FPR (%)"] == "0.00"
+
+
+# ----- (cov.) variant ---------------------------------------------------------
+
+
+def _per_language_block(
+    spec: dict[str, tuple[int, int, int, float]],
+) -> dict[str, dict[str, Any]]:
+    """Expand ``{lang: (gt, pred, correct, fpr)}`` into per_language metric dicts."""
+    block: dict[str, dict[str, Any]] = {}
+    for lang, (gt, pred, correct, fpr) in spec.items():
+        precision = (correct / pred) if pred else 0.0
+        recall = (correct / gt) if gt else 0.0
+        # Harmonic mean of P and R; any zero count yields 0.0 instead of dividing by zero.
+        f1 = 2 * precision * recall / (precision + recall) if pred and gt and correct else 0.0
+        block[lang] = {
+            "gt_count": gt,
+            "predictions": pred,
+            "correct": correct,
+            "precision": precision,
+            "recall": recall,
+            "f1": f1,
+            "fpr": fpr,
+        }
+    return block
+
+
+def test_row_cov_fields_filter_to_supported_languages(tmp_path: Path) -> None:
+    """Cov metrics only count gold languages inside the declared support set."""
+    from commonlid.leaderboard.data import load_results
+
+    supported = ["eng", "fra"]
+    spec = {
+        "eng": (10, 10, 10, 0.01),  # P=R=F1=1.0
+        "fra": (10, 10, 5, 0.02),  # P=R=F1=0.5
+        "deu": (10, 10, 0, 0.03),  # F1=0.0; not in `supported`, so outside cov
+    }
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=_per_language_block(spec),
+        supported_languages=supported,
+    )
+    row = load_results(local_dir=tmp_path).iloc[0]
+
+    # Macro: mean F1 over {eng, fra} = (1.0 + 0.5) / 2, above the all-samples value.
+    assert row["macro_f1_cov"] == pytest.approx(0.75)
+    assert row["macro_f1_cov"] > row["macro_f1"]
+    # Micro: pooled over eng+fra, P = R = 15/20, hence F1 = 0.75.
+    assert row["micro_f1_cov"] == pytest.approx(0.75)
+    # Both supported languages have gold samples, and the support list survives loading.
+    assert row["n_languages_cov"] == 2
+    assert row["supported_languages"] == supported
+
+
+@pytest.mark.parametrize(
+    ("kind", "supported"),
+    [
+        ("missing", _UNSET),  # key absent, as in a legacy summary file
+        ("null", None),  # explicit JSON null: support set undefined (LLM-style row)
+        ("empty_list", []),  # declared, but lists zero languages
+    ],
+)
+def test_row_cov_fields_none_when_no_support_data(
+    tmp_path: Path, kind: str, supported: object
+) -> None:
+    """Every cov column is ``None`` when no usable support set is available."""
+    from commonlid.leaderboard.data import load_results
+
+    _write_summary(tmp_path, "commonlid", f"M_{kind}", supported_languages=supported)
+    row = load_results(local_dir=tmp_path).iloc[0]
+
+    cov_columns = (
+        "macro_f1_cov",
+        "macro_precision_cov",
+        "macro_recall_cov",
+        "micro_f1_cov",
+        "mean_fpr_cov",
+        "n_languages_cov",
+    )
+    for column in cov_columns:
+        assert row[column] is None, column
+    # The row keeps the declared value itself: ``None`` for a missing key or
+    # JSON null, the (empty) list otherwise.
+    expected = supported if isinstance(supported, list) else None
+    assert row["supported_languages"] == expected
+
+
+def test_row_cov_fields_none_when_supported_set_misses_all_gold(tmp_path: Path) -> None:
+    """A support set disjoint from the gold languages leaves the cov fields as ``None``."""
+    from commonlid.leaderboard.data import load_results
+
+    gold_only_eng = _per_language_block({"eng": (10, 10, 8, 0.01)})
+    disjoint_support = ["xyz", "qqq"]  # neither code appears among the gold languages
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=gold_only_eng,
+        supported_languages=disjoint_support,
+    )
+    row = load_results(local_dir=tmp_path).iloc[0]
+    for column in ("macro_f1_cov", "n_languages_cov"):
+        assert row[column] is None, column
+
+
+def test_format_table_cov_scope_renders_em_dashes(tmp_path: Path) -> None:
+    """In the cov view, rows lacking a support set render em-dashes and rank last."""
+    pytest.importorskip("gradio")
+    from commonlid.leaderboard.app import _format_table
+    from commonlid.leaderboard.data import load_results
+
+    per_lang = _per_language_block({
+        "eng": (10, 10, 10, 0.01),
+        "fra": (10, 10, 5, 0.02),
+    })
+    # Same per-language stats for both models; only the support declaration differs:
+    # a list yields real cov numbers, JSON null (LLM-style row) yields em-dashes.
+    declared_support: dict[str, list[str] | None] = {
+        "WITH_SUPPORT": ["eng", "fra"],
+        "NO_SUPPORT": None,
+    }
+    for model_id, supported in declared_support.items():
+        _write_summary(
+            tmp_path,
+            "commonlid",
+            model_id,
+            per_language=per_lang,
+            supported_languages=supported,
+        )
+
+    cov_table = _format_table(load_results(local_dir=tmp_path), scope="cov")
+
+    # The row with real cov numbers must rank above the em-dash row.
+    assert list(cov_table["Model"]) == ["WITH_SUPPORT", "NO_SUPPORT"]
+    with_support, no_support = cov_table.iloc[0], cov_table.iloc[1]
+    assert with_support["Macro F1"] == "75.0"
+    assert no_support["Macro F1"] == "—"
+    assert no_support["Languages"] == "—"
+    # Samples/s is expected to be identical in both rows: the scope does not change it.
+    assert with_support["Samples/s"] == "1234.5"
+    assert no_support["Samples/s"] == "1234.5"
+
+
+def test_scope_radio_change_swaps_table_and_legend(tmp_path: Path) -> None:
+    """Switching the scope to "cov" yields the cov table payload plus the cov legend."""
+    pytest.importorskip("gradio")
+    from commonlid.leaderboard.app import (
+        _HEADLINE_COLUMN_HELP_COV,
+        _columns_help_markdown,
+        _make_scope_handler,
+    )
+    from commonlid.leaderboard.data import load_results
+
+    _write_summary(
+        tmp_path,
+        "commonlid",
+        "M",
+        per_language=_per_language_block({"eng": (10, 10, 10, 0.01), "fra": (10, 10, 5, 0.02)}),
+        supported_languages=["eng", "fra"],
+    )
+    on_scope_change = _make_scope_handler(load_results(local_dir=tmp_path))
+    table_payload, legend = on_scope_change("cov")
+
+    expected_legend = _columns_help_markdown(_HEADLINE_COLUMN_HELP_COV)
+    assert legend == expected_legend
+    assert table_payload["headers"][1] == "Macro F1"
+    # Cell [0][1] is the first row's Macro F1; 75.0 is the cov figure, not the "all" one.
+    assert table_payload["data"][0][1] == "75.0"
diff --git a/tests/unit/test_results_io.py b/tests/unit/test_results_io.py
index 6fff8ca..d83e446 100644
--- a/tests/unit/test_results_io.py
+++ b/tests/unit/test_results_io.py
@@ -61,3 +61,33 @@ def test_write_predictions_roundtrip(tmp_path) -> None:
     write_predictions(rows, path)
     lines = [json.loads(line) for line in path.read_text().splitlines()]
     assert lines == rows
+
+
+def test_summary_supported_languages_roundtrips_list() -> None:
+    """``summary()`` reports a declared support list; an untouched result reports ``None``."""
+    declared = _result()
+    declared.supported_languages = ["eng", "fra"]
+    assert _result().summary()["supported_languages"] is None  # untouched result stays None
+    assert declared.summary()["supported_languages"] == ["eng", "fra"]
+
+
+def test_summary_supported_languages_roundtrips_none(tmp_path) -> None:
+    """``None`` means *undefined* (e.g. LLM rows) and must serialize as JSON ``null``.
+
+    Round-trip via write_summary -> load_summary guards against an accidental
+    coercion to ``[]``, which has a distinct semantic meaning (a model that
+    declared zero supported languages).
+    """
+    path = tmp_path / "sum.json"
+    write_summary(_result(), path)
+    # Parse the file (no substring match) so the check survives indent/separator changes.
+    on_disk = json.loads(path.read_text(encoding="utf-8"))
+    assert "supported_languages" in on_disk and on_disk["supported_languages"] is None
+    assert load_summary(path)["supported_languages"] is None
+
+
+def test_summary_supported_languages_preserves_empty_list() -> None:
+    """An explicitly empty support list stays ``[]`` in ``summary()``, not ``None``."""
+    empty_support = _result()
+    empty_support.supported_languages = []
+    assert empty_support.summary()["supported_languages"] == []