commoncrawl · malteos · May 22, 2026 · May 11, 2026
diff --git a/Makefile b/Makefile
@@ -13,7 +13,7 @@ PACKAGE := src/commonlid
 .DEFAULT_GOAL := help
 
 .PHONY: help venv \
-        install install-all install-afrolid install-notebooks install-leaderboard \
+        install install-all install-afrolid install-commonlingua install-notebooks install-leaderboard \
         lint format format-check typecheck \
         test test-slow test-all check \
         build clean \
@@ -24,6 +24,7 @@ help:
 	@echo "  venv                  Create a uv-managed virtualenv (.venv)"
 	@echo "  install               Sync runtime + dev extras (lint/type/test)"
 	@echo "  install-afrolid       install + the heavy [afrolid] extra (torch + transformers)"
+	@echo "  install-commonlingua  install + the [commonlingua] extra (torch only)"
 	@echo "  install-notebooks     install + the [notebooks] extra (jupyterlab + matplotlib)"
 	@echo "  install-leaderboard   install + the [leaderboard] extra (gradio)"
 	@echo "  install-all           install + every optional extra"
@@ -55,14 +56,17 @@ install:
 install-afrolid:
 	uv sync --extra dev --extra afrolid $(PYTHON_FLAG)
 
+install-commonlingua:  # dev tooling plus the [commonlingua] extra
+	uv sync --extra dev --extra commonlingua $(PYTHON_FLAG)
+
 install-notebooks:
 	uv sync --extra dev --extra notebooks $(PYTHON_FLAG)
 
 install-leaderboard:
 	uv sync --extra dev --extra leaderboard $(PYTHON_FLAG)
 
 install-all:
-	uv sync --extra dev --extra afrolid --extra notebooks --extra leaderboard $(PYTHON_FLAG)
+	uv sync --extra dev --extra afrolid --extra commonlingua --extra notebooks --extra leaderboard $(PYTHON_FLAG)
 
 lint:
 	uv run ruff check $(SRC_DIRS)

diff --git a/README.md b/README.md
@@ -39,6 +39,7 @@ From PyPI:
 pip install commonlid                      # core deps + classical LID models
 pip install "commonlid[llm]"               # + DSPy-based LLM evaluation
 pip install "commonlid[afrolid]"           # + torch/transformers for AfroLID
+pip install "commonlid[commonlingua]"      # + torch for the CommonLingua byte-level model
 pip install "commonlid[notebooks]"         # + jupyterlab + matplotlib for paper_tables.ipynb
 pip install "commonlid[all]"               # everything runtime-facing
 ```
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
 
 assert list_models() == [
     "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
-    "fasttext", "funlangid", "pyfranc",
+    "commonlingua", "fasttext", "funlangid", "pyfranc",
 ]
 assert list_datasets() == [
     "bibles_300", "bibles_300_nano",
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
 | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
 | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
 | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
+| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
 | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
 
 LLM models are instantiated dynamically (`DSPyLLMModel`) and not

diff --git a/pyproject.toml b/pyproject.toml
@@ -59,6 +59,11 @@ llm = [
     "botocore>=1.35",
 ]
 cld3 = ["cld3-py>=3.1"]
+commonlingua = [
+    # CommonLingua is a 2.35M-param byte-level model; needs torch but not the
+    # transformers stack that [afrolid] pulls in.
+    "torch>=2.4",
+]
 leaderboard = [
     # gradio 4.x imports HfFolder from huggingface_hub, which was removed in
     # huggingface-hub 1.0; gradio 5 dropped that import.
@@ -88,7 +93,7 @@ notebooks = [
     "nbclient>=0.10",
 ]
 all = [
-    "commonlid[afrolid,llm]",
+    "commonlid[afrolid,llm,commonlingua]",
 ]
 
 [project.scripts]
@@ -208,6 +213,8 @@ omit = [
     # afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
     # installed in dev and so exercised only via mocked unit tests.
     "src/commonlid/models/afrolid.py",
+    # commonlingua needs the `[commonlingua]` extra (torch); omitted for the same reason as afrolid.
+    "src/commonlid/models/commonlingua.py",
 ]
 
 [tool.coverage.report]

diff --git a/src/commonlid/models/__init__.py b/src/commonlid/models/__init__.py
@@ -11,6 +11,7 @@
 from commonlid.models import afrolid as _afrolid  # noqa: F401
 from commonlid.models import cld2 as _cld2  # noqa: F401
 from commonlid.models import cld3 as _cld3  # noqa: F401
+from commonlid.models import commonlingua as _commonlingua  # noqa: F401
 from commonlid.models import fasttext_ft as _fasttext_ft  # noqa: F401
 from commonlid.models import funlangid as _funlangid  # noqa: F401
 from commonlid.models import glotlid as _glotlid  # noqa: F401

diff --git a/src/commonlid/models/commonlingua.py b/src/commonlid/models/commonlingua.py
@@ -0,0 +1,158 @@
+"""CommonLingua: PleIAs' byte-level LID model (PleIAs/CommonLingua).
+
+Requires the ``commonlid[commonlingua]`` extra (torch). The checkpoint
+embeds its own ``lang2idx`` map, so no separate metadata file is fetched.
+Device selection mirrors AfroLID: MPS > CUDA > CPU.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from commonlid.core.lid_model import LIDModel
+from commonlid.core.registry import register_model
+
+if TYPE_CHECKING:
+    import torch
+
+
+@register_model
+class CommonLinguaModel(LIDModel):
+    """LID wrapper around the ``PleIAs/CommonLingua`` checkpoint.
+
+    The checkpoint is fetched with ``hf_hub_download`` on the first ``load()``;
+    texts are fed to the network as raw UTF-8 byte ids.
+    """
+
+    model_id = "commonlingua"
+    # Byte-level model: casing carries strong language signal, so we feed
+    # raw UTF-8 and skip the OpenLID normer (which lowercases everything).
+    requires_preprocessing: ClassVar[bool] = False
+
+    _REPO_ID: ClassVar[str] = "PleIAs/CommonLingua"
+    _CHECKPOINT_FILENAME: ClassVar[str] = "model.pt"
+    # Maximum number of texts sent through the network per forward pass.
+    _INTERNAL_BATCH: ClassVar[int] = 256
+    # Fill value for positions past the end of a text. 256 lies just outside
+    # the 0-255 byte range; presumably the network's padding id -- TODO confirm
+    # against the vendored ``ByteHybrid``.
+    _PAD_ID: ClassVar[int] = 256
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._model: Any = None
+        self._idx2lang: dict[int, str] | None = None
+        self._max_len: int | None = None
+        self._device: str | None = None
+
+    def load(self) -> None:
+        """Fetch the checkpoint, rebuild the network and move it to a device.
+
+        Returns immediately when the model is already loaded.
+
+        Raises:
+            ImportError: if ``torch`` cannot be imported.
+        """
+        if self._loaded:
+            return
+        try:
+            import torch
+        except ImportError as exc:
+            msg = "CommonLingua requires torch. Install with: pip install 'commonlid[commonlingua]'"
+            raise ImportError(msg) from exc
+
+        from huggingface_hub import hf_hub_download
+
+        from commonlid.vendor.commonlingua.model import CONFIGS, ByteHybrid
+
+        ckpt_path = hf_hub_download(repo_id=self._REPO_ID, filename=self._CHECKPOINT_FILENAME)
+        # SECURITY: ``weights_only=False`` enables full unpickling, so a tampered
+        # checkpoint could execute arbitrary code here. Left unchanged because
+        # it is not visible from this file whether the checkpoint loads under
+        # ``weights_only=True`` -- TODO(review): verify and switch, or pin a
+        # trusted ``revision`` in ``hf_hub_download``.
+        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+
+        # Device preference: MPS, then CUDA, then CPU.
+        if torch.backends.mps.is_available():
+            device = "mps"
+        elif torch.cuda.is_available():
+            device = "cuda"
+        else:
+            device = "cpu"
+
+        model = ByteHybrid(  # type: ignore[no-untyped-call]
+            num_classes=ckpt["num_classes"],
+            max_len=ckpt["max_len"],
+            **CONFIGS[ckpt["config"]],
+        )
+        model.load_state_dict(ckpt["model_state_dict"])
+        model.eval().to(device)
+
+        self._model = model
+        # Invert label -> index into index -> label. Keys are coerced to plain
+        # ``int`` so the ``int(i)`` lookups in ``_predict_batch`` always match,
+        # whatever integer-like type the checkpoint stored.
+        self._idx2lang = {int(idx): lang for lang, idx in ckpt["lang2idx"].items()}
+        self._max_len = int(ckpt["max_len"])
+        self._device = device
+        super().load()
+
+    def _encode(self, texts: Sequence[str]) -> torch.Tensor:
+        """Pack ``texts`` into an int64 tensor of shape ``(len(texts), max_len)``.
+
+        Each row holds the UTF-8 bytes of one text, truncated to ``max_len``
+        bytes; the remaining positions keep ``_PAD_ID``.
+        """
+        import numpy as np
+        import torch
+
+        assert self._max_len is not None
+        out = np.full((len(texts), self._max_len), self._PAD_ID, dtype=np.int64)
+        for i, t in enumerate(texts):
+            # Truncation is byte-wise, so a multi-byte character straddling the
+            # limit is cut mid-sequence.
+            raw = t.encode("utf-8", errors="replace")[: self._max_len]
+            if raw:
+                out[i, : len(raw)] = np.frombuffer(raw, dtype=np.uint8)
+        return torch.from_numpy(out)
+
+    def _predict_batch(self, texts: Sequence[str]) -> list[str | None]:
+        """Return the arg-max label for each text, loading the model on demand.
+
+        Texts are processed in slices of ``_INTERNAL_BATCH``; results are in
+        input order.
+        """
+        import torch
+
+        if not self._loaded:
+            self.load()
+        assert self._model is not None
+        assert self._idx2lang is not None
+        assert self._device is not None
+
+        results: list[str | None] = []
+        for start in range(0, len(texts), self._INTERNAL_BATCH):
+            chunk = list(texts[start : start + self._INTERNAL_BATCH])
+            batch = self._encode(chunk).to(self._device)
+            with torch.no_grad():
+                logits = self._model(batch)
+                pred_idx = logits.argmax(dim=-1).cpu().tolist()
+            results.extend(self._idx2lang[int(i)] for i in pred_idx)
+        return results
+
+    def discover_supported_languages(self) -> frozenset[str]:
+        """Return the conformed code for every label in the ``lang2idx`` map.
+
+        Labels for which ``_conform`` returns ``None`` are left out.
+        """
+        if not self._loaded:
+            self.load()
+        assert self._idx2lang is not None
+        codes: set[str] = set()
+        for code in self._idx2lang.values():
+            conformed = self._conform(code)
+            if conformed is not None:
+                codes.add(conformed)
+        return frozenset(codes)
diff --git a/src/commonlid/vendor/commonlingua/__init__.py b/src/commonlid/vendor/commonlingua/__init__.py
@@ -0,0 +1,4 @@
+"""Vendored PleIAs/CommonLingua architecture (Apache 2.0).
+
+Source: https://huggingface.co/PleIAs/CommonLingua
+"""