Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ PACKAGE := src/commonlid
.DEFAULT_GOAL := help

.PHONY: help venv \
install install-all install-afrolid install-notebooks install-leaderboard \
install install-all install-afrolid install-commonlingua install-notebooks install-leaderboard \
lint format format-check typecheck \
test test-slow test-all check \
build clean \
Expand All @@ -24,6 +24,7 @@ help:
@echo " venv Create a uv-managed virtualenv (.venv)"
@echo " install Sync runtime + dev extras (lint/type/test)"
@echo " install-afrolid install + the heavy [afrolid] extra (torch + transformers)"
@echo " install-commonlingua install + the [commonlingua] extra (torch only)"
@echo " install-notebooks install + the [notebooks] extra (jupyterlab + matplotlib)"
@echo " install-leaderboard install + the [leaderboard] extra (gradio)"
@echo " install-all install + every optional extra"
Expand Down Expand Up @@ -55,14 +56,17 @@ install:
install-afrolid:
uv sync --extra dev --extra afrolid $(PYTHON_FLAG)

install-commonlingua:
uv sync --extra dev --extra commonlingua $(PYTHON_FLAG)

install-notebooks:
uv sync --extra dev --extra notebooks $(PYTHON_FLAG)

install-leaderboard:
uv sync --extra dev --extra leaderboard $(PYTHON_FLAG)

install-all:
uv sync --extra dev --extra afrolid --extra notebooks --extra leaderboard $(PYTHON_FLAG)
uv sync --extra dev --extra afrolid --extra commonlingua --extra notebooks --extra leaderboard $(PYTHON_FLAG)

lint:
uv run ruff check $(SRC_DIRS)
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ From PyPI:
pip install commonlid # core deps + classical LID models
pip install "commonlid[llm]" # + DSPy-based LLM evaluation
pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
pip install "commonlid[all]" # everything runtime-facing
```
Expand Down Expand Up @@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets

assert list_models() == [
"AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
"fasttext", "funlangid", "pyfranc",
"commonlingua", "fasttext", "funlangid", "pyfranc",
]
assert list_datasets() == [
"bibles_300", "bibles_300_nano",
Expand Down Expand Up @@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
| `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
| `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
| `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
| `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |

LLM models are instantiated dynamically (`DSPyLLMModel`) and not
Expand Down
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ llm = [
"botocore>=1.35",
]
cld3 = ["cld3-py>=3.1"]
commonlingua = [
# CommonLingua is a 2.35M-param byte-level model; needs torch but not the
# transformers stack that [afrolid] pulls in.
"torch>=2.4",
]
leaderboard = [
# gradio 4.x imports HfFolder from huggingface_hub, which was removed in
# huggingface-hub 1.0; gradio 5 dropped that import.
Expand Down Expand Up @@ -88,7 +93,7 @@ notebooks = [
"nbclient>=0.10",
]
all = [
"commonlid[afrolid,llm]",
"commonlid[afrolid,llm,commonlingua]",
]

[project.scripts]
Expand Down Expand Up @@ -208,6 +213,8 @@ omit = [
# afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
# installed in dev and so exercised only via mocked unit tests.
"src/commonlid/models/afrolid.py",
# commonlingua needs the `[commonlingua]` extra (torch); omitted from coverage for the same reason as afrolid above.
"src/commonlid/models/commonlingua.py",
]

[tool.coverage.report]
Expand Down
1 change: 1 addition & 0 deletions src/commonlid/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from commonlid.models import afrolid as _afrolid # noqa: F401
from commonlid.models import cld2 as _cld2 # noqa: F401
from commonlid.models import cld3 as _cld3 # noqa: F401
from commonlid.models import commonlingua as _commonlingua # noqa: F401
from commonlid.models import fasttext_ft as _fasttext_ft # noqa: F401
from commonlid.models import funlangid as _funlangid # noqa: F401
from commonlid.models import glotlid as _glotlid # noqa: F401
Expand Down
115 changes: 115 additions & 0 deletions src/commonlid/models/commonlingua.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""CommonLingua: PleIAs' byte-level LID model (PleIAs/CommonLingua).

Requires the ``commonlid[commonlingua]`` extra (torch). The checkpoint
embeds its own ``lang2idx`` map, so no separate metadata file is fetched.
Device selection mirrors AfroLID: MPS > CUDA > CPU.
"""

from __future__ import annotations

from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, ClassVar

from commonlid.core.lid_model import LIDModel
from commonlid.core.registry import register_model

if TYPE_CHECKING:
import torch


@register_model
class CommonLinguaModel(LIDModel):
    """Language-identification wrapper around the PleIAs/CommonLingua checkpoint.

    The checkpoint is fetched from the Hugging Face Hub on first use and run
    through the vendored ``ByteHybrid`` architecture. Inputs are fed as raw
    UTF-8 bytes, truncated/padded to the ``max_len`` stored in the checkpoint.
    Labels come from the ``lang2idx`` map embedded in the checkpoint.
    """

    model_id = "commonlingua"
    # Byte-level model: casing carries strong language signal, so we feed
    # raw UTF-8 and skip the OpenLID normer (which lowercases everything).
    requires_preprocessing: ClassVar[bool] = False

    _REPO_ID: ClassVar[str] = "PleIAs/CommonLingua"
    _CHECKPOINT_FILENAME: ClassVar[str] = "model.pt"
    # Number of texts pushed through the network per forward pass.
    _INTERNAL_BATCH: ClassVar[int] = 256
    # Fill value for positions past the end of a text. 256 sits just outside
    # the 0-255 byte range, so it can never collide with a real input byte.
    # NOTE(review): presumably the padding id the vendored ByteHybrid
    # expects -- confirm against commonlid.vendor.commonlingua.model.
    _PAD_ID: ClassVar[int] = 256

    def __init__(self) -> None:
        """Create an unloaded model; all heavy state is populated by ``load``."""
        super().__init__()
        self._model: Any = None
        self._idx2lang: dict[int, str] | None = None
        self._max_len: int | None = None
        self._device: str | None = None

    def load(self) -> None:
        """Download the checkpoint, build the network and move it to a device.

        Returns immediately when ``self._loaded`` is already set. Device
        preference is MPS, then CUDA, then CPU.

        Raises:
            ImportError: if ``torch`` is not installed.
        """
        if self._loaded:
            return
        try:
            import torch
        except ImportError as exc:
            msg = "CommonLingua requires torch. Install with: pip install 'commonlid[commonlingua]'"
            raise ImportError(msg) from exc

        from huggingface_hub import hf_hub_download

        from commonlid.vendor.commonlingua.model import CONFIGS, ByteHybrid

        ckpt_path = hf_hub_download(repo_id=self._REPO_ID, filename=self._CHECKPOINT_FILENAME)
        # SECURITY NOTE(review): weights_only=False lets torch.load run the
        # full pickle machinery, which can execute arbitrary code embedded in
        # the file. The checkpoint is downloaded without a pinned revision, so
        # it is only as trustworthy as the upstream repository at download
        # time. Consider passing `revision=` to hf_hub_download and switching
        # to weights_only=True once the checkpoint is confirmed to contain
        # only tensors and plain Python containers.
        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

        if torch.backends.mps.is_available():
            device = "mps"
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        # Architecture hyper-parameters are looked up by the config name
        # stored in the checkpoint; class count and max length come from the
        # checkpoint itself.
        model = ByteHybrid(  # type: ignore[no-untyped-call]
            num_classes=ckpt["num_classes"],
            max_len=ckpt["max_len"],
            **CONFIGS[ckpt["config"]],
        )
        model.load_state_dict(ckpt["model_state_dict"])
        model.eval().to(device)

        self._model = model
        # Invert label -> index so predictions can be mapped back to labels.
        self._idx2lang = {v: k for k, v in ckpt["lang2idx"].items()}
        self._max_len = int(ckpt["max_len"])
        self._device = device
        super().load()

    def _encode(self, texts: Sequence[str]) -> torch.Tensor:
        """Turn texts into a padded ``(len(texts), max_len)`` int64 byte tensor.

        Each text is UTF-8 encoded (unencodable code points are replaced) and
        cut to ``max_len`` bytes; the cut is byte-wise, so it may fall inside
        a multi-byte character. Unused trailing positions, and entire rows for
        empty texts, hold ``_PAD_ID``.

        Requires ``load`` to have populated ``_max_len``.
        """
        import numpy as np
        import torch

        assert self._max_len is not None
        out = np.full((len(texts), self._max_len), self._PAD_ID, dtype=np.int64)
        for i, t in enumerate(texts):
            raw = t.encode("utf-8", errors="replace")[: self._max_len]
            if raw:
                out[i, : len(raw)] = np.frombuffer(raw, dtype=np.uint8)
        return torch.from_numpy(out)

    def _predict_batch(self, texts: Sequence[str]) -> list[str | None]:
        """Predict one label per text, loading the model on first use.

        Texts are processed in chunks of ``_INTERNAL_BATCH``. Each result is
        the checkpoint label at the arg-max logit, returned exactly as stored
        in the checkpoint's ``lang2idx`` map. An empty ``texts`` yields an
        empty list.
        """
        import torch

        if not self._loaded:
            self.load()
        assert self._idx2lang is not None
        assert self._device is not None

        results: list[str | None] = []
        for start in range(0, len(texts), self._INTERNAL_BATCH):
            chunk = list(texts[start : start + self._INTERNAL_BATCH])
            batch = self._encode(chunk).to(self._device)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                logits = self._model(batch)
            pred_idx = logits.argmax(dim=-1).cpu().tolist()
            results.extend(self._idx2lang[int(i)] for i in pred_idx)
        return results

    def discover_supported_languages(self) -> frozenset[str]:
        """Return every ISO 639-3 code in the model's ``lang2idx`` map.

        Loads the model on first use. Each label is passed through
        ``self._conform``; labels for which it returns ``None`` are dropped.
        """
        if not self._loaded:
            self.load()
        assert self._idx2lang is not None
        codes: set[str] = set()
        for code in self._idx2lang.values():
            conformed = self._conform(code)
            if conformed is not None:
                codes.add(conformed)
        return frozenset(codes)
4 changes: 4 additions & 0 deletions src/commonlid/vendor/commonlingua/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Vendored PleIAs/CommonLingua architecture (Apache 2.0).

Source: https://huggingface.co/PleIAs/CommonLingua
"""
Loading
Loading