diff --git a/docs/migrating-to-v2.md b/docs/migrating-to-v2.md new file mode 100644 index 000000000..c72208a9c --- /dev/null +++ b/docs/migrating-to-v2.md @@ -0,0 +1,1041 @@ +# MLT v2 Wire Format Migration Guide + +This document provides a detailed, side-by-side comparison of every structural element +between MLT v1 and v2. It is the authoritative reference for implementing v2 encoders +and decoders. + +--- + +## Guiding Principles + +| Principle | v1 | v2 | +|---|---|---| +| Feature count | Not stored; implied by geometry stream `num_values` | `feature_count` varint in every layer header | +| Column layout | Separate metadata section then data section | Geometry section first, then counted columns encoded as type + config + data | +| Stream type identification | Explicit `stream_type` byte on every stream | Eliminated; role implied by `geometry_layout` or by column type plus stream position | +| Count per stream | Always stored as varint | Omitted when equal to `feature_count` or `popcount(presence_bitfield)`; required only when neither applies | +| Byte length per stream | Always stored as varint | Encoded in `physical` field: `None-noLen` omits it (derivable as count × width); `None-withLen`, `VarInt`, `FastPFor128` always carry it | +| Presence streams | Bool-RLE stream with 4-field header | Raw packed bitfield, no header; sharable across columns via index reference | +| RLE auxiliary fields | `runs` + `num_rle_values` as varints | Eliminated; both derivable at decode time | +| String encoding variant | `Str`/`OptStr` type + runtime `stream_count` | Encoded in column type code; 8 flat types | +| Dict null encoding | Optional presence stream controls which features are present | Null-at-index-0 in the indices stream; no presence bitfield | +| FastPFor byte order | Big-endian u32 | Little-endian u32 | +| FastPFor block size | 256 | 128 | + +> **Note on lazy / skip-ahead parsing.** A reader that only needs a subset of +> columns must be able to advance past unwanted streams without decoding them. +> `physical=None-noLen` streams are always skippable: `byte_length = num_values +> × element_width`, where both are known at header-parse time. `physical=VarInt` +> and `physical=FastPFor128` streams always carry an explicit `byte_length` (it +> is part of their physical encoding value), so they are also skippable. +> `has_explicit_count` is **not** needed for skipping: `popcount` of a presence +> bitfield is a fast bit-counting operation. `has_explicit_count = 1` is only +> required when the count is genuinely not derivable from `feature_count` or +> `popcount(presence_bitfield)` — for example, geometry vertex counts and string +> character/offset counts. + +--- + +## Layer Envelope + +### v1 + +`feature_count` is not stored. It is implied by the length of the geometry types stream. +Metadata and data are completely separated; the metadata section must be fully parsed +before any column data can be located. + +``` +[varint body_len+1] +[u8 tag = 1] +[varint name_len] [name bytes] +[varint extent] +[varint column_count] +── metadata section ────────────────────────────── +[u8 col_type₀] [optional: varint name_len + bytes] +[u8 col_type₁] [optional: varint name_len + bytes] +... +[u8 col_typeN] [optional: varint name_len + bytes] +── data section ────────────────────────────────── +[streams for column 0] +[streams for column 1] +... +[streams for column N] +``` + +### v2 + +``` +[varint body_len+1] +[u8 tag = 1] +[varint name_len] [name bytes] +[varint extent] +[varint feature_count] ← NEW +── geometry section ────────────────────────────── +[u8 geometry_layout] ← NEW; one of the geometry layouts below +[geometry streams...] +── columns (merged meta + data) ────────────────── +[varint column_count] ← counts only non-geometry top-level columns +[column 0: type + name? + presence? + data] +[column 1: type + name? + presence? + data] +... +[column N: type + name? + presence? + data] +``` + +**What changes:** +- `feature_count` is inserted after `extent`. +- The geometry section now begins with `geometry_layout`, followed by the geometry + streams it declares. +- The geometry section appears before the counted columns in the layer body. +- The metadata section is eliminated. Each column is fully self-describing; its type + byte, optional name, optional presence section, and stream data appear contiguously. +- `column_count` now appears immediately before the counted columns and covers only + ids, scalars, strings, and shared-dict columns. + +**Derived invariants unlocked by `feature_count`:** + +> **Popcount** means "count the number of bits set to `1` in the presence bitfield." +> If a presence bitfield has `feature_count` bits and K of them are `1`, then K features +> have a value and the optional data stream that follows contains exactly K encoded values. +> This document uses `popcount(bitfield)` as shorthand for that count. + +| Stream | Count in v1 | Count in v2 | +|---|---|---| +| Geometry types | `num_values` varint | = `feature_count` | +| Non-optional scalar data | `num_values` varint | = `feature_count` | +| Non-optional ID data | `num_values` varint | = `feature_count` | +| Presence bitfield size | stored in header | = `ceil(feature_count / 8)` bytes | +| Optional data stream | `num_values` varint | = `popcount(presence_bitfield)` — number of `1` bits in the bitfield | + +--- + +## Column Layout (Meta + Data Merge) + +Every column — whether top-level or a SharedDict child — follows one of three templates +depending on its column type: + +**Non-optional** (e.g. `I32`): +``` +[u8 column_type] +[varint name_len] [name bytes] ← only when column_type.has_name() == true +[data section] +``` + +**With own presence** (e.g. `OptI32`): own bitfield is declared inline, no prefix varint: +``` +[u8 column_type] +[varint name_len] [name bytes] +[ceil(feature_count/8) bytes: raw packed bitfield, LSB-first per byte] + ← this bitfield is registered as the next presence group (0-based, sequential) +[data section] +``` + +**With shared presence** (e.g. `OptRefI32`): references a previously declared bitfield: +``` +[u8 column_type] +[varint name_len] [name bytes] +[varint presence_group] ← 0-based index of a prior column's bitfield +[data section] +``` + +`has_name()` is false for `Id`, `OptId`, `LongId`, and `OptLongId` (same rule as v1). + +### v1 split vs. v2 merged — concrete example + +**v1** (two-pass layout, 3 columns: Geometry, Id, OptI32): + +``` +[varint 3] ← column_count +[0x04] ← Geometry type +[0x00] ← Id type +[0x11] ← OptI32 type +[varint name_len] "val" ← Id has no name; OptI32 name +[geometry streams...] +[id streams...] +[presence stream for OptI32] +[i32 streams...] +``` + +**v2** (one-pass layout; geometry moved out of `column_count`): + +``` +[varint feature_count] +── geometry section ── +[0x04] ← geometry_layout +[geometry streams...] +── columns ──────────── +[varint 2] ← column_count (Id, OptI32 only) +[0x00] ← Id column_type +[enc_byte] [data...] +[0x11] ← OptI32 column_type (own presence — no prefix varint) +[varint name_len] "val" +[bitfield bytes...] ← ceil(feature_count/8) bytes; registered as presence group 0 +[enc_byte] [data...] +``` + +--- + +## Encoding Byte + +Every integer stream in v2 is prefixed by exactly one **encoding byte**, whose layout +replaces the two v1 bytes (stream_type + encoding) for streams where the type is implied. + +### v1 stream header (4–14 bytes per stream) + +``` +[u8] stream_type + high nibble: category (0=Present, 1=Data, 2=Offset, 3=Length) + low nibble: subtype (DictionaryType / OffsetType / LengthType) +[u8] encoding + bits 7-5: logical1 (0=None, 1=Delta, 2=CwDelta, 3=Rle, 4=Morton, 5=PseudoDecimal) + bits 4-2: logical2 (secondary; 0=None, 1=Delta, 3=Rle — used only with Morton) + bits 1-0: physical (0=None, 1=FastPFor256, 2=VarInt) +[varint] num_values ← always present +[varint] byte_length ← always present +[varint] runs ← RLE streams only +[varint] num_rle_values ← RLE streams only +[varint] bits ← Morton streams only +[varint] shift ← Morton streams only +``` + +### v2 encoding byte (1–6 bytes per stream) + +The byte has no separate flag bits for byte_length or extension presence — both are +implied by the `logical` and `physical` fields, eliminating invalid bit combinations. + +``` +[u8] encoding_byte + bit 7: has_explicit_count (0 = use context count, 1 = varint follows) + bits 6-4: logical + 0 = None + 1 = Delta + 2 = CwDelta (ComponentwiseDelta) + 3 = Rle → physical implied = VarInt; byte_length always follows; + bits 3-2 reserved (must be 00) + 4 = DeltaRle → physical implied = VarInt; byte_length always follows; + bits 3-2 reserved (must be 00) + 5 = Morton → extension byte always follows; + bits 3-2 are physical (all four values valid) + 6 = PseudoDecimal + 7 = reserved + bits 3-2: physical (meaning when logical ∈ {None, Delta, CwDelta, Morton, PseudoDecimal}) + 0 = None-noLen raw fixed-width, byte_length omitted + (derivable: byte_length = num_values × element_width) + 1 = None-withLen raw fixed-width, varint byte_length follows + 2 = VarInt zigzag/varint encoded, varint byte_length follows + 3 = FastPFor128 block-SIMD compressed, varint byte_length follows + bits 1-0: reserved (must be 0) + +[optional: varint num_values] ← present when has_explicit_count = 1 +[optional: varint byte_length] ← present when physical ∈ {None-withLen, VarInt, FastPFor128}, + or when logical ∈ {Rle, DeltaRle} +[optional: extension_byte] ← present when logical = Morton (always) + bits 7-6: Morton sub-variant (00=None, 01=Rle, 10=Delta, 11=reserved) + bits 5-0: reserved (must be 0) +[optional: varint bits] ← present when logical = Morton +[optional: varint shift] ← present when logical = Morton +``` + +### Byte-length rules + +byte_length presence is encoded directly in the `physical` field — there is no +separate flag bit: + +| `physical` value | byte_length | Rationale | +|---|---|---| +| `None-noLen` (0) | omitted | `byte_length = num_values × element_width`; both known at header time | +| `None-withLen` (1) | varint follows | Use when explicit length is preferred (e.g. compression later) | +| `VarInt` (2) | varint follows | Cannot skip a VarInt stream without knowing its byte span | +| `FastPFor128` (3) | varint follows | Not self-delimiting | +| `Rle` / `DeltaRle` logical | varint follows | VarInt physical implied; same skippability requirement | + +### Count context rules + +`has_explicit_count = 0` is valid when the count equals `feature_count` or equals +`popcount(presence_bitfield)`. `popcount` is a fast bit-counting operation, so +recomputing it on the fly is cheaper than storing an extra varint. `has_explicit_count += 1` is required only when neither source applies. + +| Stream position | Implicit count (when `has_explicit_count = 0`) | +|---|---| +| Geometry types (first geometry stream) | `feature_count` | +| Non-optional scalar / ID data | `feature_count` | +| Dict index stream (`StrDict`, `StrFsstDict`, child refs) | `feature_count` (all features have an index) | +| Data stream after own presence bitfield | `popcount(presence_bitfield)` | +| Data stream with `presence_ref > 0` | `popcount(referenced_presence_bitfield)` | +| Geometry vertex / aux streams | must use `has_explicit_count = 1` | +| String character / offset / dict-size streams | must use `has_explicit_count = 1` | + +### Changes from v1 + +| Field | v1 | v2 | +|---|---|---| +| `stream_type` byte | Always present (1 byte) | Eliminated (role implied by position) | +| `encoding` byte | Always present (1 byte) | Replaced by `encoding_byte` | +| `has_byte_length` flag | Part of encoding byte | Eliminated; byte_length presence encoded in `physical` field | +| `has_extension` flag | Part of encoding byte | Eliminated; extension byte implied by `logical=Morton` | +| `num_values` | Always present (1–5 bytes) | Conditional: omitted when derivable | +| `byte_length` | Always present (1–5 bytes) | Omitted for `physical=None-noLen`; always present otherwise | +| `runs` (RLE) | Present for RLE (1–5 bytes) | Eliminated | +| `num_rle_values` (RLE) | Present for RLE (1–5 bytes) | Eliminated | +| `bits`/`shift` (Morton) | Present for Morton (2–10 bytes) | In extension byte + varints; extension always present when `logical=Morton` | + +--- + +## Presence Streams + +### v1 presence stream + +Optional columns prepend a bool-RLE stream with a full stream header: + +``` +[u8 stream_type = 0x00] ← Present category, subtype 0 +[u8 encoding] + bits 7-5: logical1 = Rle (3) + bits 4-2: logical2 = None (0) + bits 1-0: physical = None (0) ← or VarInt +[varint num_values] ← = feature_count +[varint byte_length] +[varint runs] ← RLE pair count +[varint num_rle_values] ← = feature_count +[RLE-packed bits...] +``` + +The stream uses bool-RLE encoding: pairs of `(run_length, value)` where value is 0 +(absent) or 1 (present). Header overhead: 4–14 bytes before the data. + +### v2 presence section + +Presence is encoded in the **column type** itself, not in a prefix varint, giving three +distinct variants for any nullable type: + +**`Opt*` (own presence)** — bitfield immediately follows the optional name; no leading varint: +``` +[ceil(feature_count/8) bytes: raw packed bits, LSB-first within each byte] +← registered as the next presence group (0-based sequential index) +← num_values for the data stream = popcount(this bitfield) +``` + +**`OptRef*`** — references a prior column's already-declared bitfield: +``` +[varint presence_group] ← 0-based index of the group to reuse +← no bitfield bytes follow +← num_values for the data stream = popcount(referenced bitfield) +``` + +**What changes:** + +| Aspect | v1 | v2 | +|---|---|---| +| Encoding | Bool-RLE pairs | Raw packed bitfield (1 bit per feature) | +| Header overhead | 4–14 bytes | **0 bytes** for `Opt*` (bitfield directly follows column type); 1 byte (varint) for `OptRef*` | +| Size known in advance | No (stored in `byte_length`) | Yes: always `ceil(feature_count / 8)` bytes | +| Sharing identical bitfields | Not possible | `OptRef*` type with group index; bitfield stored once | +| Null count for data stream | Stored in `num_rle_values` | `popcount(bitfield)`, computed at decode time | +| "Own bitfield" marker | n/a | Encoded in column type — no `presence_ref = 0` byte needed | + +**Presence group numbering** is layer-wide and sequential. Every `Opt*` column +(top-level or SharedDict child) increments the counter when its bitfield is parsed. + +**Dict variants** (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) do **not** +have a presence section; they encode null via index 0 in the indices stream and therefore +have no corresponding `OptRef*` variant either. + +--- + +## Scalar and ID Columns + +Covers: `Bool`, `I8`, `U8`, `I32`, `U32`, `I64`, `U64`, `F32`, `F64`, `Id`, `LongId` +and their `Opt*` counterparts. + +### v1 layout + +``` +── metadata section (written before all column data) ── +[u8 column_type] +[optional: varint name_len + bytes] + +── data section ── +[optional presence stream] ← full stream header + bool-RLE data +[stream_type byte] ← always 0x10 (Data, DictionaryType::None) +[encoding byte] +[varint num_values] ← = feature_count (non-optional) or number of 1-bits in presence (optional) +[varint byte_length] +[encoded data bytes] +``` + +### v2 layout + +Non-optional: +``` +[u8 column_type] ← e.g. I32 +[optional: varint name_len + bytes] +[encoding_byte] +[optional: varint num_values] ← only when has_explicit_count = 1 +[optional: varint byte_length] ← present when physical ∈ {None-withLen, VarInt, FastPFor128} +[encoded data bytes] +``` + +Optional — own presence (`Opt*`): +``` +[u8 column_type] ← e.g. OptI32 +[optional: varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint +[encoding_byte] [optional count/size] [data bytes] +``` + +Optional — shared presence (`OptRef*`): +``` +[u8 column_type] ← e.g. I32SharedPresence +[optional: varint name_len + bytes] +[varint presence_group] ← 0-based group index +[encoding_byte] [optional count/size] [data bytes] +``` + +**What changes per column:** + +| | v1 bytes | v2 bytes | +|---|---|---| +| Column type in metadata section | 1 (in meta pass) | 1 (inline) | +| Name in metadata section | 0–33 (in meta pass) | 0–33 (inline) | +| Presence stream header | 4–14 | 0 (`Opt*` — bitfield directly follows) / 1 (`OptRef*` — group ref varint) | +| Presence data | RLE packed | Raw packed bits | +| `stream_type` byte | 1 | 0 (eliminated) | +| `num_values` | 1–5 | 0 (derivable from `feature_count` or presence popcount) | +| `byte_length` | 1–5 | 0 for `None-noLen`; 1–5 for `None-withLen`, VarInt, FastPFor128 | + +**Element widths** used for `physical=None-noLen` byte-length derivation: + +| Column types | Element width | +|---|---| +| `Bool`, `I8`, `U8` | 1 byte | +| `I32`, `U32`, `F32` | 4 bytes | +| `I64`, `U64`, `F64`, `Id`, `LongId` | 8 bytes | + +--- + +## Geometry Section + +### v1 layout + +``` +── metadata section ── +[u8 column_type = 0x04] ← Geometry; no name written + +── data section ── +[varint stream_count] ← number of geometry streams that follow +[stream 0: types] + [u8 stream_type = 0x10] ← Data(None) + [u8 encoding] + [varint num_values] ← = feature_count + [varint byte_length] + [geometry type data] +[stream 1..N: auxiliary streams, each with full header] + [u8 stream_type] ← identifies role: Data(Vertex/Morton), + Offset(Vertex/Index), Length(Geometries/Parts/Rings/Triangles) + [u8 encoding] + [varint num_values] + [varint byte_length] + [stream data] +``` + +Stream presence is implicit: `stream_count` streams follow, each identified by its +`stream_type` byte. + +### v2 layout + +Geometry is no longer encoded as a regular column type. Instead, v2 starts with a +geometry section. That section begins with a single `geometry_layout` byte, which +selects one of the named layouts below and unambiguously specifies which geometry +streams are present and in what order. After the geometry section, `column_count` +and the regular columns follow. Geometry does not contribute to that count. + +``` +[u8 geometry_layout] ← first byte of the geometry section +[geometry streams in fixed order determined by geometry_layout; see table below] +``` + +Every stream: `[encoding_byte] [optional: varint num_values] [optional: varint byte_length] [data]` + +| Geometry layout | Streams (in order, all `has_explicit_count = 1` except Types) | +|---|---| +| `GeoPoints` | Types¹, Vertices | +| `GeoPointsDict` | Types¹, VertexData (dict), VertexOffsets | +| `GeoMultiPoints` | Types¹, GeoLengths, Vertices | +| `GeoMultiPointsDict` | Types¹, GeoLengths, VertexData (dict), VertexOffsets | +| `GeoLines` | Types¹, PartLengths, Vertices | +| `GeoLinesDict` | Types¹, PartLengths, VertexData (dict), VertexOffsets | +| `GeoMultiLines` | Types¹, GeoLengths, PartLengths, Vertices | +| `GeoMultiLinesDict` | Types¹, GeoLengths, PartLengths, VertexData (dict), VertexOffsets | +| `GeoPolygons` | Types¹, PartLengths, RingLengths, Vertices | +| `GeoPolygonsDict` | Types¹, PartLengths, RingLengths, VertexData (dict), VertexOffsets | +| `GeoMultiPolygons` | Types¹, GeoLengths, PartLengths, RingLengths, Vertices | +| `GeoMultiPolygonsDict` | Types¹, GeoLengths, PartLengths, RingLengths, VertexData (dict), VertexOffsets | +| `GeoTessPolygons` | Types¹, TriLengths, IndexBuffer, Vertices | +| `GeoTessPolygonsWithOutlines` | Types¹, GeoLengths, PartLengths, RingLengths, TriLengths, IndexBuffer, Vertices | + +¹ Types stream: `has_explicit_count = 0` (count = `feature_count`) + +This spec intentionally does **not** include the extra tessellated-with-rings layouts, for example, cases with ring topology but missing or +degenerate tessellation payloads. Here, those cases fall back to the corresponding +non-tessellated polygon layout (`GeoPolygons*` / `GeoMultiPolygons*` as applicable), +and no tessellation streams (`TriLengths`, `IndexBuffer`) are written. + +**Dict columns** store a deduplicated vertex dictionary in VertexData and per-vertex +indices in VertexOffsets. The vertex encoding (plain delta, CwDelta, Morton) is +specified by the `logical` field in the VertexData encoding byte. + +**Mixed-type layers** use the layout whose stream set is a superset of all geometry +types present: +- Point + LineString → `GeoLines` (Points consume no PartLengths entry) +- LineString + Polygon → `GeoPolygons` (LineString vertex counts stored in RingLengths) +- Polygon + MultiPolygon → `GeoMultiPolygons` + +**What changes:** + +| Aspect | v1 | v2 | +|---|---|---| +| Stream presence declaration | `varint stream_count` + `stream_type` per stream | Encoded in `geometry_layout` | +| `stream_count` varint | 1–5 bytes | Eliminated | +| Geometry selector | Geometry is a regular column (`column_type = 0x04`) | First byte of the geometry section; outside `column_count` | +| `stream_type` per stream | 1 byte × N streams | Eliminated | +| `num_values` for types stream | varint (= feature_count) | Omitted | +| Vertex encoding style | Read from `stream_type` subtype | Read from `logical` field in encoding byte | + +--- + +## String Columns + +### Overview of changes + +| | v1 | v2 | +|---|---|---| +| Column types | `Str (28)`, `OptStr (29)` | 8 explicit types (28–35) | +| Encoding variant declaration | `varint stream_count` (2–5 determines variant) | Encoded in column type code | +| `stream_count` varint | 1–5 bytes | Eliminated | +| `stream_type` per stream | 1 byte × N streams | Eliminated (position-implied) | +| Null encoding for dict variants | Presence bitfield (separate stream) | Null-at-index-0 in indices stream | +| Presence sharing | Not possible | Dict variants: none; plain variants: shared presence group | + +--- + +### 8.1 StrPlain / OptStrPlain + +A plain string column stores per-feature string lengths and the concatenated string data. + +**v1 layout (`Str`, non-optional):** + +``` +── metadata ── [u8 = 28] [varint name_len + bytes] +── data ── +[varint stream_count = 2] +[stream 0: lengths] + [u8 stream_type = 0x31] ← Length(VarBinary) + [u8 encoding] + [varint num_values] ← = feature_count + [varint byte_length] + [length data] +[stream 1: string data] + [u8 stream_type = 0x10] ← Data(None) + [u8 encoding = raw/None] + [varint num_values] ← = total byte count of all strings + [varint byte_length] ← = same + [raw UTF-8 bytes] +``` + +**v1 layout (`OptStr`, optional):** + +``` +── metadata ── [u8 = 29] [varint name_len + bytes] +── data ── +[varint stream_count = 3] +[presence stream] ← full bool-RLE header + data +[stream 1: lengths] ← num_values = number of 1-bits in presence bitfield +[stream 2: string data] +``` + +**v2 layout (`StrPlain`, non-optional):** + +``` +[u8 = 28] [varint name_len + bytes] +[encoding_byte for lengths] ← has_explicit_count=0 (= feature_count) +[lengths data] +[encoding_byte for string data] + has_explicit_count=1 ← total byte count ≠ feature_count + logical=None, physical=None-noLen ← byte_length = num_values × 1 (trivial) +[varint num_values] ← total UTF-8 byte count +[raw UTF-8 bytes] +``` + +**v2 layout (`OptStrPlain`, optional — own presence):** + +``` +[u8 = 29] [varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint; registered as next group +[encoding_byte for lengths] ← has_explicit_count=0; count = popcount(bitfield) +[lengths data] +[encoding_byte for string data] + has_explicit_count=1 + logical=None, physical=None-noLen +[varint num_values] +[raw UTF-8 bytes] +``` + +**v2 layout (`StrPlainSharedPresence`, optional — shared presence):** + +``` +[u8 = StrPlainSharedPresence] [varint name_len + bytes] +[varint presence_group] ← 0-based group index +[encoding_byte for lengths] ← has_explicit_count=0; count = popcount(referenced group) +[lengths data] +[encoding_byte for string data] has_explicit_count=1, logical=None, physical=None-noLen +[varint num_values] +[raw UTF-8 bytes] +``` + +--- + +### 8.2 StrDict / OptStrDict + +A dictionary string column stores unique string values in a dictionary and per-feature +indices into that dictionary. + +**Null encoding change — the most significant difference for dict columns:** + +| | v1 | v2 | +|---|---|---| +| Optional dict column | Presence bitfield controls which features have values; index stream only covers present features | No presence bitfield; **index 0 = null**; index stream covers all `feature_count` features | +| Non-optional dict column | Presence absent; index stream covers all features | Same; index 0 is a valid dict entry (decoder assumes non-null) | +| Index count | = number of `1` bits in presence, or `feature_count` | Always `feature_count` | + +**v1 layout (`OptStr` + dictionary variant, stream_count = 4):** + +``` +── metadata ── [u8 = 29] [varint name_len + bytes] +── data ── +[varint stream_count = 4] +[presence stream] ← bool-RLE; num_values = feature_count +[offsets stream] ← Offset(String); per-present-feature dict indices + [u8 stream_type = 0x23] + [u8 encoding] + [varint num_values] ← = number of 1-bits in presence bitfield + [varint byte_length] + [offset data] +[dict lengths stream] ← Length(Dictionary) + [u8 stream_type = 0x36] + [u8 encoding] + [varint num_values] ← = dict size + [varint byte_length] + [dict length data] +[dict data stream] ← Data(Single/Shared) + [u8 stream_type = 0x11/0x12] + [u8 encoding = raw/None] + [varint num_values] ← = total dict UTF-8 bytes + [varint byte_length] + [raw UTF-8 dict bytes] +``` + +**v2 layout (`OptStrDict`, optional):** + +``` +[u8 = 31] [varint name_len + bytes] +← NO presence section for dict variants +[encoding_byte for dict_lengths] + has_explicit_count=1 ← dict size ≠ feature_count +[varint num_values] ← dict entry count +[dict_lengths data] +[encoding_byte for dict_data] + has_explicit_count=1 + logical=None, physical=None-noLen ← byte_length = num_values × 1 +[varint num_values] ← total dict UTF-8 bytes +[raw UTF-8 dict bytes] +[encoding_byte for indices] + has_explicit_count=0 ← = feature_count (index 0 = null, all features present) +[indices data] ← index 0 = null; indices 1..N map to dict entries 0..N-1 +``` + +**v2 layout (`StrDict`, non-optional):** + +Identical wire format to `OptStrDict`. The column type code (30 vs 31) tells the +decoder whether index 0 should be treated as null. + +--- + +### 8.3 StrFsst / OptStrFsst + +FSST-plain compresses string data using a per-column symbol table. The column stores a +symbol table, per-value lengths, and the compressed corpus. + +**FSST symbol table stream** (same for all FSST variants): + +``` +[encoding_byte] + has_explicit_count=1 + logical=None, physical=None-noLen ← raw bytes; byte_length = num_values × 1 +[varint num_values] ← number of raw symbol table bytes +[symbol table bytes] +``` + +Symbol lengths (how long each symbol is) precede the symbol table bytes. + +**v1 layout (`OptStr` + FSST-plain variant, stream_count = 5):** + +``` +[varint stream_count = 5] +[presence stream] ← bool-RLE header +[symbol_lengths stream] ← Length(Symbol) +[symbol_table stream] ← Data(Fsst) +[per-value lengths stream] ← Length(Dictionary) +[compressed corpus stream] ← Data(Single/Shared) +``` + +Each stream has a full 4-field header plus optional RLE metadata. + +**v2 layout (`OptStrFsst`, optional — own presence):** + +``` +[u8 = 33] [varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint; registered as next group +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [per-value lengths] ← has_explicit_count=0; count = popcount(bitfield) +[encoding_byte] [compressed corpus] ← has_explicit_count=1, logical=None, physical=None-noLen +``` + +**v2 layout (`StrFsst`, non-optional):** + +Same but no presence section; per-value lengths count = `feature_count`. + +--- + +### 8.4 StrFsstDict / OptStrFsstDict + +FSST-dictionary stores the dictionary corpus FSST-compressed. Per-feature values are +indices into the decoded dictionary. + +**v1 layout (`OptStr` + FSST-dictionary variant, stream_count = 6):** + +``` +[varint stream_count = 6] +[presence stream] ← bool-RLE +[symbol_lengths stream] +[symbol_table stream] +[dict_lengths stream] +[compressed dict corpus stream] +[per-feature offsets stream] ← Offset(String); indices per present feature +``` + +**v2 layout (`OptStrFsstDict`, optional):** + +``` +[u8 = 35] [varint name_len + bytes] +← NO presence section +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 (dict size ≠ feature_count) +[encoding_byte] [dict_corpus data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [indices data] ← has_explicit_count=0 (= feature_count); index 0 = null +``` + +**v2 layout (`StrFsstDict`, non-optional):** + +Same wire format; decoder does not treat index 0 as null. + +--- + +## SharedDict Columns + +A SharedDict column stores a shared string corpus (plain or FSST-compressed) that +multiple child columns index into. + +### Overview of changes + +| Aspect | v1 | v2 | +|---|---|---| +| Column types | `SharedDict (30)` — single type with runtime variant | `SharedDictPlain (36)`, `SharedDictFsst (37)` — flat types | +| Corpus encoding variant | Runtime: detected from stream types within stream_count group | Static: encoded in column type code | +| Children | `varint stream_count` + optional presence + offset stream per child | Full column definitions with type byte (`SharedDictChildRef`/`OptSharedDictChildRef`) | +| Optional child null encoding | Presence bitfield (separate stream per optional child) | Null-at-index-0 in the indices stream | +| Shared presence for children | Not possible | Possible for plain/FSST variants; **not** for dict children (null-at-0 convention) | + +### v1 layout + +``` +── metadata ── [u8 = 30] [varint name_len + bytes] + [varint child_col_count] + [u8 child_type₀] [varint name_len + bytes] + ... +── data ── +[varint stream_count] ← dict streams + (1 or 2 per child) +[dict stream(s)] ← lengths + data, or FSST 4-stream group +[per child:] + [varint child_stream_count] ← 1 or 2 depending on optionality + [optional presence stream] ← bool-RLE header + data + [offset/index stream] ← Offset(Key) or Offset(String) + [u8 stream_type = 0x23] + [u8 encoding] + [varint num_values] + [varint byte_length] + [index data] +``` + +### v2 layout (SharedDictPlain) + +``` +[u8 = 36] [varint name_len + bytes] +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 +[encoding_byte] [dict_data bytes] ← has_explicit_count=1, logical=None, physical=None-noLen + +[varint child_column_count] +[child 0:] + [u8 = 38 (SharedDictChildRef) or 39 (OptSharedDictChildRef)] + [varint name_len + bytes] + ← NO presence section for child ref types (null-at-0 for optional) + [encoding_byte] [indices data] + has_explicit_count=0 ← = feature_count for both ref types; index 0 = null when type = 39 +[child 1:] + ... +``` + +### v2 layout (SharedDictFsst) + +``` +[u8 = 37] [varint name_len + bytes] +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 +[encoding_byte] [dict_corpus data] ← has_explicit_count=1, logical=None, physical=None-noLen + +[varint child_column_count] +[children identical to SharedDictPlain children...] +``` + +### Child type codes + +| Column type | Code | Null handling | +|---|---|---| +| `SharedDictChildRef` | 38 | No nulls; index 0 is a valid dict entry | +| `OptSharedDictChildRef` | 39 | index 0 = null; all other indices map to dict | + +--- + +## RLE Encoding + +### v1 RLE stream header extras + +All RLE and DeltaRle streams (except bool-RLE presence streams) carry two extra varints +after `byte_length`: + +``` +[varint runs] ← number of (run_length, value) pairs +[varint num_rle_values] ← sum of all run_lengths = total decoded element count +``` + +Both fields are redundant: they can be derived from the stream data itself. + +### v2 changes + +Both fields are **eliminated**: + +- `num_rle_values` = `num_values`, already known from context (`feature_count`, + presence popcount, or explicit count in encoding byte). +- `runs` is recoverable by scanning: the data is interleaved `[run_length_varint, + value_varint]` pairs, so `runs = total_pairs` (found by reading to byte_length). + +Bool-RLE is eliminated entirely; presence data uses raw packed bitfields (§5). + +### Encoding byte for RLE streams + +| Logical | Encoding byte bits 6-4 | Physical implied | byte_length | +|---|---|---|---| +| `Rle` | `011` | VarInt | always follows | +| `DeltaRle` | `100` | VarInt | always follows | + +bits 3-2 (physical field) must be `00` for Rle/DeltaRle; they are reserved for future +RLE sub-variants. No extension byte follows. + +--- + +## Morton Encoding + +### v1 Morton header extras + +Morton streams carry `bits` and `shift` after `byte_length`: + +``` +[varint bits] ← number of bits used per coordinate component +[varint shift] ← coordinate shift value +``` + +Morton sub-variant (plain/delta/rle) is encoded via `logical2` in the encoding byte. + +### v2 changes + +`bits` and `shift` are moved to an **extension byte** that is unconditionally present +whenever `logical = Morton` (no flag needed): + +``` +[encoding_byte] + logical = 5 (Morton) ← bits 6-4 + physical = ... ← bits 3-2 (all four values valid for Morton) +[extension_byte] ← always present; no flag + bits 7-6: Morton sub-variant (00=None, 01=Rle, 10=Delta, 11=reserved) + bits 5-0: reserved (must be 0) +[varint bits] +[varint shift] +``` + +The `logical2` field in v1's encoding byte is replaced by the Morton sub-variant in the +extension byte. The extension byte is implied by `logical=Morton` rather than by an +explicit `has_extension` flag. + +--- + +## FastPFor Codec + +### v1 FastPFor wire format + +``` +[u32 BE] N = number of FastPFor-compressed u32 words +[N × u32 BE] FastPFor primary codec output +[remaining u32 BE words] VariableByte remainder codec output +``` + +All u32 words are stored **big-endian**. Block size is **256**. + +### v2 FastPFor wire format + +``` +[u32 LE] N = number of FastPFor-compressed u32 words +[N × u32 LE] FastPFor primary codec output +[remaining u32 LE words] VariableByte remainder codec output +``` + +Changes: +- All u32 words are stored **little-endian**. +- Block size is **128**. +- `physical=FastPFor128` (value 3 in bits 3-2) always carries `byte_length`; it marks + where the FastPFor stream ends and the next stream begins. + +--- + +## Column Type Codes + +### v1 column type codes + +| Code | Type | Code | Type | +|------|------|------|------| +| 0 | Id | 1 | OptId | +| 2 | LongId | 3 | OptLongId | +| 4 | Geometry | — | — | +| 10 | Bool | 11 | OptBool | +| 12 | I8 | 13 | OptI8 | +| 14 | U8 | 15 | OptU8 | +| 16 | I32 | 17 | OptI32 | +| 18 | U32 | 19 | OptU32 | +| 20 | I64 | 21 | OptI64 | +| 22 | U64 | 23 | OptU64 | +| 24 | F32 | 25 | OptF32 | +| 26 | F64 | 27 | OptF64 | +| 28 | Str | 29 | OptStr | +| 30 | SharedDict | — | — | + +### v2 column type codes + +Specific numeric assignments are deferred to the final spec; only the variant names +are defined here. + +Every nullable scalar/string type has **three** variants: + +| Variant suffix | Presence mechanism | +|---|---| +| *(none)* — non-optional | No presence | +| `Opt*` — own-presence | Bitfield directly follows name; no prefix varint | +| `OptRef*` — shared-presence | `varint presence_group` follows name | + +**ID and scalar types:** + +| Non-optional | Own-presence | Shared-presence | +|---|---|---| +| `Id` | `OptId` | `OptRefId` | +| `LongId` | `OptLongId` | `OptRefLongId` | +| `Bool` | `OptBool` | `OptRefBool` | +| `I8` | `OptI8` | `OptRefI8` | +| `U8` | `OptU8` | `OptRefU8` | +| `I32` | `OptI32` | `OptRefI32` | +| `U32` | `OptU32` | `OptRefU32` | +| `I64` | `OptI64` | `OptRefI64` | +| `U64` | `OptU64` | `OptRefU64` | +| `F32` | `OptF32` | `OptRefF32` | +| `F64` | `OptF64` | `OptRefF64` | + +**String and shared-dict types:** + +| Non-optional | Own-presence | Shared-presence | +|---|---|---| +| `StrPlain` | `OptStrPlain` | `OptRefStrPlain` | +| `StrDict` | — (null-at-0; no presence bitfield) | +| `StrFsst` | `OptStrFsst` | `OptRefStrFsst` | +| `StrFsstDict` | — (null-at-0; no presence bitfield) | +| `SharedDictPlain` | — | — | +| `SharedDictFsst` | — | — | +| `SharedDictChildRef` | — (null-at-0; no presence bitfield) | + +**Invariants:** +- Dict-family string types (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) have no + shared-presence variant because they encode null via index 0, not via a presence bitfield. +- Shared-presence (`OptRef*`) code = own-presence code with bit 7 set. + +--- + +## Stream Type Bytes (Eliminated) + +In v1 every stream begins with a `stream_type` byte that encodes the stream's role: + +``` +bits 7-4: category (0=Present, 1=Data, 2=Offset, 3=Length) +bits 3-0: subtype (DictionaryType / OffsetType / LengthType) +``` + +In v2 this byte is **eliminated from the wire format entirely**. Stream role is always +determinable without it: + +| Column type | How stream role is known | +|---|---| +| Scalar / ID | Single data stream; role is trivial | +| Geometry section | `geometry_layout` determines which streams are present; position in fixed sequence declares role | +| `StrPlain` / `OptStrPlain` | Positions: 1=lengths, 2=string-data | +| `StrDict` / `OptStrDict` | Positions: 1=dict-lengths, 2=dict-data, 3=indices | +| `StrFsst` / `OptStrFsst` | Positions: 1=sym-lengths, 2=sym-table, 3=lengths, 4=corpus | +| `StrFsstDict` / `OptStrFsstDict` | Positions: 1=sym-lengths, 2=sym-table, 3=dict-lengths, 4=dict-corpus, 5=indices | +| `SharedDictPlain` / `SharedDictFsst` | Corpus streams in fixed order; children are self-describing column definitions | + +The `StreamType` concept (`Present`, `Data`, `Offset`, `Length`) and its subtypes +(`DictionaryType`, `OffsetType`, `LengthType`) remain useful as descriptive labels for +documentation and in-memory representations, but they are no longer serialized to the +wire in v2. + +--- + +## Overhead Comparison Table + +Per-stream header costs, typical small-integer varint values. + +| Stream Kind | v1 overhead (bytes) | v2 overhead (bytes) | Saving | +|----------------------------------------|---|---|---| +| Presence bitfield header (`Opt*`) | 4–14 | 0 (bitfield directly follows column type) | 4–14 | +| Shared presence (`OptRef*`) | full bitfield copy | 1 (presence_group varint) | ≈ ceil(N/8) | +| Non-optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| Non-optional scalar, None-noLen | 4–6 | 1 | 3–5 | +| Non-optional scalar, FastPFor128 | 5–7 | 1 (enc) + varint(size) | 2–4 | +| Optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| RLE stream, VarInt | 6–10 | 1 | 5–9 | +| Geometry: stream_count varint | 1–3 | 0 (encoded in `geometry_layout`) | 1–3 | +| Geometry: types stream | 4–6 | 1 | 3–5 | +| Geometry: aux stream, FastPFor | 6–10 | 1 + varint(count) + varint(size) | 2–4 | +| Morton vertex stream | 8–14 | 1 + ext + 2 varints | 4–8 | +| String variant declaration | 1 (Str/OptStr type) + 1–3 (stream_count) | 0 | 2–4 | +| Dict optional: presence + index stream | 4–10 + bool-RLE | 0 (null-at-0) | 4–10 + bitfield | + +**Estimated total saving for a typical tile (100 streams, 10 optional columns):** +600–1 000 bytes of stream-header metadata, plus elimination of bool-RLE presence data +(replaced by compact bitfields). diff --git a/justfile b/justfile index 7d2753607..56e569e4b 100755 --- a/justfile +++ b/justfile @@ -69,6 +69,12 @@ ci-extract-version language tag: mlt *args: cargo run --manifest-path {{join(justfile_directory(), 'rust', 'Cargo.toml')}} --package mlt -- "$@" +# Run the mlt CLI tool with the given arguments from current dir. +[no-cd] +[positional-arguments] # avoids shell expansions +mlt-rel *args: + cargo run --release --manifest-path {{join(justfile_directory(), 'rust', 'Cargo.toml')}} --package mlt -- "$@" + # Ensure a command is available assert-cmd command: #!/usr/bin/env bash diff --git a/mkdocs.yml b/mkdocs.yml index 3a2e6c0b2..ef82aa970 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - Encoding Algorithms: encodings.md - Specification: specification.md - Geometry Spec: geometry.md + - Migrating to v2: migrating-to-v2.md extra: social: diff --git a/rust/mlt-core/src/decoder/layer.rs b/rust/mlt-core/src/decoder/layer.rs index 00e7ea9c0..d0a3ed5c7 100644 --- a/rust/mlt-core/src/decoder/layer.rs +++ b/rust/mlt-core/src/decoder/layer.rs @@ -1,6 +1,7 @@ use crate::codecs::varint::parse_varint; use crate::decoder::{Layer01, Unknown}; use crate::utils::{parse_u8, take}; +use crate::v2::decode_v2_layer; use crate::{DecodeState, Decoder, Layer, MltError, MltRefResult, MltResult, ParsedLayer, Parser}; impl<'a, S: DecodeState> Layer<'a, S> { @@ -9,7 +10,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn as_layer01(&self) -> Option<&Layer01<'a, S>> { match self { Self::Tag01(l) => Some(l), - Self::Unknown(_) => None, + _ => None, } } @@ -18,7 +19,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn into_layer01(self) -> Option> { match self { Self::Tag01(l) => Some(l), - Self::Unknown(_) => None, + _ => None, } } } @@ -37,8 +38,8 @@ impl<'a> Layer<'a> { let (input, value) = take(input, size)?; let layer = match tag { - // For now, we only support tag 0x01 layers, but more will be added soon 1 => Layer::Tag01(Layer01::from_bytes(value, parser)?), + 2 => Layer::Tag02(decode_v2_layer(value)?), tag => Layer::Unknown(Unknown { tag, value }), }; @@ -51,7 +52,8 @@ impl<'a> Layer<'a> { /// `Layer::Tag01(lazy)` and call the individual methods on [`Layer01`]. pub fn decode_all(self, dec: &mut Decoder) -> MltResult> { match self { - Layer::Tag01(lazy) => Ok(Layer::Tag01(lazy.decode_all(dec)?)), + Layer::Tag01(v) => Ok(Layer::Tag01(v.decode_all(dec)?)), + Layer::Tag02(v) => Ok(Layer::Tag02(v)), Layer::Unknown(u) => Ok(Layer::Unknown(u)), } } diff --git a/rust/mlt-core/src/decoder/model.rs b/rust/mlt-core/src/decoder/model.rs index 6c3bcd6b3..5dd10c60a 100644 --- a/rust/mlt-core/src/decoder/model.rs +++ b/rust/mlt-core/src/decoder/model.rs @@ -14,6 +14,11 @@ use crate::{DecodeState, Lazy, Parsed}; pub enum Layer<'a, S: DecodeState = Lazy> { /// MVT-compatible layer (tag = 1) Tag01(Layer01<'a, S>), + /// Experimental v2 layer (tag = 2), decoded eagerly to owned row-oriented form. + /// + /// Unlike `Tag01`, this variant is always fully decoded when parsed: the + /// `S` type parameter is ignored and the data is stored as a [`TileLayer01`]. + Tag02(TileLayer01), /// Unknown layer with tag, size, and value Unknown(Unknown<'a>), } @@ -26,6 +31,7 @@ where fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Tag01(l) => f.debug_tuple("Tag01").field(l).finish(), + Self::Tag02(t) => f.debug_tuple("Tag02").field(t).finish(), Self::Unknown(u) => f.debug_tuple("Unknown").field(u).finish(), } } @@ -116,6 +122,8 @@ pub struct Layer01<'a, S: DecodeState = Lazy> { pub(crate) layer_order: Vec, } +impl StrParse for Layer {} + pub type ParsedLayer01<'a> = Layer01<'a, Parsed>; impl<'a, S> fmt::Debug for Layer01<'a, S> diff --git a/rust/mlt-core/src/lib.rs b/rust/mlt-core/src/lib.rs index bf7c8930c..647192a85 100644 --- a/rust/mlt-core/src/lib.rs +++ b/rust/mlt-core/src/lib.rs @@ -23,6 +23,7 @@ pub(crate) mod decoder; pub mod encoder; pub(crate) mod errors; pub(crate) mod utils; +pub mod v2; pub use convert::{geojson, mvt}; pub use decoder::{ diff --git a/rust/mlt-core/src/utils/test_helpers.rs b/rust/mlt-core/src/utils/test_helpers.rs index 85f85d1de..b6f5bc0cb 100644 --- a/rust/mlt-core/src/utils/test_helpers.rs +++ b/rust/mlt-core/src/utils/test_helpers.rs @@ -27,7 +27,8 @@ pub fn assert_empty(result: MltRefResult) -> T { pub fn into_layer01(layer: Layer) -> Layer01 { match layer { Layer::Tag01(layer01) => layer01, - Layer::Unknown(_) => panic!("expected Tag01 layer"), + Layer::Tag02(_) => panic!("expected Tag01 layer, got Tag02"), + Layer::Unknown(_) => panic!("expected Tag01 layer, got Unknown"), } } diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs new file mode 100644 index 000000000..ee9344211 --- /dev/null +++ b/rust/mlt-core/src/v2.rs @@ -0,0 +1,2550 @@ +//! MLT v2 experimental encoder and decoder. +//! +//! This module implements a minimal v2 wire format for round-trip experimentation. +//! The format uses tag `2` to distinguish v2 layers from v1 layers (tag `1`). +//! +//! Encoding strategies (each stream picks the smallest result automatically): +//! +//! - **Feature ordering**: tries `Unsorted`, `SpatialMorton`, and `SpatialHilbert`, keeping +//! the ordering that produces the smallest encoded tile. Mirrors the v1 optimizer. +//! - **Geometry types stream**: tries VarInt and RLE. +//! For single-type layers (all Points, all Lines …) RLE reduces from N bytes to ~3 bytes. +//! - **Vertex data**: ComponentwiseDelta applied first, then tries: +//! - VarInt (1 byte per small delta) +//! - FastPFor128 LE (bit-packing in blocks of 128; wins for layers with many vertices) +//! - **Integer ID columns**: tries plain VarInt and Delta+VarInt. +//! Sequential OSM IDs encode as 1-byte deltas each. +//! - **String columns**: tries three encodings, keeps smallest: +//! - StrPlain / OptStrPlain: per-value byte lengths + raw UTF-8 bytes. +//! - StrDict / OptStrDict: deduplicated dictionary + per-feature indices. +//! Best for low-cardinality columns (road class, surface type …). +//! - StrFsst / OptStrFsst: FSST symbol-table compression. +//! Best for high-cardinality columns (street names, place names …). +//! - **Presence**: raw packed bitfields for optional columns (no bool-RLE header). +//! - **Shared dictionaries**: string columns with similar content are grouped using MinHash +//! similarity (same algorithm as v1). Each group encodes one shared dictionary corpus +//! (plain or FSST-compressed, whichever is smaller) followed by per-child index streams. +//! Mirrors v1's `ColumnType::SharedDict` optimization. Note: property column order in the +//! decoded layer may differ from the original when grouping is applied (shared-dict children +//! are emitted together before the next non-grouped column). + +use std::collections::HashMap; + +use fastpfor::{AnyLenCodec as _, FastPFor128}; +use geo_types::{ + Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon, +}; +use integer_encoding::{VarInt as _, VarIntWriter as _}; +use strum::FromRepr; +use zigzag::ZigZag as _; + +use crate::codecs::fsst::compress_fsst; +use crate::codecs::varint::parse_varint; +use crate::codecs::zigzag::encode_componentwise_delta_vec2s; +use crate::decoder::{GeometryType, PropValue, TileFeature, TileLayer01}; +use crate::encoder::{ + EncoderConfig, SortStrategy, StagedSharedDict, StagedSharedDictItem, StringGroup, + group_string_properties, spatial_sort_likely_to_help, +}; +use crate::utils::{BinarySerializer as _, parse_string}; +use crate::{MltError, MltResult}; + +/// Minimum feature count above which the bounding-box heuristic is applied +/// before deciding whether to try spatial sort trials. Mirrors v1's threshold. +const SORT_TRIAL_THRESHOLD: usize = 512; + +// ── Encoding byte bit layout ────────────────────────────────────────────────── +// bit 7: has_explicit_count (1 = count varint precedes byte_length) +// bits 6-4: logical encoding (0=None, 1=Delta, 2=CwDelta, 3=Rle) +// bits 3-2: physical encoding (0=None-noLen, 1=None-withLen, 2=VarInt, 3=FastPFor128) +// bits 1-0: reserved (0) + +/// Stream encoding byte for MLT v2 wire format. +/// +/// Each variant name encodes the logical+physical combination and whether an +/// explicit count varint precedes the byte-length varint. +/// Variants ending in `Expl` carry `bit7=1` and are followed by a count field. +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] +enum Enc { + /// logical=None, physical=VarInt, count from context (implicit) + VarInt = 0x08, + /// logical=None, physical=VarInt, explicit count follows + VarIntExpl = 0x88, + /// logical=Delta, physical=VarInt, count from context (implicit) + DeltaVarInt = 0x18, + /// logical=CwDelta, physical=VarInt, explicit count follows + CwDeltaVarInt = 0xA8, + /// logical=CwDelta, physical=FastPFor128, explicit count follows + CwDeltaFp128 = 0xAC, + /// logical=Rle, physical=reserved; byte_length always follows (no count field) + Rle = 0x30, + /// logical=None, physical=None-noLen, explicit count (= byte count) follows + RawExpl = 0x80, +} + +impl TryFrom for Enc { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + Self::from_repr(v).ok_or(MltError::NotImplemented( + "unsupported v2 stream encoding byte", + )) + } +} + +// ── Geometry layout byte values ─────────────────────────────────────────────── + +/// MLT v2 geometry section layout codes. +/// +/// Encodes which geometry streams are present and in what order. +/// Dict and tessellation variants are excluded from this initial implementation. +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] +pub enum GeoLayout { + /// Types, Vertices + Points = 0, + /// Types, GeoLengths, Vertices + MultiPoints = 2, + /// Types, PartLengths, Vertices + Lines = 4, + /// Types, GeoLengths, PartLengths, Vertices + MultiLines = 6, + /// Types, PartLengths, RingLengths, Vertices + Polygons = 8, + /// Types, GeoLengths, PartLengths, RingLengths, Vertices + MultiPolygons = 10, +} + +impl TryFrom for GeoLayout { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + Self::from_repr(v).ok_or(MltError::NotImplemented("unsupported v2 geometry layout")) + } +} + +// ── Column type codes (reuse v1 codes for backward clarity) ────────────────── + +/// Wire column type byte for MLT v2 property columns. +/// +/// Each variant maps to its discriminant value on the wire. +/// Optional variants encode the same data as their non-optional counterpart +/// but are preceded by a packed presence bitfield (except `OptStrDict`, which +/// uses index 0 to signal null). +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] +enum ColType { + // IDs + Id = 0, + OptId = 1, + // Scalars (mirror v1 ColumnType values) + Bool = 10, + OptBool = 11, + I8 = 12, + OptI8 = 13, + U8 = 14, + OptU8 = 15, + I32 = 16, + OptI32 = 17, + U32 = 18, + OptU32 = 19, + I64 = 20, + OptI64 = 21, + U64 = 22, + OptU64 = 23, + F32 = 24, + OptF32 = 25, + F64 = 26, + OptF64 = 27, + // Strings: plain lengths + raw bytes (presence bitfield when optional) + Str = 28, + OptStr = 29, + // Strings: deduplicated dictionary; index 0 = null for OptStrDict + StrDict = 30, + OptStrDict = 31, + // Strings: FSST-compressed corpus (presence bitfield when optional) + StrFsst = 32, + OptStrFsst = 33, + // Strings: cross-column shared dictionary (plain corpus) + StrSharedDict = 34, + // Strings: cross-column shared dictionary (FSST corpus) + StrSharedDictFsst = 35, +} + +impl TryFrom for ColType { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + Self::from_repr(v).ok_or(MltError::NotImplemented("unsupported v2 column type")) + } +} + +/// Flag byte for each child within a shared-dict column: bit 0 = child has null values. +const CHILD_OPTIONAL: u8 = 0x01; + +// ── Encoder ─────────────────────────────────────────────────────────────────── + +impl TileLayer01 { + /// Encode this layer to the MLT **v2** experimental wire format. + /// + /// Respects the same [`EncoderConfig`] flags as the v1 encoder: + /// - `try_spatial_morton_sort` / `try_spatial_hilbert_sort`: spatial sort trials. + /// - `allow_fsst`: FSST string compression. + /// - `allow_fpf`: FastPFor128 vertex compression. + /// - `allow_shared_dict`: cross-column shared dictionary grouping. + /// + /// Returns a complete framed record: `[varint(body_len+1)][tag=2][body…]` + /// ready to be concatenated with other layers in a tile. + /// + /// Returns an empty `Vec` for layers with no features. + pub fn encode_v2(&self, cfg: EncoderConfig) -> MltResult> { + if self.features.is_empty() { + return Ok(Vec::new()); + } + + // Baseline: unsorted feature order. + let mut best = self.encode_v2_with_sort(cfg)?; + + // Apply the same spatial-sort heuristic as the v1 optimizer. + let try_spatial = + self.features.len() < SORT_TRIAL_THRESHOLD || spatial_sort_likely_to_help(self); + + if try_spatial { + let strategies = [ + (SortStrategy::SpatialMorton, cfg.try_spatial_morton_sort), + (SortStrategy::SpatialHilbert, cfg.try_spatial_hilbert_sort), + ]; + for (strategy, enabled) in strategies { + if !enabled { + continue; + } + let mut candidate = self.clone(); + candidate.sort(strategy); + let bytes = candidate.encode_v2_with_sort(cfg)?; + if bytes.len() < best.len() { + best = bytes; + } + } + } + + Ok(best) + } + + /// Encode this layer to v2 bytes with features in their **current** order. + /// + /// Called by [`encode_v2`] once per sort-strategy trial. + fn encode_v2_with_sort(&self, cfg: EncoderConfig) -> MltResult> { + let feature_count = self.features.len(); + + // ── Geometry ────────────────────────────────────────────────────────── + let geom = collect_geometry(&self.features)?; + + // ── IDs ─────────────────────────────────────────────────────────────── + let has_ids = self.features.iter().any(|f| f.id.is_some()); + let all_ids: Vec> = if has_ids { + self.features.iter().map(|f| f.id).collect() + } else { + Vec::new() + }; + + // ── Collect column chunks ───────────────────────────────────────────── + // We must know the total column count before writing it, so collect all + // encoded chunks first, then write col_count + chunks together. + // + // For shared-dict groups, we compare the shared-dict chunk against encoding + // each member individually and keep whichever is smaller. + + // Group similar string columns with MinHash (mirrors v1); skip when disabled. + let groups = if cfg.allow_shared_dict { + group_string_properties(self) + } else { + Vec::new() + }; + let col_to_group: HashMap = groups + .iter() + .flat_map(|g| g.columns.iter().map(move |(_, i)| (*i, g))) + .collect(); + let mut group_start: HashMap = + groups.iter().map(|g| (g.columns[0].1, g)).collect(); + + let mut col_chunks: Vec> = Vec::new(); + + // ID column (single chunk) + if has_ids { + let mut chunk = Vec::new(); + write_id_column(&mut chunk, &all_ids, feature_count)?; + col_chunks.push(chunk); + } + + // Property columns + for (col_idx, name) in self.property_names.iter().enumerate() { + if let Some(g) = group_start.remove(&col_idx) { + // Try shared-dict encoding (all group members as one encoded column). + let shared_chunk = encode_shared_dict_chunk(g, &self.features, cfg.allow_fsst)?; + + // Try individual encoding for every group member as a baseline. + let individual_chunks: Vec> = g + .columns + .iter() + .map(|(_, c_idx)| { + let full_name = &self.property_names[*c_idx]; + let vals: Vec<&PropValue> = self + .features + .iter() + .map(|f| &f.properties[*c_idx]) + .collect(); + let mut chunk = Vec::new(); + write_property_column( + &mut chunk, + full_name, + &vals, + feature_count, + cfg.allow_fsst, + )?; + Ok(chunk) + }) + .collect::>()?; + let individual_total: usize = individual_chunks.iter().map(|c| c.len()).sum(); + + if shared_chunk.len() <= individual_total { + col_chunks.push(shared_chunk); + } else { + col_chunks.extend(individual_chunks); + } + } else if !col_to_group.contains_key(&col_idx) { + let vals: Vec<&PropValue> = self + .features + .iter() + .map(|f| &f.properties[col_idx]) + .collect(); + let mut chunk = Vec::new(); + write_property_column(&mut chunk, name, &vals, feature_count, cfg.allow_fsst)?; + col_chunks.push(chunk); + } + // else: column absorbed into a shared-dict group handled above. + } + + // ── Build body ──────────────────────────────────────────────────────── + let mut body: Vec = Vec::new(); + + // name + body.write_string(&self.name)?; + + // extent + body.write_varint(self.extent)?; + + // feature_count (v2 addition) + body.write_varint(feature_count as u32)?; + + // ── Geometry section ────────────────────────────────────────────────── + body.push(geom.layout as u8); + write_geometry_streams( + &mut body, + &geom.types, + geom.geo_lengths.as_deref(), + geom.part_lengths.as_deref(), + geom.ring_lengths.as_deref(), + &geom.vertices, + feature_count, + cfg.allow_fpf, + )?; + + // ── Columns ─────────────────────────────────────────────────────────── + body.write_varint(col_chunks.len() as u32)?; + for chunk in col_chunks { + body.extend_from_slice(&chunk); + } + + // ── Frame: [varint(body_len+1)][tag=2][body] ────────────────────────── + let size = u32::try_from(body.len() + 1)?; // +1 for the tag byte + let mut out = Vec::with_capacity(5 + 1 + body.len()); + out.write_varint(size)?; + out.push(2_u8); // tag = 2 + out.extend_from_slice(&body); + + Ok(out) + } +} + +// ── Geometry collection ─────────────────────────────────────────────────────── + +struct CollectedGeometry { + layout: GeoLayout, + /// Geometry type per feature (u8 repr of GeometryType) + types: Vec, + /// Number of component geometries per multi-geometry feature + geo_lengths: Option>, + /// Number of rings per polygon, or vertices per line + part_lengths: Option>, + /// Number of vertices per ring + ring_lengths: Option>, + /// Flat vertex buffer `[x0, y0, x1, y1, …]` + vertices: Vec, +} + +fn collect_geometry(features: &[TileFeature]) -> MltResult { + let has_multi = features.iter().any(|f| { + matches!( + f.geometry, + Geometry::MultiPoint(_) | Geometry::MultiLineString(_) | Geometry::MultiPolygon(_) + ) + }); + let has_ring = features + .iter() + .any(|f| matches!(f.geometry, Geometry::Polygon(_) | Geometry::MultiPolygon(_))); + let has_line = features.iter().any(|f| { + matches!( + f.geometry, + Geometry::LineString(_) | Geometry::MultiLineString(_) + ) + }); + let has_part = has_line || has_ring; + + let layout = match (has_multi, has_ring, has_part) { + (_, true, _) if has_multi => GeoLayout::MultiPolygons, + (_, true, _) => GeoLayout::Polygons, + (_, false, true) if has_multi => GeoLayout::MultiLines, + (_, false, true) => GeoLayout::Lines, + (true, false, false) => GeoLayout::MultiPoints, + (false, false, false) => GeoLayout::Points, + }; + + let mut types = Vec::with_capacity(features.len()); + let mut geo_lengths: Option> = if has_multi { Some(Vec::new()) } else { None }; + let mut part_lengths: Option> = if has_part { Some(Vec::new()) } else { None }; + let mut ring_lengths: Option> = if has_ring { Some(Vec::new()) } else { None }; + let mut vertices: Vec = Vec::new(); + + for feat in features { + push_feature_geometry( + &feat.geometry, + &mut types, + &mut geo_lengths, + &mut part_lengths, + &mut ring_lengths, + &mut vertices, + )?; + } + + Ok(CollectedGeometry { + layout, + types, + geo_lengths, + part_lengths, + ring_lengths, + vertices, + }) +} + +fn push_coords(coords: impl Iterator>, vertices: &mut Vec) -> u32 { + let start = vertices.len(); + for c in coords { + vertices.push(c.x); + vertices.push(c.y); + } + u32::try_from((vertices.len() - start) / 2).expect("vertex count fits u32") +} + +/// For polygon rings: GeoJSON rings include the closing coordinate (first == last), +/// which MLT does not store. Drop the last coordinate if it equals the first. +fn ring_vertex_count(ring: &LineString, vertices: &mut Vec) -> u32 { + let coords = &ring.0; + let has_closing = coords.len() >= 2 && coords.first() == coords.last(); + let coords_to_write = if has_closing { + &coords[..coords.len() - 1] + } else { + &coords[..] + }; + push_coords(coords_to_write.iter().copied(), vertices) +} + +fn push_feature_geometry( + geom: &Geometry, + types: &mut Vec, + geo_lengths: &mut Option>, + part_lengths: &mut Option>, + ring_lengths: &mut Option>, + vertices: &mut Vec, +) -> MltResult<()> { + match geom { + Geometry::Point(pt) => { + types.push(GeometryType::Point as u32); + vertices.push(pt.0.x); + vertices.push(pt.0.y); + } + Geometry::MultiPoint(mp) => { + types.push(GeometryType::MultiPoint as u32); + let count = push_coords(mp.0.iter().map(|p| p.0), vertices); + geo_lengths.as_mut().map(|v| v.push(count)); + } + Geometry::LineString(ls) => { + types.push(GeometryType::LineString as u32); + let count = push_coords(ls.0.iter().copied(), vertices); + part_lengths.as_mut().map(|v| v.push(count)); + } + Geometry::MultiLineString(mls) => { + types.push(GeometryType::MultiLineString as u32); + geo_lengths.as_mut().map(|v| v.push(mls.0.len() as u32)); + for ls in &mls.0 { + let count = push_coords(ls.0.iter().copied(), vertices); + part_lengths.as_mut().map(|v| v.push(count)); + } + } + Geometry::Polygon(poly) => { + types.push(GeometryType::Polygon as u32); + let ring_count = 1 + poly.interiors().len(); + part_lengths.as_mut().map(|v| v.push(ring_count as u32)); + let ext_count = ring_vertex_count(poly.exterior(), vertices); + ring_lengths.as_mut().map(|v| v.push(ext_count)); + for hole in poly.interiors() { + let hole_count = ring_vertex_count(hole, vertices); + ring_lengths.as_mut().map(|v| v.push(hole_count)); + } + } + Geometry::MultiPolygon(mp) => { + types.push(GeometryType::MultiPolygon as u32); + geo_lengths.as_mut().map(|v| v.push(mp.0.len() as u32)); + for poly in &mp.0 { + let ring_count = 1 + poly.interiors().len(); + part_lengths.as_mut().map(|v| v.push(ring_count as u32)); + let ext_count = ring_vertex_count(poly.exterior(), vertices); + ring_lengths.as_mut().map(|v| v.push(ext_count)); + for hole in poly.interiors() { + let hole_count = ring_vertex_count(hole, vertices); + ring_lengths.as_mut().map(|v| v.push(hole_count)); + } + } + } + _other => { + return Err(MltError::NotImplemented( + "v2 encoder: unsupported geometry type (GeometryCollection not supported)", + )); + } + } + Ok(()) +} + +// ── Stream body builders (return raw bytes, no enc_byte prefix) ─────────────── + +/// Build the VarInt body for a `u32` slice (no enc_byte, no count, no length). +fn build_varint_body_u32(values: &[u32]) -> MltResult> { + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v)?; + } + Ok(tmp) +} + +/// Build the RLE body for a `u32` slice: pairs of `(run_length, value)` as VarInts. +fn build_rle_body_u32(values: &[u32]) -> MltResult> { + let mut tmp = Vec::new(); + let mut i = 0; + while i < values.len() { + let val = values[i]; + let mut run: u32 = 1; + while i + (run as usize) < values.len() && values[i + (run as usize)] == val { + run += 1; + } + tmp.write_varint(run)?; + tmp.write_varint(val)?; + i += run as usize; + } + Ok(tmp) +} + +/// Build the Delta+VarInt body for a `u64` slice (deltas from the previous value). +/// The first element is the original value (delta from 0). +fn build_delta_body_u64(values: &[u64]) -> MltResult> { + let mut tmp = Vec::new(); + let mut prev = 0u64; + for &v in values { + let delta = v.wrapping_sub(prev); + tmp.write_varint(delta)?; + prev = v; + } + Ok(tmp) +} + +// ── Stream write helpers ────────────────────────────────────────────────────── + +/// Write a VarInt-encoded `u32` stream with implicit count (= `feature_count`). +fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + buf.push(Enc::VarInt as u8); + let body = build_varint_body_u32(values)?; + buf.write_varint(body.len() as u32)?; + buf.extend_from_slice(&body); + Ok(()) +} + +/// Write a `u32` types stream choosing the smaller of VarInt vs RLE encoding. +/// +/// For pure single-type layers `[0, 0, 0, …]` this reduces from N bytes to ~3. +fn write_types_stream(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + let varint_body = build_varint_body_u32(values)?; + let rle_body = build_rle_body_u32(values)?; + + // VarInt overhead: 1 (enc_byte) + varint(body_len) + body + // RLE overhead: 1 (enc_byte) + varint(body_len) + body (same shape) + if rle_body.len() < varint_body.len() { + buf.push(Enc::Rle as u8); + buf.write_varint(rle_body.len() as u32)?; + buf.extend_from_slice(&rle_body); + } else { + buf.push(Enc::VarInt as u8); + buf.write_varint(varint_body.len() as u32)?; + buf.extend_from_slice(&varint_body); + } + Ok(()) +} + +/// Write a VarInt-encoded `u32` stream with explicit count. +fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + buf.push(Enc::VarIntExpl as u8); + buf.write_varint(values.len() as u32)?; + let body = build_varint_body_u32(values)?; + buf.write_varint(body.len() as u32)?; + buf.extend_from_slice(&body); + Ok(()) +} + +/// Write a `u64` stream choosing the smaller of plain VarInt vs Delta+VarInt. +/// +/// For sequential IDs (common in OSM) delta encoding reduces each ID to 1 byte. +fn write_u64_stream_best(buf: &mut Vec, values: &[u64]) -> MltResult<()> { + let plain_body = { + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v)?; + } + tmp + }; + let delta_body = build_delta_body_u64(values)?; + + if delta_body.len() < plain_body.len() { + buf.push(Enc::DeltaVarInt as u8); + buf.write_varint(delta_body.len() as u32)?; + buf.extend_from_slice(&delta_body); + } else { + buf.push(Enc::VarInt as u8); + buf.write_varint(plain_body.len() as u32)?; + buf.extend_from_slice(&plain_body); + } + Ok(()) +} + +/// Write a VarInt-encoded `u64` stream with implicit count. +fn write_u64_stream_implicit(buf: &mut Vec, values: &[u64]) -> MltResult<()> { + buf.push(Enc::VarInt as u8); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v)?; + } + buf.write_varint(tmp.len() as u32)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a ZigZag+VarInt encoded `i32` stream with implicit count. +fn write_i32_stream_implicit(buf: &mut Vec, values: &[i32]) -> MltResult<()> { + buf.push(Enc::VarInt as u8); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i32::encode(v))?; + } + buf.write_varint(tmp.len() as u32)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a ZigZag+VarInt encoded `i64` stream with implicit count. +fn write_i64_stream_implicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { + buf.push(Enc::VarInt as u8); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i64::encode(v))?; + } + buf.write_varint(tmp.len() as u32)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a fixed-width raw byte stream with explicit count (number of data bytes). +fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { + buf.push(Enc::RawExpl as u8); + buf.write_varint(data.len() as u32)?; + buf.extend_from_slice(data); + Ok(()) +} + +/// Write vertex data using ComponentwiseDelta + VarInt with explicit count (vertex pairs). +fn write_cwdelta_vertices_varint(pair_count: u32, encoded: &[u32]) -> MltResult> { + let mut out = Vec::new(); + out.push(Enc::CwDeltaVarInt as u8); + out.write_varint(pair_count)?; + let mut tmp = Vec::new(); + for &v in encoded { + tmp.write_varint(v)?; + } + out.write_varint(tmp.len() as u32)?; + out.extend_from_slice(&tmp); + Ok(out) +} + +/// Write vertex data using ComponentwiseDelta + FastPFor128 with explicit count (vertex pairs). +/// +/// FastPFor128 requires the input length to be a multiple of 128. The tail is handled +/// by the VarByte portion of the `Composition(FastPFor128, VariableByte)` codec. +/// Bytes are stored in little-endian u32 order (no byte-swap needed on x86). +fn write_cwdelta_vertices_fp128(pair_count: u32, encoded: &[u32]) -> MltResult> { + if encoded.is_empty() { + // FP128 can't encode an empty slice; fall back handled by caller. + return Ok(Vec::new()); + } + let mut scratch: Vec = Vec::with_capacity(encoded.len() + 1024); + FastPFor128::default() + .encode(encoded, &mut scratch) + .map_err(|_| MltError::NotImplemented("v2: FastPFor128 encode error"))?; + + // v2 wire format: little-endian u32 words (no byte swap on LE hosts). + let byte_data: Vec = scratch.iter().flat_map(|&w| w.to_le_bytes()).collect(); + + let mut out = Vec::new(); + out.push(Enc::CwDeltaFp128 as u8); + out.write_varint(pair_count)?; + out.write_varint(byte_data.len() as u32)?; + out.extend_from_slice(&byte_data); + Ok(out) +} + +/// Write vertex data: tries CwDelta+FastPFor128 and CwDelta+VarInt, keeps the smaller. +/// When `allow_fpf` is false only VarInt is tried (mirrors `EncoderConfig::allow_fpf`). +fn write_cwdelta_vertices(buf: &mut Vec, flat_verts: &[i32], allow_fpf: bool) -> MltResult<()> { + debug_assert!( + flat_verts.len() % 2 == 0, + "vertex buffer must have even length" + ); + let pair_count = (flat_verts.len() / 2) as u32; + + let mut encoded: Vec = Vec::new(); + encode_componentwise_delta_vec2s(flat_verts, &mut encoded); + + let varint_bytes = write_cwdelta_vertices_varint(pair_count, &encoded)?; + + if allow_fpf { + let fp128_bytes = write_cwdelta_vertices_fp128(pair_count, &encoded)?; + if !fp128_bytes.is_empty() && fp128_bytes.len() < varint_bytes.len() { + buf.extend_from_slice(&fp128_bytes); + return Ok(()); + } + } + buf.extend_from_slice(&varint_bytes); + Ok(()) +} + +fn write_geometry_streams( + buf: &mut Vec, + types: &[u32], + geo_lengths: Option<&[u32]>, + part_lengths: Option<&[u32]>, + ring_lengths: Option<&[u32]>, + vertices: &[i32], + feature_count: usize, + allow_fpf: bool, +) -> MltResult<()> { + debug_assert_eq!(types.len(), feature_count); + // Types stream: count = feature_count (implicit); prefer RLE when all same type. + write_types_stream(buf, types)?; + if let Some(g) = geo_lengths { + write_u32_stream_explicit(buf, g)?; + } + if let Some(p) = part_lengths { + write_u32_stream_explicit(buf, p)?; + } + if let Some(r) = ring_lengths { + write_u32_stream_explicit(buf, r)?; + } + write_cwdelta_vertices(buf, vertices, allow_fpf) +} + +// ── Presence bitfield ───────────────────────────────────────────────────────── + +/// Write `ceil(feature_count / 8)` bytes of a packed presence bitfield. +/// +/// Bit `i` (LSB-first within each byte) is set when `values[i]` is non-null. +fn write_presence(buf: &mut Vec, null_mask: &[bool]) { + let byte_count = null_mask.len().div_ceil(8); + let start = buf.len(); + buf.resize(start + byte_count, 0u8); + for (i, &present) in null_mask.iter().enumerate() { + if present { + buf[start + i / 8] |= 1 << (i % 8); + } + } +} + +// ── ID column ───────────────────────────────────────────────────────────────── + +fn write_id_column(buf: &mut Vec, ids: &[Option], _feature_count: usize) -> MltResult<()> { + let any_null = ids.iter().any(|id| id.is_none()); + + if any_null { + // OptId: presence bitfield + data for present values + buf.push(ColType::OptId as u8); + let presence: Vec = ids.iter().map(|id| id.is_some()).collect(); + write_presence(buf, &presence); + let values: Vec = ids.iter().filter_map(|id| *id).collect(); + write_u64_stream_best(buf, &values)?; + } else { + // Id: all values present; Delta+VarInt compresses sequential OSM IDs well. + buf.push(ColType::Id as u8); + let values: Vec = ids.iter().map(|id| id.unwrap_or(0)).collect(); + write_u64_stream_best(buf, &values)?; + } + + Ok(()) +} + +// ── Property column ─────────────────────────────────────────────────────────── + +fn write_property_column( + buf: &mut Vec, + name: &str, + values: &[&PropValue], + feature_count: usize, + allow_fsst: bool, +) -> MltResult<()> { + debug_assert_eq!(values.len(), feature_count); + + let any_null = values.iter().any(|v| is_null(v)); + + // String columns: choose the smallest available encoding. + if matches!(values[0], PropValue::Str(_)) { + let strings: Vec> = values + .iter() + .map(|v| match v { + PropValue::Str(Some(s)) => Some(s.as_str()), + _ => None, + }) + .collect(); + + let plain_bytes = build_string_plain_data(&strings, any_null)?; + let dict_bytes = build_string_dict_data(&strings, any_null)?; + let fsst_bytes = if allow_fsst { + build_string_fsst_data(&strings, any_null)? + } else { + Vec::new() + }; + + // Choose whichever encoding yields the fewest bytes, then write once. + // Dict encoding uses index 0 for null — no separate presence bitfield. + let best_size = if fsst_bytes.is_empty() { + plain_bytes.len().min(dict_bytes.len()) + } else { + plain_bytes + .len() + .min(dict_bytes.len()) + .min(fsst_bytes.len()) + }; + + let (col_type, data, with_presence) = + if !fsst_bytes.is_empty() && best_size == fsst_bytes.len() { + ( + if any_null { + ColType::OptStrFsst + } else { + ColType::StrFsst + }, + &fsst_bytes, + any_null, + ) + } else if best_size == dict_bytes.len() { + ( + if any_null { + ColType::OptStrDict + } else { + ColType::StrDict + }, + &dict_bytes, + false, + ) + } else { + ( + if any_null { + ColType::OptStr + } else { + ColType::Str + }, + &plain_bytes, + any_null, + ) + }; + + buf.push(col_type as u8); + buf.write_string(name)?; + if with_presence { + let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); + write_presence(buf, &presence); + } + buf.extend_from_slice(data); + return Ok(()); + } + + let col_type = prop_column_type(values[0], any_null); + buf.push(col_type as u8); + buf.write_string(name)?; + + if any_null { + let presence: Vec = values.iter().map(|v| !is_null(v)).collect(); + write_presence(buf, &presence); + } + + write_prop_data(buf, values, any_null) +} + +fn is_null(v: &PropValue) -> bool { + match v { + PropValue::Bool(o) => o.is_none(), + PropValue::I8(o) => o.is_none(), + PropValue::U8(o) => o.is_none(), + PropValue::I32(o) => o.is_none(), + PropValue::U32(o) => o.is_none(), + PropValue::I64(o) => o.is_none(), + PropValue::U64(o) => o.is_none(), + PropValue::F32(o) => o.is_none(), + PropValue::F64(o) => o.is_none(), + PropValue::Str(o) => o.is_none(), + } +} + +fn prop_column_type(sample: &PropValue, any_null: bool) -> ColType { + let (non_opt, opt) = match sample { + PropValue::Bool(_) => (ColType::Bool, ColType::OptBool), + PropValue::I8(_) => (ColType::I8, ColType::OptI8), + PropValue::U8(_) => (ColType::U8, ColType::OptU8), + PropValue::I32(_) => (ColType::I32, ColType::OptI32), + PropValue::U32(_) => (ColType::U32, ColType::OptU32), + PropValue::I64(_) => (ColType::I64, ColType::OptI64), + PropValue::U64(_) => (ColType::U64, ColType::OptU64), + PropValue::F32(_) => (ColType::F32, ColType::OptF32), + PropValue::F64(_) => (ColType::F64, ColType::OptF64), + PropValue::Str(_) => (ColType::Str, ColType::OptStr), + }; + if any_null { opt } else { non_opt } +} + +fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> MltResult<()> { + match values[0] { + PropValue::Bool(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_bool(v) as u8) + .collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::I8(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| i8::encode(prop_i8(v)) as u8) + .collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::U8(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u8(v)) + .collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::I32(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_i32(v)) + .collect(); + write_i32_stream_implicit(buf, &data)?; + } + PropValue::U32(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u32(v)) + .collect(); + write_u32_stream_implicit(buf, &data)?; + } + PropValue::I64(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_i64(v)) + .collect(); + if !any_null { + write_i64_stream_implicit(buf, &data)?; + } else { + write_i64_stream_as_explicit(buf, &data)?; + } + } + PropValue::U64(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u64(v)) + .collect(); + write_u64_stream_implicit(buf, &data)?; + } + PropValue::F32(_) => { + let mut data = Vec::new(); + for v in values.iter().filter(|v| !is_null(v)) { + data.extend_from_slice(&prop_f32(v).to_le_bytes()); + } + write_raw_bytes(buf, &data)?; + } + PropValue::F64(_) => { + let mut data = Vec::new(); + for v in values.iter().filter(|v| !is_null(v)) { + data.extend_from_slice(&prop_f64(v).to_le_bytes()); + } + write_raw_bytes(buf, &data)?; + } + PropValue::Str(_) => { + // Handled earlier in write_property_column; should not reach here. + unreachable!("Str columns are handled before write_prop_data"); + } + } + Ok(()) +} + +/// Build the bytes for a StrPlain / OptStrPlain column data section. +/// Returns: lengths_stream + raw_bytes_stream (no col_type, no name, no presence). +fn build_string_plain_data(strings: &[Option<&str>], _any_null: bool) -> MltResult> { + let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); + let lengths: Vec = present.iter().map(|s| s.len() as u32).collect(); + let all_bytes: Vec = present.iter().flat_map(|s| s.as_bytes()).copied().collect(); + + let mut buf = Vec::new(); + // Lengths stream: count = len(present), encoded with implicit count. + write_u32_stream_implicit(&mut buf, &lengths)?; + // String data: raw bytes with explicit byte count. + write_raw_bytes(&mut buf, &all_bytes)?; + Ok(buf) +} + +/// Build the bytes for a StrDict / OptStrDict column data section. +/// +/// Wire layout: `dict_lengths_stream | dict_data_stream | indices_stream` +/// +/// - `dict_lengths_stream`: VarInt, explicit count = number of unique values. +/// - `dict_data_stream`: raw bytes, explicit count = total UTF-8 bytes in dict. +/// - `indices_stream`: VarInt, implicit count = feature_count. +/// - `OptStrDict`: index 0 = null; indices 1..N map to dict entries 0..N-1. +/// - `StrDict`: indices 0..N-1 map to dict entries directly. +fn build_string_dict_data(strings: &[Option<&str>], any_null: bool) -> MltResult> { + // Build ordered dictionary (preserves first-occurrence order). + let mut dict: Vec<&str> = Vec::new(); + let mut dict_index: HashMap<&str, u32> = HashMap::new(); + + for s in strings.iter().filter_map(|s| *s) { + if !dict_index.contains_key(s) { + let idx = dict.len() as u32; + dict.push(s); + dict_index.insert(s, idx); + } + } + + // Build per-feature index stream. + let mut indices: Vec = Vec::with_capacity(strings.len()); + for s in strings { + match s { + None => { + // OptStrDict: null = index 0; StrDict should not have nulls. + indices.push(0); + } + Some(s) => { + let dict_pos = dict_index[*s]; + if any_null { + // Shift by 1: index 0 reserved for null. + indices.push(dict_pos + 1); + } else { + indices.push(dict_pos); + } + } + } + } + + let dict_lengths: Vec = dict.iter().map(|s| s.len() as u32).collect(); + let dict_bytes: Vec = dict.iter().flat_map(|s| s.as_bytes()).copied().collect(); + + let mut buf = Vec::new(); + // dict_lengths: explicit count = number of dict entries. + write_u32_stream_explicit(&mut buf, &dict_lengths)?; + // dict_data: raw bytes. + write_raw_bytes(&mut buf, &dict_bytes)?; + // indices: implicit count = feature_count. + write_u32_stream_implicit(&mut buf, &indices)?; + Ok(buf) +} + +/// Build the bytes for a StrFsst / OptStrFsst column data section. +/// +/// Wire layout: `sym_lengths_stream | sym_bytes_stream | val_lengths_stream | corpus_stream` +/// +/// - `sym_lengths_stream`: VarInt, explicit count = number of symbols. +/// - `sym_bytes_stream`: raw bytes, explicit count = total symbol bytes. +/// - `val_lengths_stream`: VarInt, implicit count = popcount (or feature_count if non-optional). +/// - `corpus_stream`: raw bytes, explicit count = compressed corpus size. +fn build_string_fsst_data(strings: &[Option<&str>], _any_null: bool) -> MltResult> { + let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); + // FSST requires at least one string to train on. + if present.is_empty() { + // Fall back to an empty structure that the decoder can handle. + let mut buf = Vec::new(); + write_u32_stream_explicit(&mut buf, &[])?; // 0 symbols + write_raw_bytes(&mut buf, &[])?; // 0 symbol bytes + write_u32_stream_implicit(&mut buf, &[])?; // 0 lengths + write_raw_bytes(&mut buf, &[])?; // 0 corpus bytes + return Ok(buf); + } + + let raw = compress_fsst(&present); + + let mut buf = Vec::new(); + write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; + write_raw_bytes(&mut buf, &raw.symbol_bytes)?; + write_u32_stream_implicit(&mut buf, &raw.value_lengths)?; + write_raw_bytes(&mut buf, &raw.corpus)?; + Ok(buf) +} + +// ── Shared-dictionary encoding ──────────────────────────────────────────────── + +/// Build the per-feature index stream for one child of a shared-dict group. +/// +/// - Non-optional child: index `k` → dict entry `k` (0-based). +/// - Optional child: index `0` = null; index `k` (k ≥ 1) → dict entry `k-1`. +fn build_item_indices( + item: &StagedSharedDictItem, + span_to_idx: &HashMap<(u32, u32), u32>, + optional: bool, +) -> Vec { + let mut span_iter = item.dense_spans(); + item.presence_bools() + .map(|present| { + if present { + let span = span_iter + .next() + .expect("v2 SharedDict: presence/dense mismatch"); + let dict_idx = *span_to_idx + .get(&span) + .expect("v2 SharedDict: span not in dict"); + if optional { dict_idx + 1 } else { dict_idx } + } else { + 0 // null slot (only reachable when `optional`) + } + }) + .collect() +} + +/// Build the encoded bytes for a plain-corpus shared-dict column (`COL_STR_SHARED_DICT`). +/// +/// Wire layout: +/// ```text +/// [u8: COL_STR_SHARED_DICT] +/// [varint: prefix_len] [prefix bytes] +/// [varint: child_count] +/// [dict_lengths: ENC_VARINT_EXPL, count = dict_entry_count, body] +/// [dict_data: ENC_RAW_EXPL, count = total UTF-8 bytes] +/// for each child: +/// [u8: child_flags] bit 0 = optional +/// [varint: suffix_len] [suffix bytes] +/// [indices: ENC_VARINT, implicit count = feature_count, body] +/// non-optional: 0..N-1 → dict index +/// optional: 0 = null, 1..N → dict index + 1 +/// ``` +fn build_shared_dict_plain( + prefix: &str, + items: &[StagedSharedDictItem], + dict_strings: &[&str], + span_to_idx: &HashMap<(u32, u32), u32>, +) -> MltResult> { + let mut buf = Vec::new(); + buf.push(ColType::StrSharedDict as u8); + buf.write_string(prefix)?; + buf.write_varint(items.len() as u32)?; + + let dict_lengths: Vec = dict_strings.iter().map(|s| s.len() as u32).collect(); + let dict_bytes: Vec = dict_strings + .iter() + .flat_map(|s| s.as_bytes()) + .copied() + .collect(); + write_u32_stream_explicit(&mut buf, &dict_lengths)?; + write_raw_bytes(&mut buf, &dict_bytes)?; + + for item in items { + let optional = item.has_presence(); + buf.push(if optional { CHILD_OPTIONAL } else { 0 }); + buf.write_string(&item.suffix)?; + let indices = build_item_indices(item, span_to_idx, optional); + write_u32_stream_implicit(&mut buf, &indices)?; + } + + Ok(buf) +} + +/// Build the encoded bytes for a FSST-corpus shared-dict column (`COL_STR_SHARED_DICT_FSST`). +/// +/// Same as [`build_shared_dict_plain`] but the corpus section becomes: +/// ```text +/// [sym_lengths: ENC_VARINT_EXPL] [sym_bytes: ENC_RAW_EXPL] +/// [val_lengths: ENC_VARINT_EXPL, count = dict_entry_count] [corpus: ENC_RAW_EXPL] +/// ``` +/// Per-child index semantics are identical to the plain variant. +fn build_shared_dict_fsst( + prefix: &str, + items: &[StagedSharedDictItem], + dict_strings: &[&str], + span_to_idx: &HashMap<(u32, u32), u32>, +) -> MltResult> { + let raw = compress_fsst(dict_strings); + + let mut buf = Vec::new(); + buf.push(ColType::StrSharedDictFsst as u8); + buf.write_string(prefix)?; + buf.write_varint(items.len() as u32)?; + + write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; + write_raw_bytes(&mut buf, &raw.symbol_bytes)?; + write_u32_stream_explicit(&mut buf, &raw.value_lengths)?; + write_raw_bytes(&mut buf, &raw.corpus)?; + + for item in items { + let optional = item.has_presence(); + buf.push(if optional { CHILD_OPTIONAL } else { 0 }); + buf.write_string(&item.suffix)?; + let indices = build_item_indices(item, span_to_idx, optional); + write_u32_stream_implicit(&mut buf, &indices)?; + } + + Ok(buf) +} + +/// Encode a shared-dict group, automatically choosing plain vs FSST corpus. +/// +/// Returns the encoded bytes for a single shared-dict column entry. The caller +/// should compare this against individually-encoded columns and use whichever is +/// smaller (see [`encode_v2_with_sort`]). +/// +/// When `allow_fsst` is false only the plain-corpus variant is tried. +fn encode_shared_dict_chunk( + group: &StringGroup, + features: &[TileFeature], + allow_fsst: bool, +) -> MltResult> { + // Extract per-column string values in stable column-index order. + let mut order: Vec = (0..group.columns.len()).collect(); + order.sort_by_key(|&i| group.columns[i].1); + + let columns: Vec<(String, Vec>)> = order + .iter() + .map(|&i| { + let (suffix, col_idx) = &group.columns[i]; + let values: Vec> = features + .iter() + .map(|f| match f.properties.get(*col_idx) { + Some(PropValue::Str(Some(s))) => Some(s.clone()), + _ => None, + }) + .collect(); + (suffix.clone(), values) + }) + .collect(); + + // Build the shared corpus (dedup strings across all children into one buffer). + let shared_dict = StagedSharedDict::new( + group.prefix.clone(), + columns + .iter() + .map(|(s, v)| (s.as_str(), v.iter().map(|o| o.as_deref()))), + )?; + + // Unique dictionary entries (sorted, deduped spans → stable dict order). + let dict_spans = { + let mut s: Vec<(u32, u32)> = shared_dict + .items + .iter() + .flat_map(|item| item.dense_spans()) + .collect(); + s.sort_unstable(); + s.dedup(); + s + }; + let dict_strings: Vec<&str> = dict_spans + .iter() + .map(|&(s, e)| { + shared_dict + .get((s, e)) + .ok_or(MltError::NotImplemented("v2: shared dict span OOB")) + }) + .collect::>>()?; + let span_to_idx: HashMap<(u32, u32), u32> = dict_spans + .iter() + .copied() + .enumerate() + .map(|(i, span)| (span, i as u32)) + .collect(); + + let plain_buf = build_shared_dict_plain( + &group.prefix, + &shared_dict.items, + &dict_strings, + &span_to_idx, + )?; + + if allow_fsst && !dict_strings.is_empty() { + let fsst_buf = build_shared_dict_fsst( + &group.prefix, + &shared_dict.items, + &dict_strings, + &span_to_idx, + )?; + if fsst_buf.len() < plain_buf.len() { + return Ok(fsst_buf); + } + } + + Ok(plain_buf) +} + +// ── VarInt stream helpers for optional columns (count = popcount) ───────────── +// These reuse the same wire format as their _implicit counterparts; the count is +// determined by the decoder from context (feature_count or popcount). + +fn write_i64_stream_as_explicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { + write_i64_stream_implicit(buf, values) +} + +// ── PropValue field accessors ───────────────────────────────────────────────── + +fn prop_bool(v: &PropValue) -> bool { + if let PropValue::Bool(Some(b)) = v { + *b + } else { + false + } +} +fn prop_i8(v: &PropValue) -> i8 { + if let PropValue::I8(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_u8(v: &PropValue) -> u8 { + if let PropValue::U8(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_i32(v: &PropValue) -> i32 { + if let PropValue::I32(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_u32(v: &PropValue) -> u32 { + if let PropValue::U32(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_i64(v: &PropValue) -> i64 { + if let PropValue::I64(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_u64(v: &PropValue) -> u64 { + if let PropValue::U64(Some(b)) = v { + *b + } else { + 0 + } +} +fn prop_f32(v: &PropValue) -> f32 { + if let PropValue::F32(Some(b)) = v { + *b + } else { + 0.0 + } +} +fn prop_f64(v: &PropValue) -> f64 { + if let PropValue::F64(Some(b)) = v { + *b + } else { + 0.0 + } +} + +// ── Decoder ─────────────────────────────────────────────────────────────────── + +/// Decode an MLT **v2** layer body (the bytes after the `tag=2` byte). +/// +/// Returns a [`TileLayer01`] that can be used by the rest of the MLT tooling. +pub fn decode_v2_layer(data: &[u8]) -> MltResult { + // ── Header ──────────────────────────────────────────────────────────────── + let (data, name) = parse_string(data)?; + let name = name.to_string(); + + let (data, extent) = parse_varint::(data)?; + let (data, feature_count) = parse_varint::(data)?; + let feature_count = feature_count as usize; + + // ── Geometry section ────────────────────────────────────────────────────── + if data.is_empty() { + return Err(MltError::MissingGeometry); + } + let layout = GeoLayout::try_from(data[0])?; + let data = &data[1..]; + + let (data, geometries) = decode_geometry(data, layout, feature_count)?; + + // ── Column count ────────────────────────────────────────────────────────── + let (mut data, col_count) = parse_varint::(data)?; + + // ── Columns ─────────────────────────────────────────────────────────────── + let mut feature_ids: Vec> = vec![None; feature_count]; + let mut property_names: Vec = Vec::new(); + let mut property_columns: Vec> = Vec::new(); + let mut presence_groups: Vec> = Vec::new(); + + for _ in 0..col_count { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + let col_type = ColType::try_from(data[0])?; + data = &data[1..]; + + match col_type { + ColType::Id => { + let (new_data, ids) = decode_id_column(data, feature_count)?; + data = new_data; + for (i, id) in ids.into_iter().enumerate() { + feature_ids[i] = Some(id); + } + } + ColType::OptId => { + let (new_data, presence, ids) = decode_opt_id_column(data, feature_count)?; + data = new_data; + presence_groups.push(presence.clone()); + let mut iter = ids.into_iter(); + for (i, &present) in presence.iter().enumerate() { + if present { + feature_ids[i] = iter.next(); + } + } + } + ColType::Bool + | ColType::OptBool + | ColType::I8 + | ColType::OptI8 + | ColType::U8 + | ColType::OptU8 + | ColType::I32 + | ColType::OptI32 + | ColType::U32 + | ColType::OptU32 + | ColType::I64 + | ColType::OptI64 + | ColType::U64 + | ColType::OptU64 + | ColType::F32 + | ColType::OptF32 + | ColType::F64 + | ColType::OptF64 + | ColType::Str + | ColType::OptStr => { + let (new_data, col_name, col_values) = + decode_property_column(data, col_type, feature_count, &mut presence_groups)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + ColType::StrDict | ColType::OptStrDict => { + let (new_data, col_name, col_values) = + decode_string_dict_column(data, col_type, feature_count)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + ColType::StrFsst | ColType::OptStrFsst => { + let (new_data, col_name, col_values) = + decode_string_fsst_column(data, col_type, feature_count)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + ColType::StrSharedDict | ColType::StrSharedDictFsst => { + let (new_data, columns) = decode_shared_dict_v2(data, col_type, feature_count)?; + data = new_data; + for (col_name, col_values) in columns { + property_names.push(col_name); + property_columns.push(col_values); + } + } + } + } + + // ── Assemble TileLayer01 ────────────────────────────────────────────────── + let prop_count = property_names.len(); + let features = geometries + .into_iter() + .enumerate() + .map(|(i, geom)| { + let id = feature_ids[i]; + let properties = (0..prop_count) + .map(|j| property_columns[j][i].clone()) + .collect(); + TileFeature { + id, + geometry: geom, + properties, + } + }) + .collect(); + + Ok(TileLayer01 { + name, + extent, + property_names, + features, + }) +} + +// ── Geometry decoding ───────────────────────────────────────────────────────── + +fn decode_geometry<'a>( + data: &'a [u8], + layout: GeoLayout, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec>)> { + // Types stream: count = feature_count (implicit) + let (data, type_vals) = read_u32_stream(data, feature_count, false)?; + let geo_types: Vec = type_vals + .iter() + .map(|&v| GeometryType::try_from(v as u8).map_err(MltError::from)) + .collect::>()?; + + // Optional auxiliary streams, depending on layout + let (data, geo_lengths) = if matches!( + layout, + GeoLayout::MultiPoints | GeoLayout::MultiLines | GeoLayout::MultiPolygons + ) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + let (data, part_lengths) = if matches!( + layout, + GeoLayout::Lines | GeoLayout::MultiLines | GeoLayout::Polygons | GeoLayout::MultiPolygons + ) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + let (data, ring_lengths) = if matches!(layout, GeoLayout::Polygons | GeoLayout::MultiPolygons) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + // Vertices stream: explicit count (vertex pairs) + let (data, vertices) = read_cwdelta_stream(data)?; + + let geometries = reconstruct_geometries( + layout, + &geo_types, + geo_lengths.as_deref(), + part_lengths.as_deref(), + ring_lengths.as_deref(), + &vertices, + )?; + + Ok((data, geometries)) +} + +fn reconstruct_geometries( + layout: GeoLayout, + types: &[GeometryType], + geo_lengths: Option<&[u32]>, + part_lengths: Option<&[u32]>, + ring_lengths: Option<&[u32]>, + vertices: &[i32], +) -> MltResult>> { + let mut geoms = Vec::with_capacity(types.len()); + let mut vert_pos: usize = 0; // current position in vertices (counting i32 elements) + + match layout { + GeoLayout::Points => { + for _ in types { + let x = vertices[vert_pos]; + let y = vertices[vert_pos + 1]; + vert_pos += 2; + geoms.push(Geometry::Point(Point(Coord { x, y }))); + } + } + GeoLayout::MultiPoints => { + let geo_lens = geo_lengths.expect("MultiPoints requires GeoLengths"); + for &geo_len in geo_lens { + let pts: Vec> = (0..geo_len as usize) + .map(|_| { + let x = vertices[vert_pos]; + let y = vertices[vert_pos + 1]; + vert_pos += 2; + Point(Coord { x, y }) + }) + .collect(); + geoms.push(Geometry::MultiPoint(MultiPoint(pts))); + } + } + GeoLayout::Lines => { + let part_lens = part_lengths.expect("Lines requires PartLengths"); + for &part_len in part_lens { + let coords: Vec> = (0..part_len as usize) + .map(|_| { + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; + vert_pos += 2; + c + }) + .collect(); + geoms.push(Geometry::LineString(LineString(coords))); + } + } + GeoLayout::MultiLines => { + let geo_lens = geo_lengths.expect("MultiLines requires GeoLengths"); + let part_lens = part_lengths.expect("MultiLines requires PartLengths"); + let mut part_idx = 0; + for &geo_len in geo_lens { + let lines: Vec> = (0..geo_len as usize) + .map(|_| { + let part_len = part_lens[part_idx] as usize; + part_idx += 1; + let coords: Vec> = (0..part_len) + .map(|_| { + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; + vert_pos += 2; + c + }) + .collect(); + LineString(coords) + }) + .collect(); + geoms.push(Geometry::MultiLineString(MultiLineString(lines))); + } + } + GeoLayout::Polygons => { + let part_lens = part_lengths.expect("Polygons requires PartLengths"); + let ring_lens = ring_lengths.expect("Polygons requires RingLengths"); + let mut ring_idx = 0; + for &part_len in part_lens { + let rings: Vec> = (0..part_len as usize) + .map(|_| { + let ring_len = ring_lens[ring_idx] as usize; + ring_idx += 1; + let mut coords: Vec> = (0..ring_len) + .map(|_| { + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; + vert_pos += 2; + c + }) + .collect(); + // Close the ring (MLT omits the closing vertex) + if let Some(&first) = coords.first() { + coords.push(first); + } + LineString(coords) + }) + .collect(); + let mut ring_iter = rings.into_iter(); + let exterior = ring_iter.next().unwrap_or_else(|| LineString(vec![])); + let holes: Vec> = ring_iter.collect(); + geoms.push(Geometry::Polygon(Polygon::new(exterior, holes))); + } + } + GeoLayout::MultiPolygons => { + let geo_lens = geo_lengths.expect("MultiPolygons requires GeoLengths"); + let part_lens = part_lengths.expect("MultiPolygons requires PartLengths"); + let ring_lens = ring_lengths.expect("MultiPolygons requires RingLengths"); + let mut part_idx = 0; + let mut ring_idx = 0; + for &geo_len in geo_lens { + let polys: Vec> = (0..geo_len as usize) + .map(|_| { + let part_len = part_lens[part_idx] as usize; + part_idx += 1; + let rings: Vec> = (0..part_len) + .map(|_| { + let ring_len = ring_lens[ring_idx] as usize; + ring_idx += 1; + let mut coords: Vec> = (0..ring_len) + .map(|_| { + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; + vert_pos += 2; + c + }) + .collect(); + if let Some(&first) = coords.first() { + coords.push(first); + } + LineString(coords) + }) + .collect(); + let mut ring_iter = rings.into_iter(); + let exterior = ring_iter.next().unwrap_or_else(|| LineString(vec![])); + Polygon::new(exterior, ring_iter.collect()) + }) + .collect(); + geoms.push(Geometry::MultiPolygon(MultiPolygon(polys))); + } + } + } + + Ok(geoms) +} + +// ── Stream read helpers ─────────────────────────────────────────────────────── + +/// Read one encoding byte and return the recognised `Enc` variant. +fn read_enc(data: &[u8]) -> MltResult<(&[u8], Enc)> { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + Ok((&data[1..], Enc::try_from(data[0])?)) +} + +/// Read a VarInt (or RLE) u32 stream. +/// +/// If `explicit` is true, read the count from the stream; otherwise use `implicit_count`. +/// Supports: +/// - `logical=0` (None) + `physical=2` (VarInt): plain VarInt stream. +/// - `logical=3` (Rle): byte_length always present; data is `(run_len, value)` VarInt pairs. +fn read_u32_stream( + data: &[u8], + implicit_count: usize, + explicit: bool, +) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + // ── RLE ────────────────────────────────────────────────────────────────── + if enc == Enc::Rle { + // byte_length always follows for RLE; no separate count field. + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(if explicit { implicit_count } else { 16 }); + let mut pos = 0; + while pos < byte_len { + let (run_len, c1) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + pos += c1; + let (val, c2) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + pos += c2; + for _ in 0..run_len { + values.push(val); + } + } + return Ok((remaining, values)); + } + + // ── VarInt ─────────────────────────────────────────────────────────────── + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { + return Err(MltError::NotImplemented( + "v2 decoder: unsupported encoding for u32 stream", + )); + } + + let (data, count) = if enc == Enc::VarIntExpl || explicit { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + } else { + (data, implicit_count) + }; + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + values.push(v); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a raw-bytes stream (physical=None-noLen, logical=None, explicit count = byte count). +fn read_raw_bytes_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + if enc != Enc::RawExpl { + return Err(MltError::NotImplemented( + "v2 decoder: unexpected encoding byte for raw bytes stream", + )); + } + + let (data, count) = parse_varint::(data)?; + let count = count as usize; + if data.len() < count { + return Err(MltError::BufferUnderflow(count as u32, data.len())); + } + Ok((&data[count..], data[..count].to_vec())) +} + +/// Read a ZigZag+VarInt i32 stream (count from context = explicit). +fn read_i32_stream( + data: &[u8], + implicit_count: usize, + _has_any_null: bool, +) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt/ZigZag supported for i32 stream", + )); + } + + let (data, count) = if enc == Enc::VarIntExpl { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + } else { + (data, implicit_count) + }; + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + values.push(i32::decode(v)); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a ZigZag+VarInt i64 stream. +fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt/ZigZag supported for i64 stream", + )); + } + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(implicit_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u64::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(8, encoded.len() - pos))?; + values.push(i64::decode(v)); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a VarInt (or Delta+VarInt) u64 stream. +/// +/// Supports: +/// - `logical=0` (None) + `physical=2` (VarInt): plain VarInt. +/// - `logical=1` (Delta) + `physical=2` (VarInt): VarInt-encoded deltas, prefix-sum to recover. +fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + if !matches!(enc, Enc::VarInt | Enc::DeltaVarInt) { + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt / Delta+VarInt supported for u64 stream", + )); + } + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(implicit_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u64::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(8, encoded.len() - pos))?; + values.push(v); + pos += consumed; + } + + // Apply prefix-sum to undo delta encoding. + if enc == Enc::DeltaVarInt { + let mut acc = 0u64; + for v in &mut values { + acc = acc.wrapping_add(*v); + *v = acc; + } + } + + Ok((remaining, values)) +} + +/// Read a ComponentwiseDelta + (VarInt or FastPFor128) vertex stream (explicit count = vertex pairs). +fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { + let (data, enc) = read_enc(data)?; + + if !matches!(enc, Enc::CwDeltaVarInt | Enc::CwDeltaFp128) { + return Err(MltError::NotImplemented( + "v2 decoder: only CwDelta+VarInt or CwDelta+FastPFor128 supported for vertex stream", + )); + } + + // Both CwDelta variants carry an explicit pair count. + let (data, pair_count) = { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + }; + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let coord_count = pair_count * 2; + + let u32_vals: Vec = if enc == Enc::CwDeltaFp128 { + // FastPFor128: bytes are LE u32 words + if !byte_len.is_multiple_of(4) { + return Err(MltError::NotImplemented( + "v2 FP128: byte length not multiple of 4", + )); + } + let words: Vec = encoded + .chunks_exact(4) + .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect(); + let mut result = Vec::with_capacity(coord_count + 128); + FastPFor128::default() + .decode(&words, &mut result, Some(coord_count as u32)) + .map_err(|_| MltError::NotImplemented("v2: FastPFor128 decode error"))?; + result.truncate(coord_count); + result + } else { + // VarInt + let mut vals = Vec::with_capacity(coord_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + vals.push(v); + pos += consumed; + } + vals + }; + + // Apply inverse CwDelta + let mut result = Vec::with_capacity(coord_count); + let mut last_x: i32 = 0; + let mut last_y: i32 = 0; + for chunk in u32_vals.chunks_exact(2) { + last_x = last_x.wrapping_add(i32::decode(chunk[0])); + last_y = last_y.wrapping_add(i32::decode(chunk[1])); + result.push(last_x); + result.push(last_y); + } + + Ok((remaining, result)) +} + +// ── Presence bitfield ───────────────────────────────────────────────────────── + +/// Read `ceil(feature_count / 8)` bytes of presence bitfield and return a `Vec`. +fn read_presence(data: &[u8], feature_count: usize) -> MltResult<(&[u8], Vec)> { + let byte_count = feature_count.div_ceil(8); + if data.len() < byte_count { + return Err(MltError::BufferUnderflow(byte_count as u32, data.len())); + } + let mut presence = Vec::with_capacity(feature_count); + for i in 0..feature_count { + presence.push((data[i / 8] >> (i % 8)) & 1 == 1); + } + Ok((&data[byte_count..], presence)) +} + +// ── ID column decoding ──────────────────────────────────────────────────────── + +fn decode_id_column<'a>(data: &'a [u8], feature_count: usize) -> MltResult<(&'a [u8], Vec)> { + read_u64_stream(data, feature_count) +} + +fn decode_opt_id_column<'a>( + data: &'a [u8], + feature_count: usize, +) -> MltResult<(&'a [u8], Vec, Vec)> { + let (data, presence) = read_presence(data, feature_count)?; + let popcount = presence.iter().filter(|&&p| p).count(); + let (data, ids) = read_u64_stream(data, popcount)?; + Ok((data, presence, ids)) +} + +// ── Property column decoding ────────────────────────────────────────────────── + +fn decode_property_column<'a>( + data: &'a [u8], + col_type: ColType, + feature_count: usize, + presence_groups: &mut Vec>, +) -> MltResult<(&'a [u8], String, Vec)> { + let (data, name) = parse_string(data)?; + let name = name.to_string(); + + let is_optional = matches!( + col_type, + ColType::OptBool + | ColType::OptI8 + | ColType::OptU8 + | ColType::OptI32 + | ColType::OptU32 + | ColType::OptI64 + | ColType::OptU64 + | ColType::OptF32 + | ColType::OptF64 + | ColType::OptStr + ); + + let (data, presence) = if is_optional { + let (d, p) = read_presence(data, feature_count)?; + presence_groups.push(p.clone()); + (d, Some(p)) + } else { + (data, None) + }; + + let popcount = presence + .as_ref() + .map_or(feature_count, |p| p.iter().filter(|&&b| b).count()); + + let (data, values) = decode_column_data(data, col_type, feature_count, popcount, &presence)?; + + Ok((data, name, values)) +} + +fn decode_column_data<'a>( + data: &'a [u8], + col_type: ColType, + feature_count: usize, + popcount: usize, + presence: &Option>, +) -> MltResult<(&'a [u8], Vec)> { + match col_type { + ColType::Bool | ColType::OptBool => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes.iter().map(|&b| PropValue::Bool(Some(b != 0))), + presence, + feature_count, + PropValue::Bool(None), + ); + Ok((data, values)) + } + ColType::I8 | ColType::OptI8 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes + .iter() + .map(|&b| PropValue::I8(Some(i8::decode(b as u32 as u8)))), + presence, + feature_count, + PropValue::I8(None), + ); + Ok((data, values)) + } + ColType::U8 | ColType::OptU8 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes.into_iter().map(|b| PropValue::U8(Some(b))), + presence, + feature_count, + PropValue::U8(None), + ); + Ok((data, values)) + } + ColType::I32 | ColType::OptI32 => { + let any_null = col_type == ColType::OptI32; + let (data, ints) = read_i32_stream(data, popcount, any_null)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::I32(Some(v))), + presence, + feature_count, + PropValue::I32(None), + ); + Ok((data, values)) + } + ColType::U32 | ColType::OptU32 => { + let (data, ints) = read_u32_stream(data, popcount, false)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::U32(Some(v))), + presence, + feature_count, + PropValue::U32(None), + ); + Ok((data, values)) + } + ColType::I64 | ColType::OptI64 => { + let (data, ints) = read_i64_stream(data, popcount)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::I64(Some(v))), + presence, + feature_count, + PropValue::I64(None), + ); + Ok((data, values)) + } + ColType::U64 | ColType::OptU64 => { + let (data, ints) = read_u64_stream(data, popcount)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::U64(Some(v))), + presence, + feature_count, + PropValue::U64(None), + ); + Ok((data, values)) + } + ColType::F32 | ColType::OptF32 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let floats: Vec = bytes + .chunks_exact(4) + .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect(); + let values = expand_with_presence( + floats.into_iter().map(|v| PropValue::F32(Some(v))), + presence, + feature_count, + PropValue::F32(None), + ); + Ok((data, values)) + } + ColType::F64 | ColType::OptF64 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let floats: Vec = bytes + .chunks_exact(8) + .map(|c| f64::from_le_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]])) + .collect(); + let values = expand_with_presence( + floats.into_iter().map(|v| PropValue::F64(Some(v))), + presence, + feature_count, + PropValue::F64(None), + ); + Ok((data, values)) + } + ColType::Str | ColType::OptStr => { + decode_string_column(data, popcount, presence, feature_count) + } + // These variants are dispatched before reaching this function. + ColType::Id + | ColType::OptId + | ColType::StrDict + | ColType::OptStrDict + | ColType::StrFsst + | ColType::OptStrFsst + | ColType::StrSharedDict + | ColType::StrSharedDictFsst => { + unreachable!("ColType::{col_type:?} is handled before decode_column_data") + } + } +} + +fn decode_string_column<'a>( + data: &'a [u8], + popcount: usize, + presence: &Option>, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec)> { + // Lengths stream (count = popcount, implicit) + let (data, lengths) = read_u32_stream(data, popcount, false)?; + // String data (explicit byte count) + let (data, str_bytes) = read_raw_bytes_stream(data)?; + + // Reconstruct strings from lengths + let mut strings: Vec> = Vec::with_capacity(popcount); + let mut offset = 0; + for &len in &lengths { + let len = len as usize; + let s = std::str::from_utf8(&str_bytes[offset..offset + len])?.to_string(); + strings.push(Some(s)); + offset += len; + } + + let values = expand_with_presence( + strings.into_iter().map(|s| PropValue::Str(s)), + presence, + feature_count, + PropValue::Str(None), + ); + Ok((data, values)) +} + +/// Decode a `StrFsst` or `OptStrFsst` column. +/// +/// Wire layout: `[presence?] | sym_lengths | sym_bytes | val_lengths | corpus` +fn decode_string_fsst_column<'a>( + data: &'a [u8], + col_type: ColType, + feature_count: usize, +) -> MltResult<(&'a [u8], String, Vec)> { + let optional = col_type == ColType::OptStrFsst; + + let (data, name) = parse_string(data)?; + let name = name.to_string(); + + // presence (only for optional) + let (data, presence) = if optional { + let (d, p) = read_presence(data, feature_count)?; + (d, Some(p)) + } else { + (data, None) + }; + let popcount = presence + .as_ref() + .map(|p| p.iter().filter(|&&x| x).count()) + .unwrap_or(feature_count); + + // val_lengths uses an implicit count (= popcount) in the per-column FSST variant. + let (data, present_strings) = read_fsst_strings(data, popcount, false)?; + + let values = if let Some(pres) = &presence { + let mut result = Vec::with_capacity(feature_count); + let mut iter = present_strings.into_iter(); + for &present in pres { + if present { + result.push(PropValue::Str(iter.next().map(Some).unwrap_or(None))); + } else { + result.push(PropValue::Str(None)); + } + } + result + } else { + present_strings + .into_iter() + .map(|s| PropValue::Str(Some(s))) + .collect() + }; + + Ok((data, name, values)) +} + +/// Decompress an FSST-encoded corpus into individual strings. +fn fsst_decompress( + sym_lengths: &[u32], + sym_bytes: &[u8], + val_lengths: &[u32], + corpus: &[u8], +) -> MltResult> { + // Build per-symbol offset table. + let mut sym_offsets = vec![0usize; sym_lengths.len() + 1]; + for (i, &len) in sym_lengths.iter().enumerate() { + sym_offsets[i + 1] = sym_offsets[i] + len as usize; + } + + // Decompress corpus: 0xFF → literal next byte; else → expand symbol. + let mut output: Vec = Vec::new(); + let mut i = 0; + while i < corpus.len() { + let sym_idx = corpus[i] as usize; + if sym_idx == 255 { + i += 1; + if i < corpus.len() { + output.push(corpus[i]); + } + } else if sym_idx < sym_lengths.len() { + let start = sym_offsets[sym_idx]; + let end = sym_offsets[sym_idx + 1]; + output.extend_from_slice(&sym_bytes[start..end]); + } + i += 1; + } + + // Split by value lengths. + let mut strings = Vec::with_capacity(val_lengths.len()); + let mut offset = 0usize; + for &len in val_lengths { + let end = offset + len as usize; + let s = std::str::from_utf8( + output + .get(offset..end) + .ok_or(MltError::BufferUnderflow(len, output.len() - offset))?, + )? + .to_string(); + strings.push(s); + offset = end; + } + Ok(strings) +} + +/// Decode a `StrDict` or `OptStrDict` column. +/// +/// Wire layout: `dict_lengths_stream | dict_data_stream | indices_stream` +/// +/// - `StrDict`: all features have a value; index `k` → dict entry `k`. +/// - `OptStrDict`: index `0` = null; index `k` (k≥1) → dict entry `k-1`. +fn decode_string_dict_column<'a>( + data: &'a [u8], + col_type: ColType, + feature_count: usize, +) -> MltResult<(&'a [u8], String, Vec)> { + let optional = col_type == ColType::OptStrDict; + + let (data, name) = parse_string(data)?; + let name = name.to_string(); + + // dict_lengths: explicit count = number of dict entries + let (data, dict_lengths) = read_u32_stream(data, 0, true)?; + // dict_data: raw bytes + let (data, dict_bytes) = read_raw_bytes_stream(data)?; + // indices: implicit count = feature_count + let (data, indices) = read_u32_stream(data, feature_count, false)?; + + // Reconstruct dictionary entries from lengths + raw bytes. + let dict = build_strings_from_lengths(&dict_lengths, &dict_bytes)?; + + // Map per-feature indices back to PropValue. + let mut values = Vec::with_capacity(feature_count); + for &idx in &indices { + let pv = if optional { + if idx == 0 { + PropValue::Str(None) + } else { + let entry = dict + .get(idx as usize - 1) + .ok_or(MltError::NotImplemented("v2 StrDict: index out of range"))?; + PropValue::Str(Some(entry.clone())) + } + } else { + let entry = dict + .get(idx as usize) + .ok_or(MltError::NotImplemented("v2 StrDict: index out of range"))?; + PropValue::Str(Some(entry.clone())) + }; + values.push(pv); + } + + Ok((data, name, values)) +} + +/// Decode a `COL_STR_SHARED_DICT` or `COL_STR_SHARED_DICT_FSST` column. +/// +/// Returns a list of `(property_name, per_feature_values)` — one entry per child column in +/// the group — so the caller can append them individually to `property_names` / +/// `property_columns`. +/// +/// Wire layout (plain corpus, `COL_STR_SHARED_DICT`): +/// ```text +/// [varint: prefix_len] [prefix bytes] +/// [varint: child_count] +/// [dict_lengths: ENC_VARINT_EXPL] [dict_data: ENC_RAW_EXPL] +/// for each child: +/// [u8: flags] bit 0 = optional +/// [varint: suffix_len] [suffix bytes] +/// [indices: ENC_VARINT, implicit count = feature_count] +/// ``` +/// FSST variant (`COL_STR_SHARED_DICT_FSST`) replaces the corpus section with: +/// ```text +/// [sym_lengths: ENC_VARINT_EXPL] [sym_bytes: ENC_RAW_EXPL] +/// [val_lengths: ENC_VARINT_EXPL] [corpus: ENC_RAW_EXPL] +/// ``` +fn decode_shared_dict_v2<'a>( + data: &'a [u8], + col_type: ColType, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec<(String, Vec)>)> { + let use_fsst = col_type == ColType::StrSharedDictFsst; + + let (data, prefix) = parse_string(data)?; + let prefix = prefix.to_string(); + + let (data, child_count) = parse_varint::(data)?; + let child_count = child_count as usize; + + // Decode shared dictionary corpus into a Vec (one entry per dict index). + let (data, dict_strings): (&[u8], Vec) = if use_fsst { + // val_lengths has an explicit count in the FSST shared-dict variant. + read_fsst_strings(data, 0, true)? + } else { + let (data, dict_lengths) = read_u32_stream(data, 0, true)?; + let (data, dict_bytes) = read_raw_bytes_stream(data)?; + ( + data, + build_strings_from_lengths(&dict_lengths, &dict_bytes)?, + ) + }; + + let mut result = Vec::with_capacity(child_count); + let mut data = data; + + for _ in 0..child_count { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + let flags = data[0]; + data = &data[1..]; + let optional = (flags & CHILD_OPTIONAL) != 0; + + let (d, suffix) = parse_string(data)?; + let suffix = suffix.to_string(); + data = d; + + let col_name = format!("{prefix}{suffix}"); + + // indices: implicit count = feature_count + let (d, indices) = read_u32_stream(data, feature_count, false)?; + data = d; + + let values: MltResult> = indices + .iter() + .map(|&idx| { + if optional && idx == 0 { + Ok(PropValue::Str(None)) + } else { + let dict_idx = if optional { + idx as usize - 1 + } else { + idx as usize + }; + let s = dict_strings + .get(dict_idx) + .ok_or(MltError::NotImplemented( + "v2 SharedDict: index out of range", + ))? + .clone(); + Ok(PropValue::Str(Some(s))) + } + }) + .collect(); + + result.push((col_name, values?)); + } + + Ok((data, result)) +} + +/// Reconstruct strings from a parallel `(lengths, flat_bytes)` pair. +fn build_strings_from_lengths(lengths: &[u32], bytes: &[u8]) -> MltResult> { + let mut strings = Vec::with_capacity(lengths.len()); + let mut offset = 0usize; + for &len in lengths { + let end = offset + len as usize; + let s = std::str::from_utf8(bytes.get(offset..end).ok_or(MltError::BufferUnderflow( + len, + bytes.len().saturating_sub(offset), + ))?)? + .to_string(); + strings.push(s); + offset = end; + } + Ok(strings) +} + +/// Read FSST-encoded streams and decompress them into plain strings. +/// +/// `val_explicit` controls whether the value-lengths count is read from the stream +/// (`true`) or inferred from `val_count` (`false`). +fn read_fsst_strings<'a>( + data: &'a [u8], + val_count: usize, + val_explicit: bool, +) -> MltResult<(&'a [u8], Vec)> { + let (data, sym_lengths) = read_u32_stream(data, 0, true)?; + let (data, sym_bytes) = read_raw_bytes_stream(data)?; + let (data, val_lengths) = read_u32_stream(data, val_count, val_explicit)?; + let (data, corpus) = read_raw_bytes_stream(data)?; + let strings = fsst_decompress(&sym_lengths, &sym_bytes, &val_lengths, &corpus)?; + Ok((data, strings)) +} + +/// Expand a stream of non-null values back to `feature_count` values by +/// inserting `null_val` for absent features according to the presence bitfield. +fn expand_with_presence>( + present_values: I, + presence: &Option>, + feature_count: usize, + null_val: PropValue, +) -> Vec { + match presence { + None => present_values.collect(), + Some(pres) => { + let mut result = Vec::with_capacity(feature_count); + let mut iter = present_values; + for &present in pres { + if present { + result.push(iter.next().unwrap_or(null_val.clone())); + } else { + result.push(null_val.clone()); + } + } + result + } + } +} diff --git a/rust/mlt-core/tests/unknown_layer.rs b/rust/mlt-core/tests/unknown_layer.rs index c5e06bf8c..64bf53456 100644 --- a/rust/mlt-core/tests/unknown_layer.rs +++ b/rust/mlt-core/tests/unknown_layer.rs @@ -72,9 +72,10 @@ fn unknown_zero_length_body() { #[test] fn multiple_layers_mixed_unknown_and_tag01() { - // Build two unknown layers back-to-back (tags 2 and 3, since tag=1 is Tag01). - let mut raw = unknown_layer_bytes(2, b"hello"); - raw.extend_from_slice(&unknown_layer_bytes(3, b"world")); + // Build two unknown layers back-to-back. Tags 1 and 2 are known (Tag01 and Tag02), + // so use tags 10 and 11 for these "unknown" tests. + let mut raw = unknown_layer_bytes(10, b"hello"); + raw.extend_from_slice(&unknown_layer_bytes(11, b"world")); let layers = Parser::default() .parse_layers(&raw) @@ -85,12 +86,12 @@ fn multiple_layers_mixed_unknown_and_tag01() { let Layer::Unknown(u0) = &layers[0] else { panic!("expected Unknown at index 0"); }; - assert_eq!(u0.tag(), 2u32); + assert_eq!(u0.tag(), 10); assert_eq!(u0.data(), b"hello"); let Layer::Unknown(u1) = &layers[1] else { panic!("expected Unknown at index 1"); }; - assert_eq!(u1.tag(), 3u32); + assert_eq!(u1.tag(), 11); assert_eq!(u1.data(), b"world"); }