diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
index 758917ae4c86..caffbc5adf3f 100644
--- a/COPYRIGHT.txt
+++ b/COPYRIGHT.txt
@@ -683,6 +683,16 @@ Comment: Zstandard
 Copyright: Meta Platforms, Inc. and affiliates.
 License: BSD-3-clause
 
+Files: thirdparty/zxc/*
+Comment: ZXC
+Copyright: 2025-2026, Bertrand Lebonnois and contributors.
+License: BSD-3-clause
+
+Files: thirdparty/zxc/src/lib/vendors/rapidhash.h
+Comment: rapidhash
+Copyright: 2025, Nicolas De Carli
+License: Expat
+
 
 
 License: Apache-2.0
diff --git a/SConstruct b/SConstruct
index d8d930106ba2..ef3c9899efd5 100644
--- a/SConstruct
+++ b/SConstruct
@@ -193,6 +193,7 @@ opts.Add(
 )
 opts.Add(BoolVariable("minizip", "Enable ZIP archive support using minizip", True))
 opts.Add(BoolVariable("brotli", "Enable Brotli for decompression and WOFF2 fonts support", True))
+opts.Add(BoolVariable("zxc", "Enable the ZXC compression codec", True))
 opts.Add(BoolVariable("xaudio2", "Enable the XAudio2 audio driver on supported platforms", False))
 opts.Add(BoolVariable("vulkan", "Enable the Vulkan rendering driver", True))
 opts.Add(BoolVariable("opengl3", "Enable the OpenGL/GLES3 rendering driver", True))
@@ -346,6 +347,7 @@ opts.Add(BoolVariable("builtin_rvo2_3d", "Use the built-in RVO2 3D library", Tru
 opts.Add(BoolVariable("builtin_xatlas", "Use the built-in xatlas library", True))
 opts.Add(BoolVariable("builtin_zlib", "Use the built-in zlib library", True))
 opts.Add(BoolVariable("builtin_zstd", "Use the built-in Zstd library", True))
+opts.Add(BoolVariable("builtin_zxc", "Use the built-in ZXC library", True))
 
 # Compilation environment setup
 # CXX, CC, and LINK directly set the equivalent `env` values (which may still
@@ -1094,6 +1096,8 @@ if env["minizip"]:
     env.Append(CPPDEFINES=["MINIZIP_ENABLED"])
 if env["brotli"]:
     env.Append(CPPDEFINES=["BROTLI_ENABLED"])
+if env["zxc"]:
+    env.Append(CPPDEFINES=["ZXC_ENABLED"])
 
 if not env["disable_overrides"]:
     env.Append(CPPDEFINES=["OVERRIDE_ENABLED"])
diff --git a/core/SCsub b/core/SCsub
index 533bd052258f..357eb7b473ae 100644
--- a/core/SCsub
+++ b/core/SCsub
@@ -161,6 +161,72 @@ if env["builtin_zstd"]:
 
     env_thirdparty.add_source_files(thirdparty_obj, thirdparty_zstd_sources)
 
+# ZXC compression codec. Built from thirdparty/ only when `zxc` and `builtin_zxc` are both enabled.
+if env["zxc"] and env["builtin_zxc"]:
+    thirdparty_zxc_dir = "#thirdparty/zxc/"
+
+    # Include paths: public headers for <zxc.h>, plus the library's own internal
+    # and vendored headers needed by the thirdparty translation units.
+    env_thirdparty.Prepend(
+        CPPPATH=[
+            thirdparty_zxc_dir + "include",
+            thirdparty_zxc_dir + "src/lib",
+            thirdparty_zxc_dir + "src/lib/vendors",
+        ]
+    )
+    # <zxc.h> must also be reachable from core/io/compression.cpp.
+    env.Prepend(CPPPATH=[thirdparty_zxc_dir + "include"])
+
+    env_thirdparty.Append(CPPDEFINES=["ZXC_STATIC_DEFINE"])
+    env.Append(CPPDEFINES=["ZXC_STATIC_DEFINE"])
+
+    # Sources compiled once, with no per-ISA variants and no special flags.
+    thirdparty_zxc_sources = [
+        "src/lib/zxc_common.c",
+        "src/lib/zxc_dispatch.c",
+        "src/lib/zxc_dict.c",
+        "src/lib/zxc_seekable.c",
+        "src/lib/zxc_pstream.c",
+        "src/lib/zxc_driver.c",
+    ]
+    thirdparty_zxc_sources = [thirdparty_zxc_dir + file for file in thirdparty_zxc_sources]
+    env_thirdparty.add_source_files(thirdparty_obj, thirdparty_zxc_sources)
+
+    # zxc_compress.c / zxc_decompress.c / zxc_huffman.c are each compiled once per
+    # CPU variant: a distinct ZXC_FUNCTION_SUFFIX mangles the exported symbols and
+    # the matching ISA flags enable the SIMD intrinsics for that translation unit.
+    # zxc_dispatch.c then selects the best variant at runtime (CPUID / getauxval),
+    # so a single binary built without -march=native still uses SIMD when present.
+    # The variant set must match the symbols referenced by zxc_dispatch.c for the
+    # target architecture (x86_64 -> sse2/avx2/avx512, arm -> neon, others -> none).
+    zxc_variant_sources = ["zxc_compress.c", "zxc_decompress.c", "zxc_huffman.c"]
+
+    # Each entry is (suffix, gcc/clang flags, msvc flags). "_default" (scalar) is always built.
+    zxc_variants = [("_default", [], [])]
+    if env["arch"] == "x86_64":
+        zxc_variants += [
+            ("_sse2", ["-msse2"], []),  # x86_64 baseline; MSVC implies SSE2.
+            ("_avx2", ["-mavx2", "-mfma", "-mbmi", "-mbmi2", "-mlzcnt"], ["/arch:AVX2"]),
+            ("_avx512", ["-mavx512f", "-mavx512bw", "-mbmi", "-mbmi2", "-mlzcnt"], ["/arch:AVX512"]),
+        ]
+    elif env["arch"] == "arm64":
+        # NEON is baseline on AArch64, so the intrinsics compile without extra flags.
+        zxc_variants += [("_neon", [], [])]
+    elif env["arch"] == "arm32":
+        zxc_variants += [("_neon", ["-mfpu=neon"], [])]
+
+    for suffix, gcc_flags, msvc_flags in zxc_variants:
+        env_zxc_variant = env_thirdparty.Clone()  # Keep per-variant defines/flags out of the shared env.
+        env_zxc_variant.Append(CPPDEFINES=[("ZXC_FUNCTION_SUFFIX", suffix)])
+        variant_flags = msvc_flags if env.msvc else gcc_flags  # Flag set for the active toolchain.
+        if variant_flags:
+            env_zxc_variant.Append(CCFLAGS=variant_flags)
+        for source in zxc_variant_sources:
+            thirdparty_obj += env_zxc_variant.Object(
+                target=thirdparty_zxc_dir + "src/lib/" + source[:-2] + suffix,  # Drop ".c"; suffix keeps targets unique.
+                source=thirdparty_zxc_dir + "src/lib/" + source,
+            )
+
 
 env.core_sources += thirdparty_obj
 
diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp
index 7f23b5c78d38..dd2d472174cd 100644
--- a/core/config/project_settings.cpp
+++ b/core/config/project_settings.cpp
@@ -894,6 +894,8 @@ Error ProjectSettings::setup(const String &p_path, const String &p_main_pack, bo
 
 	Compression::gzip_level = GLOBAL_GET("compression/formats/gzip/compression_level");
 
+	Compression::zxc_level = GLOBAL_GET("compression/formats/zxc/compression_level");
+
 	load_scene_groups_cache();
 
 	project_loaded = err == OK;
@@ -1798,6 +1800,7 @@ ProjectSettings::ProjectSettings() {
 	GLOBAL_DEF(PropertyInfo(Variant::INT, "compression/formats/zstd/window_log_size", PROPERTY_HINT_RANGE, "10,30,1"), Compression::zstd_window_log_size);
 	GLOBAL_DEF(PropertyInfo(Variant::INT, "compression/formats/zlib/compression_level", PROPERTY_HINT_RANGE, "-1,9,1"), Compression::zlib_level);
 	GLOBAL_DEF(PropertyInfo(Variant::INT, "compression/formats/gzip/compression_level", PROPERTY_HINT_RANGE, "-1,9,1"), Compression::gzip_level);
+	GLOBAL_DEF(PropertyInfo(Variant::INT, "compression/formats/zxc/compression_level", PROPERTY_HINT_RANGE, "1,6,1"), Compression::zxc_level);
 
 	GLOBAL_DEF("debug/settings/crash_handler/message",
 			String("Please include this when reporting the bug to the project developer."));
diff --git a/core/io/compression.cpp b/core/io/compression.cpp
index 2ac02c59cfcb..6fb162ebf673 100644
--- a/core/io/compression.cpp
+++ b/core/io/compression.cpp
@@ -40,6 +40,10 @@
 #include <brotli/decode.h>
 #endif
 
+#ifdef ZXC_ENABLED
+#include <zxc.h>
+#endif
+
 namespace {
 struct ZstdDecompressorContext {
 	ZSTD_DCtx *zstd_d_ctx = nullptr;
@@ -124,6 +128,19 @@ int64_t Compression::compress(uint8_t *p_dst, const uint8_t *p_src, int64_t p_sr
 			ZSTD_freeCCtx(cctx);
 			return (int64_t)ret;
 		} break;
+		case MODE_ZXC: {
+#ifdef ZXC_ENABLED
+			zxc_compress_opts_t opts;
+			memset(&opts, 0, sizeof(opts));
+			opts.level = zxc_level;
+			const int64_t max_dst_size = get_max_compressed_buffer_size(p_src_size, MODE_ZXC);
+			const int64_t ret = zxc_compress(p_src, p_src_size, p_dst, max_dst_size, &opts);
+			ERR_FAIL_COND_V_MSG(ret < 0, -1, vformat("ZXC compression failed: %s.", zxc_error_name((int)ret)));
+			return ret;
+#else
+			ERR_FAIL_V_MSG(-1, "Godot was compiled without ZXC support.");
+#endif
+		} break;
 	}
 
 	ERR_FAIL_V(-1);
@@ -163,6 +180,13 @@ int64_t Compression::get_max_compressed_buffer_size(int64_t p_src_size, Mode p_m
 		case MODE_ZSTD: {
 			return ZSTD_compressBound(p_src_size);
 		} break;
+		case MODE_ZXC: {
+#ifdef ZXC_ENABLED
+			return (int64_t)zxc_compress_bound(p_src_size);
+#else
+			ERR_FAIL_V_MSG(-1, "Godot was compiled without ZXC support.");
+#endif
+		} break;
 	}
 
 	ERR_FAIL_V(-1);
@@ -226,6 +250,17 @@ int64_t Compression::decompress(uint8_t *p_dst, int64_t p_dst_max_size, const ui
 			size_t ret = ZSTD_decompressDCtx(decompressor_ctx.zstd_d_ctx, p_dst, p_dst_max_size, p_src, p_src_size);
 			return (int64_t)ret;
 		} break;
+		case MODE_ZXC: {
+#ifdef ZXC_ENABLED
+			zxc_decompress_opts_t opts;
+			memset(&opts, 0, sizeof(opts));
+			const int64_t ret = zxc_decompress(p_src, p_src_size, p_dst, p_dst_max_size, &opts);
+			ERR_FAIL_COND_V_MSG(ret < 0, -1, vformat("ZXC decompression failed: %s.", zxc_error_name((int)ret)));
+			return ret;
+#else
+			ERR_FAIL_V_MSG(-1, "Godot was compiled without ZXC support.");
+#endif
+		} break;
 	}
 
 	ERR_FAIL_V(-1);
diff --git a/core/io/compression.h b/core/io/compression.h
index bb3f81f58d0b..db415d115fe1 100644
--- a/core/io/compression.h
+++ b/core/io/compression.h
@@ -43,13 +43,15 @@ class Compression {
 	static inline bool zstd_long_distance_matching = false;
 	static inline int zstd_window_log_size = 27; // ZSTD_WINDOWLOG_LIMIT_DEFAULT
 	static inline int gzip_chunk = 16384;
+	static inline int zxc_level = 3; // ZXC_LEVEL_DEFAULT
 
 	enum Mode : int32_t {
 		MODE_FASTLZ,
 		MODE_DEFLATE,
 		MODE_ZSTD,
 		MODE_GZIP,
-		MODE_BROTLI
+		MODE_BROTLI,
+		MODE_ZXC
 	};
 
 	static int64_t compress(uint8_t *p_dst, const uint8_t *p_src, int64_t p_src_size, Mode p_mode = MODE_ZSTD);
diff --git a/core/io/file_access.cpp b/core/io/file_access.cpp
index 9bd1b450ac01..0bcc5b10d32f 100644
--- a/core/io/file_access.cpp
+++ b/core/io/file_access.cpp
@@ -1103,6 +1103,7 @@ void FileAccess::_bind_methods() {
 	BIND_ENUM_CONSTANT(COMPRESSION_ZSTD);
 	BIND_ENUM_CONSTANT(COMPRESSION_GZIP);
 	BIND_ENUM_CONSTANT(COMPRESSION_BROTLI);
+	BIND_ENUM_CONSTANT(COMPRESSION_ZXC);
 
 	BIND_BITFIELD_FLAG(UNIX_READ_OWNER);
 	BIND_BITFIELD_FLAG(UNIX_WRITE_OWNER);
diff --git a/core/io/file_access.h b/core/io/file_access.h
index 62bfa225e128..3a5d07c52a10 100644
--- a/core/io/file_access.h
+++ b/core/io/file_access.h
@@ -83,6 +83,7 @@ class FileAccess : public RefCounted {
 		COMPRESSION_ZSTD = Compression::MODE_ZSTD,
 		COMPRESSION_GZIP = Compression::MODE_GZIP,
 		COMPRESSION_BROTLI = Compression::MODE_BROTLI,
+		COMPRESSION_ZXC = Compression::MODE_ZXC,
 	};
 
 	typedef void (*FileCloseFailNotify)(const String &);
diff --git a/doc/classes/FileAccess.xml b/doc/classes/FileAccess.xml
index 01f25880ba85..a6c948c516cb 100644
--- a/doc/classes/FileAccess.xml
+++ b/doc/classes/FileAccess.xml
@@ -676,6 +676,9 @@
 		<constant name="COMPRESSION_BROTLI" value="4" enum="CompressionMode">
 			Uses the [url=https://github.com/google/brotli]brotli[/url] compression method (only decompression is supported).
 		</constant>
+		<constant name="COMPRESSION_ZXC" value="5" enum="CompressionMode">
+			Uses the [url=https://github.com/hellobertrand/zxc]ZXC[/url] compression method.
+		</constant>
 		<constant name="UNIX_READ_OWNER" value="256" enum="UnixPermissionFlags" is_bitfield="true">
 			Read for owner bit.
 		</constant>
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index b835af387336..09663ebb57ce 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -518,6 +518,9 @@
 		<member name="compression/formats/zstd/window_log_size" type="int" setter="" getter="" default="27">
 			Largest size limit (in power of 2) allowed when compressing using long-distance matching with Zstandard. Higher values can result in better compression, but will require more memory when compressing and decompressing.
 		</member>
+		<member name="compression/formats/zxc/compression_level" type="int" setter="" getter="" default="3">
+			The default compression level for ZXC, from [code]1[/code] (fastest) to [code]6[/code] (highest density). Affects compressed scenes and resources. Higher levels result in smaller files at the cost of compression speed. Decompression speed is mostly unaffected by the compression level.
+		</member>
 		<member name="debug/canvas_items/debug_redraw_color" type="Color" setter="" getter="" default="Color(1, 0.2, 0.2, 0.5)">
 			If canvas item redraw debugging is active, this color will be flashed on canvas items when they redraw.
 		</member>
diff --git a/tests/core/io/test_file_access.cpp b/tests/core/io/test_file_access.cpp
index 038eaa4b7699..526f3a3bc5f8 100644
--- a/tests/core/io/test_file_access.cpp
+++ b/tests/core/io/test_file_access.cpp
@@ -32,6 +32,7 @@
 
 TEST_FORCE_LINK(test_file_access)
 
+#include "core/io/compression.h"
 #include "core/io/dir_access.h"
 #include "core/io/file_access.h"
 #include "tests/test_utils.h"
@@ -250,6 +251,66 @@ TEST_CASE("[FileAccess] Get/Store floating point half precision values") {
 	}
 }
 
+// ZXC is an optional codec (`zxc=no` leaves `ZXC_ENABLED` undefined). Without it,
+// every `MODE_ZXC` / `COMPRESSION_ZXC` entry point fails by design, so skip these tests.
+#ifdef ZXC_ENABLED
+
+TEST_CASE("[Compression] ZXC round-trip") {
+	// Representative data: enough structure that it actually compresses.
+	PackedByteArray original;
+	original.resize(8192);
+	uint8_t *w = original.ptrw();
+	for (int i = 0; i < original.size(); i++) {
+		w[i] = uint8_t((i * 31 + (i >> 3)) & 0xFF);
+	}
+
+	// Compress via the Compression class (MODE_ZXC).
+	PackedByteArray compressed;
+	compressed.resize(Compression::get_max_compressed_buffer_size(original.size(), Compression::MODE_ZXC));
+	const int64_t compressed_size = Compression::compress(compressed.ptrw(), original.ptr(), original.size(), Compression::MODE_ZXC);
+	REQUIRE(compressed_size > 0);
+	compressed.resize(compressed_size);
+
+	// Decompress back into a buffer of the known original size.
+	PackedByteArray decompressed;
+	decompressed.resize(original.size());
+	const int64_t decompressed_size = Compression::decompress(decompressed.ptrw(), decompressed.size(), compressed.ptr(), compressed.size(), Compression::MODE_ZXC);
+	CHECK(decompressed_size == original.size());
+	CHECK(decompressed == original);
+}
+
+TEST_CASE("[FileAccess] ZXC compressed file round-trip") {
+	const String file_path = TestUtils::get_data_path("zxc_roundtrip.bin");
+
+	PackedByteArray original;
+	original.resize(4096);
+	uint8_t *w = original.ptrw();
+	for (int i = 0; i < original.size(); i++) {
+		w[i] = uint8_t((i * 17) & 0xFF);
+	}
+
+	// Write the payload through the compressed-file wrapper using the ZXC mode.
+	{
+		Ref<FileAccess> fw = FileAccess::open_compressed(file_path, FileAccess::WRITE, FileAccess::COMPRESSION_ZXC);
+		REQUIRE(fw.is_valid());
+		fw->store_buffer(original);
+		fw->close();
+	}
+
+	// Reopen the file for reading and verify the bytes survived the round trip.
+	{
+		Ref<FileAccess> fr = FileAccess::open_compressed(file_path, FileAccess::READ, FileAccess::COMPRESSION_ZXC);
+		REQUIRE(fr.is_valid());
+		const Vector<uint8_t> read_back = fr->get_buffer(original.size());
+		fr->close();
+		CHECK(read_back == original);
+	}
+
+	DirAccess::remove_file_or_error(file_path);
+}
+
+#endif // ZXC_ENABLED
+
 TEST_CASE("[FileAccess] Cursor positioning") {
 	Ref<FileAccess> f = FileAccess::open(TestUtils::get_data_path("line_endings_lf.test.txt"), FileAccess::READ);
 	REQUIRE(f.is_valid());
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 311b32c2cb05..fbe2a366ddbe 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -1311,3 +1311,19 @@ Files extracted from upstream source:
 
 - `lib/{common/,compress/,decompress/,zstd.h,zstd_errors.h}`
 - `LICENSE`
+
+
+## zxc
+
+- Upstream: https://github.com/hellobertrand/zxc
+- Version: 0.12.0 (c8748471f7a6e895e4b9dc0d9063d91e8567c249, 2026)
+- License: BSD-3-Clause
+
+Files extracted from upstream source:
+
+- `include/`
+- `src/lib/` (including `src/lib/vendors/rapidhash.h`, MIT-licensed)
+- `LICENSE`
+
+The CMake/Meson build files, command-line tool, and test suite are not needed
+and were not extracted.
diff --git a/thirdparty/zxc/LICENSE b/thirdparty/zxc/LICENSE
new file mode 100644
index 000000000000..9046c0e3aeb1
--- /dev/null
+++ b/thirdparty/zxc/LICENSE
@@ -0,0 +1,64 @@
+==============================================================================
+ZXC
+Copyright (c) 2025-2026, Bertrand Lebonnois and contributors
+License: BSD 3-Clause
+==============================================================================
+
+BSD 3-Clause License
+====================
+
+Copyright (c) 2025-2026, Bertrand Lebonnois and contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of ZXC nor the names of its contributors may be
+   used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL BERTRAND LEBONNOIS OR CONTRIBUTORS BE LIABLE FOR
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+==============================================================================
+This project includes code from rapidhash
+Copyright (C) 2025 Nicolas De Carli
+License: MIT (Expat)
+==============================================================================
+
+rapidhash - Very fast, high quality, platform-independent hashing algorithm.
+Copyright (C) 2025 Nicolas De Carli
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+You can contact the author at:
+  - rapidhash source repository: https://github.com/Nicoshev/rapidhash
diff --git a/thirdparty/zxc/include/zxc.h b/thirdparty/zxc/include/zxc.h
new file mode 100644
index 000000000000..e8f5d4d09988
--- /dev/null
+++ b/thirdparty/zxc/include/zxc.h
@@ -0,0 +1,18 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef ZXC_H
+#define ZXC_H
+
+#include "zxc_buffer.h"     // IWYU pragma: keep
+#include "zxc_constants.h"  // IWYU pragma: keep
+#include "zxc_dict.h"       // IWYU pragma: keep
+#include "zxc_error.h"      // IWYU pragma: keep
+#include "zxc_opts.h"       // IWYU pragma: keep
+#include "zxc_pstream.h"    // IWYU pragma: keep
+
+#endif  // ZXC_H
\ No newline at end of file
diff --git a/thirdparty/zxc/include/zxc_buffer.h b/thirdparty/zxc/include/zxc_buffer.h
new file mode 100644
index 000000000000..f605cbdfbc50
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_buffer.h
@@ -0,0 +1,634 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_buffer.h
+ * @brief Buffer-based (single-shot) compression and decompression API.
+ *
+ * This header exposes the simplest way to use ZXC: pass an entire input buffer
+ * and receive the result in a single output buffer.  All functions in this
+ * header are single-threaded and blocking.
+ *
+ * @par Typical usage
+ * @code
+ * // Compress
+ * size_t bound = zxc_compress_bound(src_size);
+ * void *dst    = malloc(bound);
+ * zxc_compress_opts_t opts = { .level = ZXC_LEVEL_DEFAULT, .checksum_enabled = 1 };
+ * int64_t csize = zxc_compress(src, src_size, dst, bound, &opts);
+ *
+ * // Decompress
+ * uint64_t orig = zxc_get_decompressed_size(dst, csize);
+ * void *out     = malloc(orig);
+ * zxc_decompress_opts_t dopts = { .checksum_enabled = 1 };
+ * int64_t dsize = zxc_decompress(dst, csize, out, orig, &dopts);
+ * @endcode
+ *
+ * @see zxc_stream.h  for the streaming (multi-threaded) API.
+ * @see zxc_pstream.h for single-threaded push-based streaming.
+ */
+
+#ifndef ZXC_BUFFER_H
+#define ZXC_BUFFER_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+#include "zxc_opts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup library_info Library Information
+ * @brief Runtime-queryable library metadata.
+ *
+ * These functions allow callers (including filesystem integrations)
+ * to discover the supported compression level range and library version at
+ * runtime, without relying on compile-time constants alone.
+ * @{
+ */
+
+/**
+ * @brief Returns the minimum supported compression level.
+ *
+ * Currently returns @ref ZXC_LEVEL_FASTEST (1).
+ *
+ * @return Minimum compression level value.
+ */
+ZXC_EXPORT int zxc_min_level(void);
+
+/**
+ * @brief Returns the maximum supported compression level.
+ *
+ * Currently returns @ref ZXC_LEVEL_DENSITY (6).
+ *
+ * @return Maximum compression level value.
+ */
+ZXC_EXPORT int zxc_max_level(void);
+
+/**
+ * @brief Returns the default compression level.
+ *
+ * Currently returns @ref ZXC_LEVEL_DEFAULT (3).
+ *
+ * @return Default compression level value.
+ */
+ZXC_EXPORT int zxc_default_level(void);
+
+/**
+ * @brief Returns the human-readable library version string.
+ *
+ * The returned pointer is a compile-time constant and must not be freed.
+ * Format: "MAJOR.MINOR.PATCH" (e.g. "0.12.0").
+ *
+ * @return Null-terminated version string.
+ */
+ZXC_EXPORT const char* zxc_version_string(void);
+
+/** @} */ /* end of library_info */
+
+/**
+ * @defgroup buffer_api Buffer API
+ * @brief Single-shot, buffer-based compression and decompression.
+ * @{
+ */
+
+/**
+ * @brief Calculates the maximum theoretical compressed size for a given input.
+ *
+ * Useful for allocating output buffers before compression.
+ * Accounts for file headers, block headers, and potential expansion
+ * of incompressible data.
+ *
+ * @param[in] input_size Size of the input data in bytes.
+ *
+ * @return           Maximum required buffer size in bytes.
+ */
+ZXC_EXPORT uint64_t zxc_compress_bound(const size_t input_size);
+
+/**
+ * @brief Compresses a data buffer using the ZXC algorithm.
+ *
+ * This version uses standard size_t types and void pointers.
+ * It executes in a single thread (blocking operation).
+ * It writes the ZXC file header followed by compressed blocks.
+ *
+ * @param[in] src          Pointer to the source buffer.
+ * @param[in] src_size     Size of the source data in bytes.
+ * @param[out] dst          Pointer to the destination buffer.
+ * @param[in] dst_capacity Maximum capacity of the destination buffer.
+ * @param[in] opts         Compression options (NULL uses all defaults).
+ *                         @c n_threads and the progress callback are ignored
+ *                         (this call is single-threaded and blocking).
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return The number of bytes written to dst (>0 on success),
+ *         or a negative zxc_error_t code (e.g., ZXC_ERROR_DST_TOO_SMALL) on failure.
+ */
+ZXC_EXPORT int64_t zxc_compress(const void* src, const size_t src_size, void* dst,
+                                const size_t dst_capacity, const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Decompresses a ZXC compressed buffer.
+ *
+ * This version uses standard size_t types and void pointers.
+ * It executes in a single thread (blocking operation).
+ * It expects a valid ZXC file header followed by compressed blocks.
+ *
+ * @param[in] src          Pointer to the source buffer containing compressed data.
+ * @param[in] src_size      Size of the compressed data in bytes.
+ * @param[out] dst          Pointer to the destination buffer.
+ * @param[in] dst_capacity  Capacity of the destination buffer.
+ * @param[in] opts          Decompression options (NULL uses all defaults).
+ *                          @c n_threads and the progress callback are ignored
+ *                          (this call is single-threaded and blocking).
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return The number of bytes written to dst (>0 on success),
+ *         or a negative zxc_error_t code (e.g., ZXC_ERROR_CORRUPT_DATA) on failure.
+ */
+ZXC_EXPORT int64_t zxc_decompress(const void* src, const size_t src_size, void* dst,
+                                  const size_t dst_capacity, const zxc_decompress_opts_t* opts);
+
+/**
+ * @brief Returns the decompressed size stored in a ZXC compressed buffer.
+ *
+ * This function reads the file footer to extract the original uncompressed size
+ * without performing any decompression. Useful for allocating output buffers.
+ *
+ * @param[in] src       Pointer to the compressed data buffer.
+ * @param[in] src_size  Size of the compressed data in bytes.
+ *
+ * @return The original uncompressed size in bytes, or 0 if the buffer is invalid
+ *         or too small to contain a valid ZXC archive.
+ */
+ZXC_EXPORT uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size);
+
+/**
+ * @brief Returns the dictionary ID stored in a ZXC compressed buffer.
+ *
+ * Reads the file header flag and dict_id field without decompressing.
+ * Returns 0 if the file does not require a dictionary or the buffer is invalid.
+ *
+ * @param[in] src       Pointer to the compressed data buffer.
+ * @param[in] src_size  Size of the compressed data in bytes.
+ * @return Dictionary ID, or 0 if no dictionary is required.
+ */
+ZXC_EXPORT uint32_t zxc_get_dict_id(const void* src, size_t src_size);
+
+/* ========================================================================= */
+/*  Block-Level API (no file framing)                                        */
+/* ========================================================================= */
+
+/**
+ * @defgroup block_api Block API
+ * @brief Single-block compression/decompression without file framing.
+ *
+ * These functions compress or decompress a single independent block, producing
+ * only the block header (8 bytes) + compressed payload + optional checksum (4 bytes).
+ * No file header, EOF block, or footer is written.
+ *
+ * This API is designed for filesystem integrations where the filesystem manages its own block
+ * indexing and each block is compressed independently.
+ *
+ * @par Typical usage
+ * @code
+ * // Compress a single filesystem block
+ * zxc_cctx* cctx = zxc_create_cctx(NULL);
+ * zxc_compress_opts_t opts = { .level = 3 };
+ * size_t bound = zxc_compress_block_bound(block_size);
+ * void *dst = malloc(bound);
+ * int64_t csize = zxc_compress_block(cctx, block, block_size, dst, bound, &opts);
+ *
+ * // Decompress
+ * zxc_dctx* dctx = zxc_create_dctx();
+ * int64_t dsize = zxc_decompress_block(dctx, dst, csize, out, block_size, NULL);
+ *
+ * zxc_free_cctx(cctx);
+ * zxc_free_dctx(dctx);
+ * @endcode
+ * @{
+ */
+
+/* Forward declarations for context types (defined below). */
+/** @brief Opaque reusable compression context (see @ref zxc_create_cctx). */
+typedef struct zxc_cctx_s zxc_cctx;
+/** @brief Opaque reusable decompression context (see @ref zxc_create_dctx). */
+typedef struct zxc_dctx_s zxc_dctx;
+
+/**
+ * @brief Returns the maximum compressed size for a single block.
+ *
+ * Unlike zxc_compress_bound(), this does NOT include file header,
+ * EOF block, or footer overhead.  Use this to size the destination
+ * buffer for zxc_compress_block().
+ *
+ * @param[in] input_size Size of the uncompressed block in bytes
+ *                       (must be <= @ref ZXC_BLOCK_SIZE_MAX).
+ * @return Upper bound on compressed block size, or 0 if @p input_size is
+ *         out of range for the Block API
+ *         (@p input_size > @ref ZXC_BLOCK_SIZE_MAX) or would overflow.
+ */
+ZXC_EXPORT uint64_t zxc_compress_block_bound(size_t input_size);
+
+/**
+ * @brief Returns the minimum destination capacity required by
+ *        zxc_decompress_block() for a block of @p uncompressed_size bytes.
+ *
+ * The decoder uses speculative (wild-copy) writes on its fast path and
+ * therefore needs a tail pad beyond the declared uncompressed size.
+ * Passing exactly @c uncompressed_size as @c dst_capacity forces the slow
+ * tail path and may trigger @ref ZXC_ERROR_OVERFLOW on some inputs.
+ *
+ * Use this helper to size the destination buffer. The returned value is
+ * guaranteed to enable the fastest decode path without aliasing or
+ * overrun checks tripping.
+ *
+ * @param[in] uncompressed_size Original uncompressed block size in bytes
+ *                              (must be <= @ref ZXC_BLOCK_SIZE_MAX).
+ * @return Minimum @c dst_capacity to pass to zxc_decompress_block(), or 0 if
+ *         @p uncompressed_size is out of range for the Block API
+ *         (@p uncompressed_size > @ref ZXC_BLOCK_SIZE_MAX) or would overflow.
+ */
+ZXC_EXPORT uint64_t zxc_decompress_block_bound(const size_t uncompressed_size);
+
+/**
+ * @brief Compresses a single block without file framing.
+ *
+ * Output format: @c block_header(8B) + payload + optional @c checksum(4B).
+ * The output can be decompressed with zxc_decompress_block().
+ *
+ * The Block API processes a single format-conformant block at a time:
+ * @p src_size must not exceed @ref ZXC_BLOCK_SIZE_MAX (2 MiB). For larger
+ * payloads, use the frame API (zxc_compress) or the streaming API
+ * (zxc_cstream_*), both of which chunk transparently into compliant blocks.
+ *
+ * @param[in,out] cctx         Reusable compression context.
+ * @param[in]     src          Source data.
+ * @param[in]     src_size     Source data size in bytes
+ *                             (must be in [1, @ref ZXC_BLOCK_SIZE_MAX]).
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of the destination buffer
+ *                             (use zxc_compress_block_bound() to size).
+ * @param[in]     opts         Compression options, or NULL for defaults.
+ *                             Only @c level, @c block_size, and
+ *                             @c checksum_enabled are used.
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return Compressed block size in bytes (> 0) on success,
+ *         or a negative @ref zxc_error_t code on failure.
+ *         Returns @ref ZXC_ERROR_BAD_BLOCK_SIZE if
+ *         @p src_size > @ref ZXC_BLOCK_SIZE_MAX.
+ */
+ZXC_EXPORT int64_t zxc_compress_block(zxc_cctx* cctx, const void* src, size_t src_size, void* dst,
+                                      size_t dst_capacity, const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Decompresses a single block produced by zxc_compress_block().
+ *
+ * The Block API decompresses a single format-conformant block at a time:
+ * @p dst_capacity must not exceed @ref ZXC_BLOCK_SIZE_MAX +
+ * @ref ZXC_DECOMPRESS_TAIL_PAD (the size returned by
+ * zxc_decompress_block_bound() for the maximum block size). For payloads
+ * produced by the frame or streaming APIs, use zxc_decompress instead.
+ *
+ * @param[in,out] dctx         Reusable decompression context.
+ * @param[in]     src          Compressed block data.
+ * @param[in]     src_size     Compressed data size in bytes.
+ * @param[out]    dst          Destination buffer for decompressed data.
+ * @param[in]     dst_capacity Capacity of the destination buffer (use
+ *                             zxc_decompress_block_bound() to size; at
+ *                             most @ref ZXC_BLOCK_SIZE_MAX +
+ *                             @ref ZXC_DECOMPRESS_TAIL_PAD).
+ * @param[in]     opts         Decompression options (NULL for defaults).
+ *                             Only @c checksum_enabled is used.
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return Decompressed size in bytes (> 0) on success,
+ *         or a negative @ref zxc_error_t code on failure.
+ *         Returns @ref ZXC_ERROR_BAD_BLOCK_SIZE if @p dst_capacity exceeds
+ *         the per-block limit.
+ */
+ZXC_EXPORT int64_t zxc_decompress_block(zxc_dctx* dctx, const void* src, size_t src_size, void* dst,
+                                        size_t dst_capacity, const zxc_decompress_opts_t* opts);
+
+/**
+ * @brief Decompresses a single block with a strict-sized destination buffer.
+ *
+ * Identical semantics to zxc_decompress_block() but accepts
+ * @p dst_capacity == @c uncompressed_size (no trailing @c ZXC_DECOMPRESS_TAIL_PAD
+ * required). Intended for integrations whose destination buffer cannot be
+ * oversized (for example, decoding into an exactly-sized, page-aligned
+ * region). "Strict-sized" means a tightly-sized destination only; it does
+ * not permit an overlapping @p src / @p dst (see @note below).
+ *
+ * This path is slightly slower than zxc_decompress_block() on the same input
+ * because it avoids the wild-copy overshoot that the fast decoder relies on.
+ * Output is bit-identical to zxc_decompress_block().
+ *
+ * RAW blocks transparently forward to zxc_decompress_block(); only
+ * GLO/GHI use the strict-tail decoder path.
+ *
+ * Strict-tail variant: @p dst_capacity is the exact uncompressed size with
+ * no tail-pad margin, so the upper limit is @ref ZXC_BLOCK_SIZE_MAX (not
+ * @c MAX+TAIL_PAD as for zxc_decompress_block).
+ *
+ * @param[in,out] dctx         Reusable decompression context.
+ * @param[in]     src          Compressed block data.
+ * @param[in]     src_size     Compressed data size in bytes.
+ * @param[out]    dst          Destination buffer for decompressed data.
+ * @param[in]     dst_capacity Capacity of the destination buffer (must be
+ *                             at least the original uncompressed size,
+ *                             and at most @ref ZXC_BLOCK_SIZE_MAX; unlike
+ *                             zxc_decompress_block, no trailing tail-pad
+ *                             margin is required).
+ * @param[in]     opts         Decompression options (NULL for defaults).
+ *                             Only @c checksum_enabled is used.
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return Decompressed size in bytes (> 0) on success,
+ *         or a negative @ref zxc_error_t code on failure.
+ *         Returns @ref ZXC_ERROR_BAD_BLOCK_SIZE if
+ *         @p dst_capacity > @ref ZXC_BLOCK_SIZE_MAX.
+ */
+ZXC_EXPORT int64_t zxc_decompress_block_safe(zxc_dctx* dctx, const void* src, const size_t src_size,
+                                             void* dst, const size_t dst_capacity,
+                                             const zxc_decompress_opts_t* opts);
+
+/**
+ * @brief Estimates the peak memory used by compression for a given block & level.
+ *
+ * Returns the total bytes reserved by @ref zxc_compress_block for a block of
+ * @p src_size bytes: all per-chunk working buffers (chain table, literals,
+ * sequence/token/offset/extras buffers) plus the fixed hash tables and
+ * cache-line alignment padding. At @p level >= 6 the value also includes the
+ * `opt_scratch` region (~8.125 x @p src_size bytes) used by the price-based
+ * optimal parser. That region is lazy-allocated on the first level-6 call
+ * and reused across blocks for the lifetime of the cctx. Scales roughly
+ * linearly with @p src_size.
+ *
+ * Intended for integrators that need an accurate memory-budget figure.
+ *
+ * @param[in] src_size Uncompressed block size in bytes.
+ * @param[in] level    Compression level (1..6). Levels <= 5 share the same
+ *                     persistent cctx footprint; level 6 adds the optimal-
+ *                     parser scratch.
+ * @return Estimated peak cctx memory usage in bytes, or 0 if @p src_size is 0.
+ */
+ZXC_EXPORT uint64_t zxc_estimate_cctx_size(size_t src_size, int level);
+
+/** @} */ /* end of block_api */
+
+/* ========================================================================= */
+/*  Reusable Context API (opaque, heap-allocated)                            */
+/* ========================================================================= */
+
+/**
+ * @defgroup context_api Reusable Context API
+ * @brief Opaque, reusable compression / decompression contexts.
+ *
+ * This API eliminates per-call allocation overhead by letting callers retain
+ * a context across multiple operations.  The internal layout is hidden behind
+ * an opaque pointer.
+ *
+ * @{
+ */
+
+/* --- Compression context ------------------------------------------------- */
+
+/**
+ * @brief Creates a new reusable compression context.
+ *
+ * When @p opts is non-NULL the context pre-allocates all internal buffers
+ * using the supplied level, block_size, and checksum_enabled settings.
+ * When @p opts is NULL, allocation is deferred to the first call to
+ * zxc_compress_cctx().
+ *
+ * The returned context must be freed with zxc_free_cctx().
+ *
+ * @param[in] opts  Compression options for eager init, or NULL for lazy init.
+ * @return Pointer to the new context, or @c NULL on allocation failure.
+ */
+ZXC_EXPORT zxc_cctx* zxc_create_cctx(const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Frees a compression context and all associated resources.
+ *
+ * It is safe to pass @c NULL; the call is a no-op in that case.
+ *
+ * @param[in] cctx Context to free.
+ */
+ZXC_EXPORT void zxc_free_cctx(zxc_cctx* cctx);
+
+/**
+ * @brief Compresses data using a reusable context.
+ *
+ * Identical to zxc_compress() but reuses the internal buffers from @p cctx,
+ * avoiding per-call malloc/free overhead.  The context automatically
+ * re-initializes when block_size or level changes between calls.
+ *
+ * Options are **sticky**: settings passed via @p opts are remembered and
+ * reused on subsequent calls where @p opts is NULL.  The initial sticky
+ * values come from the @p opts passed to zxc_create_cctx().
+ *
+ * @param[in,out] cctx         Reusable compression context.
+ * @param[in]     src          Source data.
+ * @param[in]     src_size     Source data size in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of the destination buffer.
+ * @param[in]     opts         Compression options, or NULL to reuse
+ *                             settings from create / last call.
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return Compressed size in bytes (> 0) on success,
+ *         or a negative @ref zxc_error_t code on failure.
+ */
+ZXC_EXPORT int64_t zxc_compress_cctx(zxc_cctx* cctx, const void* src, size_t src_size, void* dst,
+                                     size_t dst_capacity, const zxc_compress_opts_t* opts);
+
+/* --- Decompression context ----------------------------------------------- */
+
+/**
+ * @brief Creates a new reusable decompression context.
+ *
+ * @return Pointer to the new context, or @c NULL on allocation failure.
+ */
+ZXC_EXPORT zxc_dctx* zxc_create_dctx(void);
+
+/**
+ * @brief Frees a decompression context and all associated resources.
+ *
+ * It is safe to pass @c NULL.
+ *
+ * @param[in] dctx Context to free.
+ */
+ZXC_EXPORT void zxc_free_dctx(zxc_dctx* dctx);
+
+/**
+ * @brief Decompresses data using a reusable context.
+ *
+ * Identical to zxc_decompress() but reuses buffers from @p dctx.
+ *
+ * @param[in,out] dctx         Reusable decompression context.
+ * @param[in]     src          Compressed data.
+ * @param[in]     src_size     Compressed data size in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of the destination buffer.
+ * @param[in]     opts         Decompression options (NULL for defaults).
+ *
+ * @note @p src and @p dst must not overlap (same contract as memcpy).
+ *
+ * @return Decompressed size in bytes (> 0) on success,
+ *         or a negative @ref zxc_error_t code on failure.
+ */
+ZXC_EXPORT int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* src, size_t src_size, void* dst,
+                                       size_t dst_capacity, const zxc_decompress_opts_t* opts);
+
+/* ========================================================================= */
+/*  Static Context API (caller-allocated workspace)                          */
+/* ========================================================================= */
+
+/**
+ * @defgroup static_context_api Static Context API
+ * @brief Caller-allocated, fixed-footprint compression / decompression
+ *        contexts.
+ *
+ * Mirrors the dynamic Reusable Context API but places the entire context
+ * (handle + persistent buffers) inside a single buffer allocated and owned
+ * by the caller.  This pattern is mandatory for environments where the
+ * library cannot call into the host allocator on the hot path: Linux
+ * kernel filesystems (one workspace per mount, served via @c vmalloc /
+ * @c kmalloc up front), embedded targets without a heap (`.bss` or
+ * stack-allocated workspace), sandboxed runtimes with a fixed memory
+ * budget, etc.
+ *
+ * The trade-off vs the dynamic API: the workspace is pinned to a single @c block_size and @c level
+ * at init time; subsequent compress/decompress calls cannot enlarge the footprint, so a workload
+ * that needs to mix block sizes must size the workspace for the maximum block_size up front.
+ *
+ * @par Typical usage
+ * @code
+ * size_t ws_sz = zxc_static_cctx_workspace_size(64 * 1024, ZXC_LEVEL_DEFAULT);
+ * void *ws = aligned_alloc(64, (ws_sz + 63) & ~(size_t)63);  // or kmalloc, vmalloc, .bss
+ * zxc_compress_opts_t opts = { .level = ZXC_LEVEL_DEFAULT, .block_size = 64 * 1024 };
+ * zxc_cctx *cctx = zxc_init_static_cctx(ws, ws_sz, &opts);
+ *
+ * for (each block) zxc_compress_cctx(cctx, src, n, dst, cap, NULL);
+ *
+ * // zxc_free_cctx is a no-op on a static cctx; the caller owns @c ws.
+ * free(ws);
+ * @endcode
+ * @{
+ */
+
+/**
+ * @brief Returns the exact byte count required by a static compression
+ *        workspace for the given @p block_size and @p level.
+ *
+ * The value is the sum of the opaque @ref zxc_cctx wrapper plus every
+ * persistent sub-buffer the library would partition (hash tables, chain
+ * table, sequence buffers, literal scratch, plus the optimal-parser
+ * scratch at @ref ZXC_LEVEL_DENSITY). Round up to your allocator's
+ * alignment before calling @c posix_memalign / @c aligned_alloc; the
+ * workspace itself must be at least cache-line aligned.
+ *
+ * @param[in] block_size  Block size in bytes (must satisfy the regular
+ *                        block-size constraints: power of two in
+ *                        [@ref ZXC_BLOCK_SIZE_MIN, @ref ZXC_BLOCK_SIZE_MAX]).
+ * @param[in] level       Compression level (1..6); level
+ *                        @ref ZXC_LEVEL_DENSITY (6) adds the optimal-parser
+ *                        scratch (~8.125 x block_size).
+ * @return Workspace size in bytes, or 0 if either argument is invalid.
+ */
+ZXC_EXPORT size_t zxc_static_cctx_workspace_size(const size_t block_size, const int level);
+
+/**
+ * @brief Initialises a compression context inside a caller-supplied
+ *        workspace.
+ *
+ * @p workspace_size must be at least @ref zxc_static_cctx_workspace_size
+ * for the same @c block_size and @c level.  The workspace must remain
+ * valid for the lifetime of the returned handle and must be cache-line
+ * (64-byte) aligned.  The caller owns the workspace; @ref zxc_free_cctx
+ * is a no-op on the returned handle.
+ *
+ * @par Locked parameters
+ * The @c block_size, @c level, and @c checksum_enabled fields of @p opts
+ * are applied at init time, but only @c block_size is locked.  Later
+ * @ref zxc_compress_cctx calls that pass options requesting a different
+ * @c block_size return @ref ZXC_ERROR_BAD_BLOCK_SIZE without
+ * re-initialising.  A different @c level / @c checksum_enabled is
+ * honoured per-call without re-partitioning.
+ *
+ * @param[in,out] workspace       Caller-allocated buffer, cache-line aligned.
+ * @param[in]     workspace_size  Capacity of @p workspace in bytes.
+ * @param[in]     opts            Must be non-NULL: @c block_size and
+ *                                @c level must be set explicitly to size
+ *                                the workspace correctly.
+ * @return Handle pointing inside @p workspace on success, or @c NULL if
+ *         the workspace is too small or the options are invalid.
+ */
+ZXC_EXPORT zxc_cctx* zxc_init_static_cctx(void* workspace, const size_t workspace_size,
+                                          const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Returns the exact byte count required by a static decompression
+ *        workspace for the given @p block_size.
+ *
+ * Unlike the compression variant, this size is independent of the source
+ * archive's level: @c lit_buffer is always provisioned worst-case because
+ * the decoder cannot predict the per-block literal encoding until it sees
+ * each block header.
+ *
+ * @param[in] block_size  Maximum block size the decoder will encounter
+ *                        (must satisfy the regular block-size constraints).
+ * @return Workspace size in bytes, or 0 if @p block_size is invalid.
+ */
+ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size);
+
+/**
+ * @brief Initialises a decompression context inside a caller-supplied
+ *        workspace.
+ *
+ * @p workspace_size must be at least @ref zxc_static_dctx_workspace_size
+ * for the same @p block_size.  The workspace must remain valid for the
+ * lifetime of the returned handle and must be cache-line aligned.  The
+ * caller owns the workspace; @ref zxc_free_dctx is a no-op on the
+ * returned handle.
+ *
+ * @par Locked block size
+ * @p block_size is pinned at init time: feeding the returned handle an
+ * archive whose header declares a different @c block_size returns
+ * @ref ZXC_ERROR_BAD_BLOCK_SIZE.
+ *
+ * @param[in,out] workspace       Caller-allocated buffer, cache-line aligned.
+ * @param[in]     workspace_size  Capacity of @p workspace in bytes.
+ * @param[in]     block_size      Block size the decoder will accept.
+ * @return Handle pointing inside @p workspace on success, or @c NULL if
+ *         the workspace is too small or @p block_size is invalid.
+ */
+ZXC_EXPORT zxc_dctx* zxc_init_static_dctx(void* workspace, const size_t workspace_size,
+                                          const size_t block_size);
+
+/** @} */ /* end of static_context_api */
+/** @} */ /* end of context_api */
+/** @} */ /* end of buffer_api */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // ZXC_BUFFER_H
\ No newline at end of file
diff --git a/thirdparty/zxc/include/zxc_constants.h b/thirdparty/zxc/include/zxc_constants.h
new file mode 100644
index 000000000000..b0fa5fe9f3a4
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_constants.h
@@ -0,0 +1,135 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_constants.h
+ * @brief Public constants: library version and compression levels.
+ *
+ * Include this header to query the library version at compile time or to
+ * reference the predefined compression-level constants used throughout the API.
+ */
+
+#ifndef ZXC_CONSTANTS_H
+#define ZXC_CONSTANTS_H
+
+/**
+ * @defgroup version Library Version
+ * @brief Compile-time version information.
+ * @{
+ */
+
+/** @brief Major version number. */
+#define ZXC_VERSION_MAJOR 0
+/** @brief Minor version number. */
+#define ZXC_VERSION_MINOR 12
+/** @brief Patch version number. */
+#define ZXC_VERSION_PATCH 0
+
+/** @cond INTERNAL */
+#define ZXC_STR_HELPER(x) #x
+#define ZXC_STR(x) ZXC_STR_HELPER(x)
+/** @endcond */
+
+/**
+ * @brief Human-readable version string in "MAJOR.MINOR.PATCH" form (e.g. "0.12.0").
+ */
+#define ZXC_LIB_VERSION_STR    \
+    ZXC_STR(ZXC_VERSION_MAJOR) \
+    "." ZXC_STR(ZXC_VERSION_MINOR) "." ZXC_STR(ZXC_VERSION_PATCH)
+
+/** @} */ /* end of version */
+
+/**
+ * @defgroup block_size Block Size
+ * @brief Block size constraints for compression.
+ *
+ * Block size must be a power of two in range
+ * [@ref ZXC_BLOCK_SIZE_MIN, @ref ZXC_BLOCK_SIZE_MAX].
+ * Pass 0 to any API to use @ref ZXC_BLOCK_SIZE_DEFAULT.
+ * @{
+ */
+/** @brief log2(ZXC_BLOCK_SIZE_MIN) - exponent code for minimum block size. */
+#define ZXC_BLOCK_SIZE_MIN_LOG2 12
+/** @brief log2(ZXC_BLOCK_SIZE_MAX) - exponent code for maximum block size. */
+#define ZXC_BLOCK_SIZE_MAX_LOG2 21
+/** @brief Default block size (512 KB). */
+#define ZXC_BLOCK_SIZE_DEFAULT (512 * 1024)
+/** @brief Minimum allowed block size (4 KB = 2^12). */
+#define ZXC_BLOCK_SIZE_MIN (1U << ZXC_BLOCK_SIZE_MIN_LOG2)
+/** @brief Maximum allowed block size (2 MB = 2^21). */
+#define ZXC_BLOCK_SIZE_MAX (1U << ZXC_BLOCK_SIZE_MAX_LOG2)
+/** @} */ /* end of block_size */
+
+/**
+ * @defgroup dictionary Dictionary
+ * @brief Constants for pre-trained dictionary support.
+ * @{
+ */
+/** @brief Maximum dictionary content size in bytes (64 KB - 1).
+ *
+ * Bounded to a 16-bit value (65535) by two constraints that both cap at the
+ * same number: the `.zxd` header stores the content size in a 16-bit field, and
+ * LZ77 match offsets are 16-bit (max distance 65535). */
+#define ZXC_DICT_SIZE_MAX ((1U << 16) - 1U)
+/** @brief Size of the .zxd dictionary file header in bytes. */
+#define ZXC_DICT_HEADER_SIZE 16
+/** @brief Size in bytes of a packed literal Huffman code-lengths table
+ *         (256 symbols, 4 bits each): the shared table carried by a .zxd
+ *         file and, internally, the per-block lengths header. See
+ *         zxc_train_dict_huf() / zxc_dict_huf(). */
+#define ZXC_HUF_TABLE_SIZE 128
+/** @} */ /* end of dictionary */
+
+/**
+ * @defgroup threading Threading Limits
+ * @brief Bounds on thread-count parameters accepted by the streaming APIs.
+ * @{
+ */
+/** @brief Maximum value accepted for `n_threads` in `zxc_stream_compress`
+ *  / `zxc_stream_decompress`. Higher values are clamped to `ZXC_MAX_THREADS`. */
+#define ZXC_MAX_THREADS 512
+/** @} */ /* end of threading */
+
+/**
+ * @defgroup file_format File Format Constants
+ * @brief On-disk byte sizes of the archive header and footer.
+ *
+ * @{
+ */
+/** @brief File header size: Magic(4) + Version(1) + Chunk(1) + Flags(1) + Reserved(7) + CRC(2). */
+#define ZXC_FILE_HEADER_SIZE 16
+/** @brief File footer size: original_size(8) + global_checksum(4). */
+#define ZXC_FILE_FOOTER_SIZE 12
+/** @} */ /* end of file_format */
+
+/**
+ * @defgroup levels Compression Levels
+ * @brief Predefined compression levels for the ZXC library.
+ *
+ * Higher levels trade encoding speed for better compression ratio.
+ * All levels produce data that can be decompressed at the same speed.
+ * @{
+ */
+
+/**
+ * @brief Enumeration of ZXC compression levels.
+ *
+ * Use one of these constants as the compression level: the @c level field of
+ * zxc_compress_opts_t (zxc_compress() etc.) or a plain @p level argument.
+ */
+typedef enum {
+    ZXC_LEVEL_FASTEST = 1,  /**< Fastest compression, best for real-time applications. */
+    ZXC_LEVEL_FAST = 2,     /**< Fast compression, good for real-time applications. */
+    ZXC_LEVEL_DEFAULT = 3,  /**< Recommended: ratio > LZ4, decode speed > LZ4. */
+    ZXC_LEVEL_BALANCED = 4, /**< Good ratio, good decode speed. */
+    ZXC_LEVEL_COMPACT = 5,  /**< High density. Best for storage/firmware/assets. */
+    ZXC_LEVEL_DENSITY = 6   /**< Maximum density: Huffman-coded literals on top of COMPACT. */
+} zxc_compression_level_t;
+
+/** @} */ /* end of levels */
+
+#endif  // ZXC_CONSTANTS_H
diff --git a/thirdparty/zxc/include/zxc_dict.h b/thirdparty/zxc/include/zxc_dict.h
new file mode 100644
index 000000000000..36d8d1cbd97e
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_dict.h
@@ -0,0 +1,221 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_dict.h
+ * @brief Pre-trained dictionary API for ZXC compression.
+ *
+ * Provides functions to train, save, load, and identify dictionaries that
+ * improve compression ratio on small, similar payloads. Dictionaries are
+ * stored as external `.zxd` files and referenced by a 32-bit ID in the
+ * ZXC file header.
+ *
+ * A dictionary contains raw byte content that prefills the LZ77 sliding
+ * window at the start of each block, giving the compressor immediate
+ * access to representative patterns without waiting for them to appear
+ * in the input stream.
+ *
+ * @code
+ * // Train a dictionary from a corpus of JSON samples
+ * void* dict_buf = malloc(32768);
+ * int64_t dict_sz = zxc_train_dict(samples, sizes, n, dict_buf, 32768);
+ *
+ * // Train the shared literal Huffman table on the same corpus
+ * uint8_t huf[ZXC_HUF_TABLE_SIZE];
+ * zxc_train_dict_huf(samples, sizes, n, dict_buf, dict_sz, huf);
+ *
+ * // Save to .zxd file (content + table)
+ * void* zxd = malloc(zxc_dict_save_bound(dict_sz));
+ * int64_t zxd_sz = zxc_dict_save(dict_buf, dict_sz, huf, zxd, zxc_dict_save_bound(dict_sz));
+ *
+ * // Use for compression
+ * zxc_compress_opts_t opts = {
+ *     .level = 6, .dict = dict_buf, .dict_size = dict_sz, .dict_huf = huf };
+ * zxc_compress(src, src_size, dst, dst_capacity, &opts);
+ * @endcode
+ */
+
+#ifndef ZXC_DICT_H
+#define ZXC_DICT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup dict Dictionary
+ * @brief Pre-trained dictionary training, serialization, and identification.
+ * @{
+ */
+
+/**
+ * @brief Compute the dictionary ID for the given content and optional table.
+ *
+ * The ID is a deterministic 32-bit hash stored in the ZXC file header so the
+ * decoder can verify that the correct dictionary is provided at decompression
+ * time. With @p huf_lengths NULL it hashes the raw content only (the in-memory
+ * content-only dictionary of the buffer API). With a table it binds the
+ * (content, table) pair: `hash(table, seed = hash(content))` -- the value
+ * stored in `.zxd` files and in archives compressed with a shared table.
+ *
+ * @param[in] dict        Pointer to dictionary content.
+ * @param[in] dict_size   Size in bytes.
+ * @param[in] huf_lengths Shared literal Huffman table (@ref ZXC_HUF_TABLE_SIZE
+ *                        bytes), or NULL for a content-only ID.
+ * @return 32-bit dictionary ID. Returns 0 if @p dict is NULL or @p dict_size is 0.
+ */
+ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size, const void* huf_lengths);
+
+/**
+ * @brief Load and validate a `.zxd` dictionary file from a memory buffer.
+ *
+ * On success, @p content_out and @p huf_out (when non-NULL) point into the
+ * input buffer (zero-copy); the caller must keep @p buf alive while they are in
+ * use. A single call yields everything needed to (de)compress with the
+ * dictionary: pass @p content_out / @p huf_out straight to the @c dict /
+ * @c dict_huf option fields.
+ *
+ * @param[in]  buf              Buffer containing the .zxd file.
+ * @param[in]  buf_size         Size of @p buf in bytes.
+ * @param[out] content_out      Receives a pointer to the dictionary content.
+ * @param[out] content_size_out Receives the content size in bytes.
+ * @param[out] huf_out          Receives a pointer to the 128-byte shared Huffman
+ *                              table (may be NULL if not needed).
+ * @param[out] dict_id_out      Receives the dictionary ID (may be NULL).
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int zxc_dict_load(const void* buf, size_t buf_size, const void** content_out,
+                             size_t* content_size_out, const void** huf_out, uint32_t* dict_id_out);
+
+/**
+ * @brief Serialize dictionary content and its shared Huffman table to the
+ *        `.zxd` file format.
+ *
+ * The 128-byte packed code-lengths table (from zxc_train_dict_huf()) is
+ * mandatory and is appended after the content. The stored dict_id covers both
+ * content and table, so archives compressed with this dictionary are bound to
+ * the exact pair.
+ *
+ * @param[in]  content       Raw dictionary content.
+ * @param[in]  content_size  Size of @p content in bytes (max ZXC_DICT_SIZE_MAX).
+ * @param[in]  huf_lengths   128-byte packed Huffman code lengths (required).
+ * @param[out] buf           Output buffer for the .zxd file.
+ * @param[in]  buf_capacity  Capacity of @p buf (see zxc_dict_save_bound()).
+ * @return Number of bytes written on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_dict_save(const void* content, size_t content_size, const void* huf_lengths,
+                                 void* buf, size_t buf_capacity);
+
+/**
+ * @brief Returns the maximum .zxd file size for a given content size.
+ *
+ * @param[in] content_size Size of the dictionary content in bytes.
+ * @return Buffer size sufficient for zxc_dict_save() (header + content + Huffman table).
+ */
+ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size);
+
+/**
+ * @brief Returns the dictionary ID stored in a `.zxd` file buffer.
+ *
+ * Reads the dict_id field from the .zxd header without validating the full
+ * file. Returns 0 if the buffer is too small or the magic word doesn't match.
+ *
+ * @param[in] buf       Buffer containing the .zxd file.
+ * @param[in] buf_size  Size of @p buf in bytes.
+ * @return Dictionary ID, or 0 if the buffer is not a valid .zxd file.
+ */
+ZXC_EXPORT uint32_t zxc_dict_get_id(const void* buf, size_t buf_size);
+
+/**
+ * @brief Train a dictionary from a corpus of samples.
+ *
+ * Analyzes the samples to select byte sequences that maximize LZ77 match
+ * coverage. The resulting dictionary content can be passed directly to
+ * zxc_compress_opts_t::dict or serialized with zxc_dict_save().
+ *
+ * @param[in]  samples        Array of pointers to sample buffers.
+ * @param[in]  sample_sizes   Array of sample sizes in bytes.
+ * @param[in]  n_samples      Number of samples.
+ * @param[out] dict_buf       Output buffer for trained dictionary content.
+ * @param[in]  dict_capacity  Capacity of @p dict_buf (max ZXC_DICT_SIZE_MAX).
+ * @return Size of the trained dictionary on success, or a negative
+ *         @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes,
+                                  size_t n_samples, void* dict_buf, size_t dict_capacity);
+
+/**
+ * @brief Train the shared literal Huffman table for an already-trained dictionary.
+ *
+ * Compresses the samples with @p dict and builds canonical Huffman code
+ * lengths from the real post-LZ literal distribution. The resulting 128-byte
+ * packed table can be embedded in a `.zxd` file via zxc_dict_save() and
+ * passed to the compressor/decompressor via the `dict_huf` option field.
+ * Blocks whose literals compress better with the shared table skip their
+ * per-block 128-byte table header, which is decisive at small block sizes.
+ *
+ * @param[in]  samples         Array of pointers to sample buffers (typically
+ *                             the same corpus used for zxc_train_dict()).
+ * @param[in]  sample_sizes    Array of sample sizes in bytes.
+ * @param[in]  n_samples       Number of samples.
+ * @param[in]  dict            Trained dictionary content.
+ * @param[in]  dict_size       Dictionary content size in bytes.
+ * @param[out] huf_lengths_out Receives the 128-byte packed code-lengths table.
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int zxc_train_dict_huf(const void* const* samples, const size_t* sample_sizes,
+                                  size_t n_samples, const void* dict, size_t dict_size,
+                                  uint8_t* huf_lengths_out);
+
+/**
+ * @brief One-call dictionary creation: train content + shared table, serialize
+ *        to ready-to-write `.zxd` bytes.
+ *
+ * Convenience over the train/train-table/save sequence: it runs
+ * zxc_train_dict() then zxc_train_dict_huf() (which depends on the trained
+ * content) then zxc_dict_save(), writing a complete `.zxd` into @p zxd_buf.
+ * Use zxc_dict_save_bound(ZXC_DICT_SIZE_MAX) for a safe @p zxd_capacity, or
+ * size to the dictionary you expect. The lower-level primitives remain
+ * available for advanced use (raw content-only dictionaries, retraining only
+ * the table, or supplying externally-sourced content).
+ *
+ * @param[in]  samples       Array of pointers to sample buffers.
+ * @param[in]  sample_sizes  Array of sample sizes in bytes.
+ * @param[in]  n_samples     Number of samples.
+ * @param[out] zxd_buf       Output buffer for the `.zxd` file.
+ * @param[in]  zxd_capacity  Capacity of @p zxd_buf.
+ * @return Number of `.zxd` bytes written on success, or a negative
+ *         @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_dict_train(const void* const* samples, const size_t* sample_sizes,
+                                  size_t n_samples, void* zxd_buf, size_t zxd_capacity);
+
+/**
+ * @brief Returns a pointer to the shared Huffman table inside a `.zxd` buffer.
+ *
+ * Zero-copy accessor: the returned pointer aims into @p buf and is valid as
+ * long as @p buf is. Returns NULL if the buffer is not a valid `.zxd` file or
+ * carries no table.
+ *
+ * @param[in] buf       Buffer containing the .zxd file.
+ * @param[in] buf_size  Size of @p buf in bytes.
+ * @return Pointer to the 128-byte packed code-lengths table, or NULL.
+ */
+ZXC_EXPORT const void* zxc_dict_huf(const void* buf, size_t buf_size);
+
+/** @} */ /* end of dict */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_DICT_H */
diff --git a/thirdparty/zxc/include/zxc_error.h b/thirdparty/zxc/include/zxc_error.h
new file mode 100644
index 000000000000..8b3006869c54
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_error.h
@@ -0,0 +1,91 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_error.h
+ * @brief Error codes and error-name lookup for the ZXC library.
+ *
+ * Public functions that return int or int64_t report failure with a negative
+ * @ref zxc_error_t code (a return value < 0 indicates an error); use
+ * zxc_error_name() to convert any code to a human-readable string.
+ */
+
+#ifndef ZXC_ERROR_H
+#define ZXC_ERROR_H
+
+#include "zxc_export.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup error Error Handling
+ * @brief Error codes returned by ZXC library functions.
+ * @{
+ */
+
+/**
+ * @brief Error codes returned by ZXC library functions.
+ *
+ * All error codes are negative integers. Functions that return int or int64_t
+ * will return these codes on failure. Check with `result < 0` for errors.
+ *
+ * Use zxc_error_name() to get a human-readable string for any error code.
+ */
+typedef enum {
+    ZXC_OK = 0, /**< Success (no error). */
+
+    /* Memory errors */
+    ZXC_ERROR_MEMORY = -1, /**< Memory allocation failure. */
+
+    /* Buffer/capacity errors */
+    ZXC_ERROR_DST_TOO_SMALL = -2, /**< Destination buffer too small. */
+    ZXC_ERROR_SRC_TOO_SMALL = -3, /**< Source buffer too small or truncated input. */
+
+    /* Format/header errors */
+    ZXC_ERROR_BAD_MAGIC = -4,    /**< Invalid magic word in file header. */
+    ZXC_ERROR_BAD_VERSION = -5,  /**< Unsupported file format version. */
+    ZXC_ERROR_BAD_HEADER = -6,   /**< Corrupted or invalid header (CRC mismatch). */
+    ZXC_ERROR_BAD_CHECKSUM = -7, /**< Block or global checksum verification failed. */
+
+    /* Data integrity errors */
+    ZXC_ERROR_CORRUPT_DATA = -8, /**< Corrupted compressed data. */
+    ZXC_ERROR_BAD_OFFSET = -9,   /**< Invalid match offset during decompression. */
+    ZXC_ERROR_OVERFLOW = -10,    /**< Buffer overflow detected during processing. */
+
+    /* I/O and argument errors */
+    ZXC_ERROR_IO = -11,         /**< Read/write/seek failure on file. */
+    ZXC_ERROR_NULL_INPUT = -12, /**< Required input pointer is NULL. */
+
+    /* Block errors */
+    ZXC_ERROR_BAD_BLOCK_TYPE = -13, /**< Unknown or unexpected block type. */
+    ZXC_ERROR_BAD_BLOCK_SIZE = -14, /**< Invalid block size. */
+
+    /* Dictionary errors */
+    ZXC_ERROR_DICT_REQUIRED = -15,  /**< File requires a dictionary but none was provided. */
+    ZXC_ERROR_DICT_MISMATCH = -16,  /**< Provided dictionary ID does not match the file header. */
+    ZXC_ERROR_DICT_TOO_LARGE = -17, /**< Dictionary exceeds maximum allowed size. */
+
+} zxc_error_t;
+
+/**
+ * @brief Returns a human-readable name for the given error code.
+ *
+ * @param[in] code An error code from zxc_error_t (or any integer).
+ * @return A constant string such as "ZXC_OK" or "ZXC_ERROR_MEMORY".
+ *         Returns "ZXC_UNKNOWN_ERROR" for unrecognized codes.
+ */
+ZXC_EXPORT const char* zxc_error_name(const int code);
+
+/** @} */ /* end of error */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_ERROR_H */
diff --git a/thirdparty/zxc/include/zxc_export.h b/thirdparty/zxc/include/zxc_export.h
new file mode 100644
index 000000000000..8ca8a5cbf6ea
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_export.h
@@ -0,0 +1,117 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_export.h
+ * @brief Platform-specific symbol visibility macros.
+ *
+ * This header defines the `ZXC_EXPORT`, `ZXC_NO_EXPORT`, and `ZXC_DEPRECATED`
+ * macros that control which symbols are exported from the shared library.
+ *
+ * - Define @c ZXC_STATIC_DEFINE when building or consuming ZXC as a **static**
+ *   library to disable import/export annotations.
+ * - When building the shared library the CMake target defines
+ *   @c zxc_lib_EXPORTS automatically, selecting `dllexport` / `visibility("default")`.
+ * - When consuming the shared library neither macro is defined, so the header
+ *   selects `dllimport` / `visibility("default")`.
+ */
+
+#ifndef ZXC_EXPORT_H
+#define ZXC_EXPORT_H
+
+/**
+ * @defgroup export Symbol Visibility
+ * @brief Macros controlling DLL export/import and deprecation attributes.
+ * @{
+ */
+
+#ifdef ZXC_STATIC_DEFINE
+
+/**
+ * @def ZXC_EXPORT
+ * @brief Marks a symbol as part of the public shared-library API.
+ *
+ * Expands to nothing when building a static library (@c ZXC_STATIC_DEFINE),
+ * to `__declspec(dllexport)` or `__declspec(dllimport)` on Windows, or
+ * to `__attribute__((visibility("default")))` on GCC/Clang.
+ */
+#define ZXC_EXPORT
+
+/**
+ * @def ZXC_NO_EXPORT
+ * @brief Marks a symbol as hidden (not exported from the shared library).
+ *
+ * Expands to nothing for static builds or Windows, and to
+ * `__attribute__((visibility("hidden")))` on GCC/Clang.
+ */
+#define ZXC_NO_EXPORT
+
+#else /* shared library */
+
+#ifndef ZXC_EXPORT
+#ifdef zxc_lib_EXPORTS
+/* Building the library */
+#ifdef _WIN32
+#define ZXC_EXPORT __declspec(dllexport)
+#else
+#define ZXC_EXPORT __attribute__((visibility("default")))
+#endif
+#else
+/* Consuming the library */
+#ifdef _WIN32
+#define ZXC_EXPORT __declspec(dllimport)
+#else
+#define ZXC_EXPORT __attribute__((visibility("default")))
+#endif
+#endif
+#endif
+
+#ifndef ZXC_NO_EXPORT
+#ifdef _WIN32
+#define ZXC_NO_EXPORT
+#else
+#define ZXC_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif
+
+#endif /* ZXC_STATIC_DEFINE */
+
+#ifndef ZXC_DEPRECATED
+/**
+ * @def ZXC_DEPRECATED
+ * @brief Marks a symbol as deprecated.
+ *
+ * The compiler will emit a warning when a deprecated symbol is referenced.
+ * Expands to `__declspec(deprecated)` on Windows (@c _WIN32) or
+ * `__attribute__((__deprecated__))` on GCC/Clang.
+ */
+#ifdef _WIN32
+#define ZXC_DEPRECATED __declspec(deprecated)
+#else
+#define ZXC_DEPRECATED __attribute__((__deprecated__))
+#endif
+#endif
+
+/**
+ * @def ZXC_DEPRECATED_EXPORT
+ * @brief Combines `ZXC_EXPORT` with `ZXC_DEPRECATED`.
+ */
+#ifndef ZXC_DEPRECATED_EXPORT
+#define ZXC_DEPRECATED_EXPORT ZXC_EXPORT ZXC_DEPRECATED
+#endif
+
+/**
+ * @def ZXC_DEPRECATED_NO_EXPORT
+ * @brief Combines `ZXC_NO_EXPORT` with `ZXC_DEPRECATED`.
+ */
+#ifndef ZXC_DEPRECATED_NO_EXPORT
+#define ZXC_DEPRECATED_NO_EXPORT ZXC_NO_EXPORT ZXC_DEPRECATED
+#endif
+
+/** @} */ /* end of export */
+
+#endif /* ZXC_EXPORT_H */
diff --git a/thirdparty/zxc/include/zxc_opts.h b/thirdparty/zxc/include/zxc_opts.h
new file mode 100644
index 000000000000..30533cff0f68
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_opts.h
@@ -0,0 +1,120 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_opts.h
+ * @brief Shared option structures for the ZXC compression APIs.
+ *
+ * Defines @ref zxc_compress_opts_t, @ref zxc_decompress_opts_t and
+ * @ref zxc_progress_callback_t. These types are consumed by every public
+ * ZXC API (one-shot buffer, multi-threaded @c FILE* streaming, push
+ * streaming, seekable).
+ *
+ * This header is never used in isolation: include the API header you
+ * actually use (@c zxc_buffer.h, @c zxc_stream.h, @c zxc_pstream.h, ...)
+ * which pulls this one in transitively.
+ */
+
+#ifndef ZXC_OPTS_H
+#define ZXC_OPTS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Progress callback function type.
+ *
+ * This callback is invoked periodically during compression/decompression to report
+ * progress. It is called from the writer thread after each block is processed.
+ *
+ * @param[in] bytes_processed Total input bytes processed so far.
+ * @param[in] bytes_total     Total input bytes to process (0 if unknown, e.g., stdin).
+ * @param[in] user_data       User-provided context pointer (passed through from API call).
+ *
+ * @note The callback should be fast and non-blocking. Avoid heavy I/O or mutex locks.
+ */
+typedef void (*zxc_progress_callback_t)(uint64_t bytes_processed, uint64_t bytes_total,
+                                        const void* user_data);
+
+/**
+ * @brief Compression options shared by every ZXC compression API.
+ *
+ * Zero-initialise for safe defaults: level 0 maps to ZXC_LEVEL_DEFAULT (3),
+ * block_size 0 maps to ZXC_BLOCK_SIZE_DEFAULT, n_threads 0 means
+ * auto-detect, and all other fields are disabled.
+ *
+ * @code
+ * zxc_compress_opts_t opts = { .level = ZXC_LEVEL_COMPACT };
+ * zxc_stream_compress(f_in, f_out, &opts);
+ * @endcode
+ */
+typedef struct {
+    int n_threads;     /**< Worker thread count (0 = auto-detect CPU cores). */
+    int level;         /**< Compression level 1-6 (0 = default, ZXC_LEVEL_DEFAULT). */
+    size_t block_size; /**< Block size in bytes (0 = default ZXC_BLOCK_SIZE_DEFAULT). Must be power
+                          of 2, [4KB - 2MB]. */
+    int checksum_enabled; /**< 1 to enable per-block and global checksums, 0 to disable. */
+    int seekable;         /**< 1 to append a seek table for random-access decompression. */
+    const void* dict;     /**< Pre-trained dictionary content (NULL = none). */
+    size_t dict_size;     /**< Dictionary size in bytes (0 = none, max ZXC_DICT_SIZE_MAX). */
+    const void* dict_huf; /**< Optional shared literal Huffman table: 128-byte packed
+                               code-lengths header from zxc_train_dict_huf() /
+                               zxc_dict_huf() (NULL = none; ignored without dict).
+                               Becomes part of the archive's dict_id binding. */
+    zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */
+    void* user_data;                     /**< User context pointer passed to progress_cb. */
+} zxc_compress_opts_t;
+
+/**
+ * @brief Decompression options shared by every ZXC decompression API.
+ *
+ * Zero-initialise for safe defaults.
+ *
+ * @code
+ * zxc_decompress_opts_t opts = { .checksum_enabled = 1 };
+ * zxc_stream_decompress(f_in, f_out, &opts);
+ * @endcode
+ */
+typedef struct {
+    int n_threads;        /**< Worker thread count (0 = auto-detect CPU cores). */
+    int checksum_enabled; /**< 1 to verify per-block and global checksums, 0 to skip. */
+    const void* dict;     /**< Pre-trained dictionary content (NULL = none). */
+    size_t dict_size;     /**< Dictionary size in bytes (0 = none). */
+    const void* dict_huf; /**< Optional shared literal Huffman table: 128-byte packed
+                               code-lengths header matching the one used at
+                               compression time (NULL = none; ignored without dict). */
+    zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */
+    void* user_data;                     /**< User context pointer passed to progress_cb. */
+} zxc_decompress_opts_t;
+
+/**
+ * @brief Returns `sizeof(zxc_compress_opts_t)` as compiled into the library.
+ *
+ * Layout guard for bindings that mirror the options structs by hand (raw FFI
+ * declarations, byte-offset serialization) instead of compiling against this
+ * header: comparing the mirrored size against this value at load time turns a
+ * silent layout drift (undefined behaviour) into an immediate, explicit error.
+ */
+ZXC_EXPORT size_t zxc_compress_opts_size(void);
+
+/**
+ * @brief Returns `sizeof(zxc_decompress_opts_t)` as compiled into the library.
+ * @see zxc_compress_opts_size
+ */
+ZXC_EXPORT size_t zxc_decompress_opts_size(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_OPTS_H */
diff --git a/thirdparty/zxc/include/zxc_pstream.h b/thirdparty/zxc/include/zxc_pstream.h
new file mode 100644
index 000000000000..0d6412866ff2
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_pstream.h
@@ -0,0 +1,311 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_pstream.h
+ * @brief Push-based, single-threaded streaming compression and decompression.
+ *
+ * This header exposes a non-blocking, caller-driven streaming API, the
+ * counterpart of the @c FILE*-based @ref zxc_stream_compress / @ref
+ * zxc_stream_decompress.  Where the @c FILE* API takes ownership of the
+ * pipeline (it reads until EOF and writes until done), the push API hands the
+ * control back to the caller: feed input chunks when available, drain output
+ * chunks when ready, finalise on demand.
+ *
+ * Use this API when you need to integrate ZXC into:
+ * - a callback-driven library;
+ * - an asynchronous event loop;
+ * - a network protocol that streams data without seeking (HTTP chunked
+ *   transfer, gRPC, custom binary protocols);
+ * - any pipeline where you cannot block on a @c FILE*.
+ *
+ * The API is single-threaded: one context is processed by one thread at a
+ * time.  For multi-threaded compression of a single file end-to-end, use
+ * @ref zxc_stream_compress instead.
+ *
+ * @par Compression usage
+ * @code
+ * zxc_compress_opts_t opts = { .level = 3, .checksum_enabled = 1 };
+ * zxc_cstream* cs = zxc_cstream_create(&opts);
+ *
+ * uint8_t in_buf[64*1024], out_buf[64*1024];
+ * zxc_outbuf_t out = { out_buf, sizeof out_buf, 0 };
+ *
+ * ssize_t n;
+ * while ((n = read_some(in_buf, sizeof in_buf)) > 0) {
+ *     zxc_inbuf_t in = { in_buf, (size_t)n, 0 };
+ *     while (in.pos < in.size) {
+ *         int64_t r = zxc_cstream_compress(cs, &out, &in);
+ *         if (r < 0) goto fatal;
+ *         if (out.pos > 0) { write_to_sink(out_buf, out.pos); out.pos = 0; }
+ *     }
+ * }
+ *
+ * int64_t pending;
+ * do {
+ *     pending = zxc_cstream_end(cs, &out);
+ *     if (pending < 0) goto fatal;
+ *     if (out.pos > 0) { write_to_sink(out_buf, out.pos); out.pos = 0; }
+ * } while (pending > 0);
+ *
+ * zxc_cstream_free(cs);
+ * @endcode
+ *
+ * @see zxc_stream.h  for the multi-threaded @c FILE*-based pipeline.
+ * @see zxc_buffer.h  for one-shot in-memory compression.
+ */
+
+#ifndef ZXC_PSTREAM_H
+#define ZXC_PSTREAM_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+#include "zxc_opts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup pstream Push Streaming API
+ * @brief Caller-driven, single-threaded streaming compression and decompression.
+ * @{
+ */
+
+/**
+ * @brief Input buffer descriptor for push streaming.
+ *
+ * The caller fills @c src with bytes to feed in and sets @c size to their
+ * count.  The library advances @c pos as it consumes input; the caller must
+ * not modify @c pos between calls.
+ */
+typedef struct {
+    const void* src; /**< Caller-owned input bytes. */
+    size_t size;     /**< Total bytes available in @c src. */
+    size_t pos;      /**< Bytes already consumed by the library (in/out). */
+} zxc_inbuf_t;
+
+/**
+ * @brief Output buffer descriptor for push streaming.
+ *
+ * The caller provides a writable region of capacity @c size starting at
+ * @c dst.  The library writes starting at @c dst+pos and advances @c pos by
+ * the number of bytes produced.  The caller drains @c [dst, dst+pos) and
+ * resets @c pos to 0 between rounds (or grows @c size).
+ */
+typedef struct {
+    void* dst;   /**< Caller-owned output region. */
+    size_t size; /**< Total capacity available at @c dst. */
+    size_t pos;  /**< Bytes already produced by the library (in/out). */
+} zxc_outbuf_t;
+
+/* Opaque streaming contexts. */
+/** @brief Opaque push-model compression stream (see @ref zxc_cstream_create). */
+typedef struct zxc_cstream_s zxc_cstream;
+/** @brief Opaque push-model decompression stream (see @ref zxc_dstream_create). */
+typedef struct zxc_dstream_s zxc_dstream;
+
+/* ===== Compression =================================================== */
+
+/**
+ * @brief Creates a push compression stream.
+ *
+ * All settings from @p opts are copied into the context.  After this call,
+ * the @p opts struct may be freed or reused.
+ *
+ * Only @c level, @c block_size, and @c checksum_enabled are honoured.
+ * @c n_threads is ignored (this API is single-threaded; use
+ * @ref zxc_stream_compress for the multi-threaded @c FILE* pipeline).
+ *
+ * If @p opts is not @c NULL, the honoured fields must contain valid values.
+ * Invalid option values (for example an unsupported @c block_size) cause
+ * stream creation to fail.
+ *
+ * @param[in] opts  Compression options, or @c NULL for all defaults.
+ * @return Allocated context to be released with @ref zxc_cstream_free,
+ *         or @c NULL if stream creation fails due to memory allocation
+ *         failure or invalid option values in @p opts.
+ */
+ZXC_EXPORT zxc_cstream* zxc_cstream_create(const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Releases a compression stream and all internal buffers.
+ *
+ * Safe to call with @c NULL (no-op).
+ *
+ * @param[in] cs  Stream returned by @ref zxc_cstream_create.
+ */
+ZXC_EXPORT void zxc_cstream_free(zxc_cstream* cs);
+
+/**
+ * @brief Pushes input bytes into the stream and drains compressed output.
+ *
+ * Reads from @c in->src starting at @c in->pos, writes to @c out->dst
+ * starting at @c out->pos, advancing both as data flows.  Each call makes as
+ * much progress as either buffer allows in a single visit:
+ *
+ * - emits the file header on the first invocation (16 B);
+ * - copies input into the internal block accumulator;
+ * - whenever the accumulator is full, compresses one block and writes it
+ *   into @p out (up to @c out->size);
+ * - returns when @p in is fully consumed *and* no more compressed bytes are
+ *   pending, or when @p out has no room left.
+ *
+ * The function is fully reentrant: if @p out fills mid-block, the next call
+ * resumes draining from where the previous left off.  Safe to call with
+ * @c in->size == in->pos (drain-only mode).
+ *
+ * @par Errors
+ * On failure the context becomes errored (sticky): every subsequent call to
+ * @ref zxc_cstream_compress / @ref zxc_cstream_end returns the same negative
+ * code without doing further work.  Only @ref zxc_cstream_free is safe.
+ *
+ * @param[in,out] cs   Compression stream.
+ * @param[in,out] out  Output descriptor; @c pos is advanced by produced bytes.
+ * @param[in,out] in   Input descriptor;  @c pos is advanced by consumed bytes.
+ *
+ * @return @c 0 if @p in was fully consumed and no compressed bytes remain
+ *         pending in the internal staging area;
+ *         @c >0 number of bytes still pending, drain @p out and call again
+ *         with the same (or new) input;
+ *         @c <0 a @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_cstream_compress(zxc_cstream* cs, zxc_outbuf_t* out, zxc_inbuf_t* in);
+
+/**
+ * @brief Finalises the stream: flushes pending data, writes EOF block + footer.
+ *
+ * Must be called after the last @ref zxc_cstream_compress invocation to
+ * produce a valid ZXC file.  Like @ref zxc_cstream_compress, this function
+ * is reentrant: if @p out fills before everything is drained, it returns a
+ * positive count and the caller drains and calls again.
+ *
+ * After @ref zxc_cstream_end returns @c 0, the stream is in DONE state and
+ * any further call returns @c ZXC_ERROR_NULL_INPUT (use @ref
+ * zxc_cstream_free to release).
+ *
+ * @param[in,out] cs   Compression stream.
+ * @param[in,out] out  Output descriptor.
+ *
+ * @return @c 0 finalisation complete (file is now valid);
+ *         @c >0 bytes still pending, drain @p out and call again;
+ *         @c <0 a @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_cstream_end(zxc_cstream* cs, zxc_outbuf_t* out);
+
+/**
+ * @brief Suggested input chunk size for best throughput.
+ *
+ * Equal to the configured block size (default 512 KB).  The caller may
+ * supply any input chunk; this is purely a performance hint.
+ *
+ * @param[in] cs  Compression stream.
+ * @return Suggested @c in_buf capacity in bytes, or 0 if @p cs is @c NULL.
+ */
+ZXC_EXPORT size_t zxc_cstream_in_size(const zxc_cstream* cs);
+
+/**
+ * @brief Suggested output chunk size to never trigger a partial drain.
+ *
+ * Sized to hold one full compressed block plus framing overhead.  Smaller
+ * outputs work but may force the caller into an extra drain loop.
+ *
+ * @param[in] cs  Compression stream.
+ * @return Suggested @c out_buf capacity in bytes, or 0 if @p cs is @c NULL.
+ */
+ZXC_EXPORT size_t zxc_cstream_out_size(const zxc_cstream* cs);
+
+/* ===== Decompression ================================================= */
+
+/**
+ * @brief Creates a push decompression stream.
+ *
+ * All settings from @p opts are copied into the context.  Only
+ * @c checksum_enabled is honoured (controls whether per-block and global
+ * checksums are verified when present).
+ *
+ * @param[in] opts  Decompression options, or @c NULL for defaults.
+ * @return Allocated context to be released with @ref zxc_dstream_free,
+ *         or @c NULL on allocation failure.
+ */
+ZXC_EXPORT zxc_dstream* zxc_dstream_create(const zxc_decompress_opts_t* opts);
+
+/**
+ * @brief Releases a decompression stream.  Safe to call with @c NULL.
+ *
+ * @param[in] ds  Stream returned by @ref zxc_dstream_create.
+ */
+ZXC_EXPORT void zxc_dstream_free(zxc_dstream* ds);
+
+/**
+ * @brief Pushes compressed input and drains decompressed output.
+ *
+ * Internally runs a parser state machine: file header -> per-block
+ * (header + payload + optional checksum) -> EOF block -> optional SEK block ->
+ * file footer.  Each call makes as much progress as @p in and @p out allow.
+ *
+ * @par End of stream
+ * When the decoder reaches the file footer and validates it, the stream
+ * enters DONE state.  Subsequent calls return @c 0 without producing more
+ * output, even if extra bytes remain in @p in (those trailing bytes are
+ * silently ignored, the caller may use the residual @c in->pos to detect
+ * how much real data was consumed).
+ *
+ * @par Errors
+ * Sticky: once a negative code is returned, further calls keep returning it.
+ *
+ * @param[in,out] ds   Decompression stream.
+ * @param[in,out] out  Output descriptor; @c pos advanced by produced bytes.
+ * @param[in,out] in   Input descriptor;  @c pos advanced by consumed bytes.
+ *
+ * @return @c >0 number of decompressed bytes written into @p out this call;
+ *         @c 0 stream complete (DONE) or no progress possible (caller should
+ *         feed more input);
+ *         @c <0 a @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_dstream_decompress(zxc_dstream* ds, zxc_outbuf_t* out, zxc_inbuf_t* in);
+
+/**
+ * @brief Reports whether the decoder has fully consumed a valid stream.
+ *
+ * Returns @c 1 iff the parser has reached the file footer **and** validated
+ * it.  Callers that have finished feeding input use this to detect truncated
+ * streams: if @ref zxc_dstream_decompress returns @c 0 with no output and
+ * @ref zxc_dstream_finished returns @c 0, the input ended prematurely.
+ *
+ * @param[in] ds  Decompression stream.
+ * @return @c 1 if DONE, @c 0 otherwise (including errored).
+ */
+ZXC_EXPORT int zxc_dstream_finished(const zxc_dstream* ds);
+
+/**
+ * @brief Suggested input chunk size for the decompressor.
+ *
+ * @param[in] ds  Decompression stream.
+ * @return Suggested @c in_buf capacity in bytes, or 0 if @p ds is @c NULL.
+ */
+ZXC_EXPORT size_t zxc_dstream_in_size(const zxc_dstream* ds);
+
+/**
+ * @brief Suggested output chunk size for the decompressor.
+ *
+ * Sized to hold at least one full decompressed block.
+ *
+ * @param[in] ds  Decompression stream.
+ * @return Suggested @c out_buf capacity in bytes, or 0 if @p ds is @c NULL.
+ */
+ZXC_EXPORT size_t zxc_dstream_out_size(const zxc_dstream* ds);
+
+/** @} */ /* end of pstream */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_PSTREAM_H */
diff --git a/thirdparty/zxc/include/zxc_seekable.h b/thirdparty/zxc/include/zxc_seekable.h
new file mode 100644
index 000000000000..be978a62f0c2
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_seekable.h
@@ -0,0 +1,286 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_seekable.h
+ * @brief Seekable compression and random-access decompression API.
+ *
+ * This header provides functions to produce seekable ZXC archives and to
+ * decompress arbitrary byte ranges without reading the entire file.
+ *
+ * A seekable archive embeds a Seek Table block (block_type = @c ZXC_BLOCK_SEK)
+ * after the EOF block, recording the compressed size of every block.
+ * The table is detected at read time by deriving @c num_blocks from the file
+ * footer's total decompressed size and the header's block size, then seeking
+ * backward to validate the SEK block header.
+ * Standard (non-seekable) decompressors ignore the seek table entirely.
+ *
+ * This header is freestanding: it depends only on @c <stddef.h>, @c <stdint.h>
+ * and the rest of the ZXC public API. It does not pull in @c <stdio.h>, so it
+ * is includable from kernel-space or other freestanding environments.
+ * The @c FILE*-based @ref zxc_seekable_open_file entry point lives in the
+ * companion header @c zxc_stream.h.
+ *
+ * @par Creating a seekable archive
+ * @code
+ * zxc_compress_opts_t opts = { .level = 3, .seekable = 1 };
+ * int64_t csize = zxc_compress(src, src_size, dst, dst_cap, &opts);
+ * @endcode
+ *
+ * @par Random-access decompression (buffer-backed)
+ * @code
+ * zxc_seekable* s = zxc_seekable_open(compressed, csize);
+ * int64_t n = zxc_seekable_decompress_range(s, out, out_cap, offset, len);
+ * zxc_seekable_free(s);
+ * @endcode
+ *
+ * @par Random-access decompression (custom storage)
+ * @code
+ * zxc_reader_t r = { .read_at = my_read_at, .ctx = my_state, .size = total };
+ * zxc_seekable* s = zxc_seekable_open_reader(&r);
+ * @endcode
+ */
+
+#ifndef ZXC_SEEKABLE_H
+#define ZXC_SEEKABLE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup seekable_api Seekable API
+ * @brief Random-access compression and decompression.
+ * @{
+ */
+
+/* ========================================================================= */
+/*  Seekable Reader (Random-Access Decompression)                            */
+/* ========================================================================= */
+
+/**
+ * @brief Opaque handle for a seekable ZXC archive.
+ *
+ * Created by zxc_seekable_open(), zxc_seekable_open_reader(), or
+ * zxc_seekable_open_file() (see @c zxc_stream.h).
+ * Must be freed with zxc_seekable_free().
+ */
+typedef struct zxc_seekable_s zxc_seekable;
+
+/**
+ * @brief Opens a seekable archive from a memory buffer.
+ *
+ * Parses the seek table from the end of the buffer and builds the internal
+ * block index.  The buffer must remain valid for the lifetime of the handle.
+ *
+ * @param[in] src       Pointer to the compressed data.
+ * @param[in] src_size  Size of the compressed data in bytes.
+ * @return Handle on success, or @c NULL if the buffer does not contain a
+ *         valid seekable archive (e.g. missing seek block, bad block type).
+ */
+ZXC_EXPORT zxc_seekable* zxc_seekable_open(const void* src, const size_t src_size);
+
+/**
+ * @brief Storage-agnostic reader interface for seekable archives.
+ *
+ * Lets the caller plug any backend (mmap, HTTP range requests, S3, a custom
+ * VFS, kernel @c vfs_read, etc.) behind the seekable reader.  The reader
+ * exposes positional reads only; no seeking state is implied.
+ *
+ * @par Thread safety
+ * @c read_at MUST be safe to call concurrently from multiple threads when the
+ * resulting handle is used with zxc_seekable_decompress_range_mt().  The
+ * single-threaded path makes no concurrent calls.
+ *
+ * @par Lifetime
+ * Both @c ctx and the backing storage must remain valid for the lifetime of
+ * the returned zxc_seekable handle (until zxc_seekable_free()).
+ */
+typedef struct {
+    /**
+     * @brief Reads exactly @c len bytes at @c offset into @c dst.
+     *
+     * @param[in,out] ctx     Opaque user context (forwarded from zxc_reader_t::ctx).
+     * @param[out]    dst     Destination buffer.
+     * @param[in]     len     Number of bytes to read.
+     * @param[in]     offset  Byte offset from the start of the archive.
+     * @return Number of bytes read (@c == @c len on success), or a negative
+     *         @ref zxc_error_t code on failure.  Short reads are treated as
+     *         errors by the seekable reader.
+     */
+    int64_t (*read_at)(void* ctx, void* dst, size_t len, uint64_t offset);
+
+    /** @brief Opaque user context passed unchanged to @c read_at. */
+    void* ctx;
+
+    /** @brief Total size of the compressed archive in bytes. */
+    uint64_t size;
+} zxc_reader_t;
+
+/**
+ * @brief Opens a seekable archive through a user-supplied reader.
+ *
+ * The reader is invoked to fetch the file header, footer, and seek table at
+ * open time, then again on every block read during decompression.  Use this
+ * entry point to back the seekable API with any storage that supports
+ * positional reads (e.g. mmap, HTTP, S3, a kernel file descriptor).
+ *
+ * @param[in] r  Reader interface (must remain valid for the handle lifetime).
+ * @return Handle on success, or @c NULL on error.
+ */
+ZXC_EXPORT zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r);
+
+/**
+ * @brief Returns the total number of blocks in the seekable archive.
+ *
+ * @param[in] s  Seekable handle.
+ * @return Number of data blocks (excluding EOF).
+ */
+ZXC_EXPORT uint32_t zxc_seekable_get_num_blocks(const zxc_seekable* s);
+
+/**
+ * @brief Returns the total decompressed size of the seekable archive.
+ *
+ * @param[in] s  Seekable handle.
+ * @return Total decompressed size in bytes.
+ */
+ZXC_EXPORT uint64_t zxc_seekable_get_decompressed_size(const zxc_seekable* s);
+
+/**
+ * @brief Returns the compressed size of a specific block.
+ *
+ * This is the "on-disk" size including block header, payload, and optional
+ * per-block checksum.
+ *
+ * @param[in] s          Seekable handle.
+ * @param[in] block_idx  Zero-based block index.
+ * @return Compressed block size, or 0 if @p block_idx is out of range.
+ */
+ZXC_EXPORT uint32_t zxc_seekable_get_block_comp_size(const zxc_seekable* s,
+                                                     const uint32_t block_idx);
+
+/**
+ * @brief Returns the decompressed size of a specific block.
+ *
+ * @param[in] s          Seekable handle.
+ * @param[in] block_idx  Zero-based block index.
+ * @return Decompressed block size, or 0 if @p block_idx is out of range.
+ */
+ZXC_EXPORT uint32_t zxc_seekable_get_block_decomp_size(const zxc_seekable* s,
+                                                       const uint32_t block_idx);
+
+/**
+ * @brief Decompresses an arbitrary byte range from the original data.
+ *
+ * Only the blocks overlapping [@p offset, @p offset + @p len) are read and
+ * decompressed.  This is the core random-access primitive.
+ *
+ * @param[in,out] s            Seekable handle.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst (must be >= @p len).
+ * @param[in]     offset       Byte offset into the original uncompressed data.
+ * @param[in]     len          Number of bytes to decompress.
+ * @return Number of bytes written to @p dst (== @p len on success),
+ *         or a negative @ref zxc_error_t code on failure.
+ */
+ZXC_EXPORT int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst,
+                                                 const size_t dst_capacity, const uint64_t offset,
+                                                 const size_t len);
+
+/**
+ * @brief Multi-threaded variant of zxc_seekable_decompress_range().
+ *
+ * Decompresses blocks in parallel using a fork-join thread pool.  Each worker
+ * thread owns its own decompression context and reads compressed data via
+ * @c pread() (POSIX) or @c ReadFile() (Windows) for lock-free concurrent I/O.
+ *
+ * Falls back to single-threaded mode when @p n_threads <= 1 or when the
+ * requested range spans a single block.
+ *
+ * @param[in,out] s            Seekable handle.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst (must be >= @p len).
+ * @param[in]     offset       Byte offset into the original uncompressed data.
+ * @param[in]     len          Number of bytes to decompress.
+ * @param[in]     n_threads    Number of worker threads (0 = auto-detect CPU cores).
+ * @return Number of bytes written to @p dst (== @p len on success),
+ *         or a negative @ref zxc_error_t code on failure.
+ */
+ZXC_EXPORT int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst,
+                                                    const size_t dst_capacity,
+                                                    const uint64_t offset, const size_t len,
+                                                    int n_threads);
+
+/**
+ * @brief Frees a seekable handle and all associated resources.
+ *
+ * Safe to call with @c NULL.
+ *
+ * @param[in] s  Handle to free.
+ */
+ZXC_EXPORT void zxc_seekable_free(zxc_seekable* s);
+
+/**
+ * @brief Attach a pre-trained dictionary to a seekable handle.
+ *
+ * The dictionary content and table are copied internally; the caller may free
+ * them after this call returns. Must be called before any
+ * zxc_seekable_decompress_range() call.
+ *
+ * @param[in,out] s     Seekable handle.
+ * @param[in] dict      Dictionary content.
+ * @param[in] dict_size Size in bytes (max ZXC_DICT_SIZE_MAX).
+ * @param[in] dict_huf  Shared literal Huffman table (128 bytes, see
+ *                      zxc_dict_huf()), or NULL if the archive was compressed
+ *                      without one. Must match the compression-time pair: the
+ *                      archive's dict_id binds (dict, table).
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, size_t dict_size,
+                                     const void* dict_huf);
+
+/* ========================================================================= */
+/*  Seek Table Writer (low-level)                                            */
+/* ========================================================================= */
+
+/**
+ * @brief Writes a seek table to the destination buffer.
+ *
+ * This is a low-level helper used internally by the seekable compression
+ * paths.  It writes: block_header(8) + N entries(4 each).
+ * Each entry stores only @c comp_size; decompressed sizes are derived at
+ * read time from the file header's block_size.
+ *
+ * @param[out] dst             Destination buffer.
+ * @param[in]  dst_capacity    Capacity of @p dst in bytes.
+ * @param[in]  comp_sizes      Array of compressed block sizes.
+ * @param[in]  num_blocks      Number of blocks.
+ * @return Number of bytes written, or a negative @ref zxc_error_t on failure.
+ */
+ZXC_EXPORT int64_t zxc_write_seek_table(uint8_t* dst, const size_t dst_capacity,
+                                        const uint32_t* comp_sizes, const uint32_t num_blocks);
+
+/**
+ * @brief Returns the encoded size of a seek table for the given block count.
+ *
+ * @param[in] num_blocks     Number of blocks.
+ * @return Total byte size of the seek table.
+ */
+ZXC_EXPORT size_t zxc_seek_table_size(const uint32_t num_blocks);
+
+/** @} */ /* end of seekable_api */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_SEEKABLE_H */
diff --git a/thirdparty/zxc/include/zxc_stream.h b/thirdparty/zxc/include/zxc_stream.h
new file mode 100644
index 000000000000..75ea9e04b25a
--- /dev/null
+++ b/thirdparty/zxc/include/zxc_stream.h
@@ -0,0 +1,132 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_stream.h
+ * @brief @c FILE*-flavored variants of the ZXC API.
+ *
+ * Groups the public entry points that depend on @c <stdio.h> so they can be
+ * cleanly excluded from kernel / freestanding builds (which include
+ * @c zxc_buffer.h, @c zxc_pstream.h, and the storage-agnostic part of
+ * @c zxc_seekable.h instead).
+ *
+ * Two subsystems live here:
+ *
+ * 1. **Multi-threaded streaming driver**: reads from a @c FILE* input and
+ *    writes compressed (or decompressed) output to a @c FILE*.  Internally
+ *    the driver uses an asynchronous Producer-Consumer pipeline via a ring
+ *    buffer to separate I/O from CPU-intensive work:
+ *      - Reader thread: reads chunks from @c f_in.
+ *      - Worker threads: compress/decompress chunks in parallel.
+ *      - Writer thread: orders the results and writes them to @c f_out.
+ *    Functions: @ref zxc_stream_compress, @ref zxc_stream_decompress,
+ *    @ref zxc_stream_get_decompressed_size.
+ *
+ * 2. **Seekable @c FILE* open helper**: thin wrapper that adapts a
+ *    @c FILE* into a thread-safe @c pread / @c ReadFile-backed
+ *    @ref zxc_reader_t and delegates to @ref zxc_seekable_open_reader.
+ *    Function: @ref zxc_seekable_open_file.
+ *
+ * @see zxc_buffer.h   for the simple one-shot buffer API.
+ * @see zxc_pstream.h  for single-threaded push-based streaming.
+ * @see zxc_seekable.h for the storage-agnostic seekable reader.
+ */
+
+#ifndef ZXC_STREAM_H
+#define ZXC_STREAM_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "zxc_export.h"
+#include "zxc_opts.h"
+#include "zxc_seekable.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup stream_api Streaming API
+ * @brief Multi-threaded, FILE*-based compression and decompression.
+ * @{
+ */
+
+/**
+ * @brief Compresses data from an input stream to an output stream.
+ *
+ * This function sets up a multi-threaded pipeline:
+ * 1. Reader Thread: Reads chunks from f_in.
+ * 2. Worker Threads: Compress chunks in parallel (LZ77 + Bitpacking).
+ * 3. Writer Thread: Orders the processed chunks and writes them to f_out.
+ *
+ * @param[in] f_in   Input file stream (must be opened in "rb" mode).
+ * @param[out] f_out  Output file stream (must be opened in "wb" mode).
+ * @param[in] opts   Compression options (NULL uses all defaults).
+ *
+ * @return Total compressed bytes written, or a negative zxc_error_t code (e.g.,
+ * ZXC_ERROR_IO) if an error occurred.
+ */
+ZXC_EXPORT int64_t zxc_stream_compress(FILE* f_in, FILE* f_out, const zxc_compress_opts_t* opts);
+
+/**
+ * @brief Decompresses data from an input stream to an output stream.
+ *
+ * Uses the same pipeline architecture as compression to maximize throughput.
+ *
+ * @param[in] f_in   Input file stream (must be opened in "rb" mode).
+ * @param[out] f_out  Output file stream (must be opened in "wb" mode).
+ * @param[in] opts   Decompression options (NULL uses all defaults).
+ *
+ * @return Total decompressed bytes written, or a negative zxc_error_t code (e.g.,
+ * ZXC_ERROR_BAD_HEADER) if an error occurred.
+ */
+ZXC_EXPORT int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out,
+                                         const zxc_decompress_opts_t* opts);
+
+/**
+ * @brief Returns the decompressed size stored in a ZXC compressed file.
+ *
+ * This function reads the file footer to extract the original uncompressed size
+ * without performing any decompression. The file position is restored after reading.
+ *
+ * @param[in] f_in  Input file stream (must be opened in "rb" mode).
+ *
+ * @return The original uncompressed size in bytes, or a negative zxc_error_t code (e.g.,
+ * ZXC_ERROR_BAD_MAGIC) if the file is invalid or an I/O error occurred.
+ */
+ZXC_EXPORT int64_t zxc_stream_get_decompressed_size(FILE* f_in);
+
+/* ========================================================================= */
+/*  Seekable FILE* open helper                                               */
+/* ========================================================================= */
+
+/**
+ * @brief Opens a seekable archive from a @c FILE*.
+ *
+ * Internally builds a @ref zxc_reader_t that performs thread-safe positional
+ * reads (@c pread on POSIX, @c ReadFile + @c OVERLAPPED on Windows) on the
+ * file descriptor extracted from @p f, then delegates to
+ * @ref zxc_seekable_open_reader.  The current file position is saved and
+ * restored.  The @c FILE* must remain open for the lifetime of the handle.
+ *
+ * Lives here (next to the other @c FILE*-based entry points) rather than in
+ * @c zxc_seekable.h so the latter remains freestanding (kernel-includable).
+ *
+ * @param[in] f  File opened in @c "rb" mode (must be seekable, not a pipe).
+ * @return Handle on success (free with @ref zxc_seekable_free), or @c NULL
+ *         on error.
+ */
+ZXC_EXPORT zxc_seekable* zxc_seekable_open_file(FILE* f);
+
+/** @} */ /* end of stream_api */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // ZXC_STREAM_H
\ No newline at end of file
diff --git a/thirdparty/zxc/src/lib/vendors/rapidhash.h b/thirdparty/zxc/src/lib/vendors/rapidhash.h
new file mode 100644
index 000000000000..88de4bc431d1
--- /dev/null
+++ b/thirdparty/zxc/src/lib/vendors/rapidhash.h
@@ -0,0 +1,568 @@
+/*
+ * rapidhash V3 - Very fast, high quality, platform-independent hashing algorithm.
+ *
+ * Based on 'wyhash', by Wang Yi <godspeed_china@yeah.net>
+ * 
+ * Copyright (C) 2025 Nicolas De Carli
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * You can contact the author at:
+ *   - rapidhash source repository: https://github.com/Nicoshev/rapidhash
+ */
+
+ #pragma once
+ 
+/*
+ *  Includes.
+ */
+ #include <stdint.h>
+ #include <string.h>
+ #if defined(_MSC_VER)
+ # include <intrin.h>
+ # if defined(_M_X64) && !defined(_M_ARM64EC)
+ #   pragma intrinsic(_umul128)
+ # endif
+ #endif
+ 
+ /*
+  *  C/C++ macros.
+  */
+ 
+ #ifdef _MSC_VER
+ # define RAPIDHASH_ALWAYS_INLINE __forceinline
+ #elif defined(__GNUC__)
+ # define RAPIDHASH_ALWAYS_INLINE inline __attribute__((__always_inline__))
+ #else
+ # define RAPIDHASH_ALWAYS_INLINE inline
+ #endif
+ 
+ #ifdef __cplusplus
+ # define RAPIDHASH_NOEXCEPT noexcept
+ # define RAPIDHASH_CONSTEXPR constexpr
+ # ifndef RAPIDHASH_INLINE
+ #   define RAPIDHASH_INLINE RAPIDHASH_ALWAYS_INLINE
+ # endif
+ # if __cplusplus >= 201402L && !defined(_MSC_VER)
+ #   define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_ALWAYS_INLINE constexpr
+ # else
+ #   define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_ALWAYS_INLINE
+ # endif
+ #else
+ # define RAPIDHASH_NOEXCEPT
+ # define RAPIDHASH_CONSTEXPR static const
+ # ifndef RAPIDHASH_INLINE
+ #   define RAPIDHASH_INLINE static RAPIDHASH_ALWAYS_INLINE
+ # endif
+ # define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_INLINE
+ #endif
+
+ /*
+  *  Unrolled macro.
+  *  Improves large input speed, but increases code size and worsens small input speed.
+  *
+  *  RAPIDHASH_COMPACT: Normal behavior.
+  *  RAPIDHASH_UNROLLED: Bulk loop of rapidhash_internal consumes 224 bytes per iteration instead of 112.
+  *
+  */
+  #ifndef RAPIDHASH_UNROLLED
+  # define RAPIDHASH_COMPACT
+  #elif defined(RAPIDHASH_COMPACT)
+  # error "cannot define RAPIDHASH_COMPACT and RAPIDHASH_UNROLLED simultaneously."
+  #endif
+ 
+ /*
+  *  Protection macro, alters behaviour of rapid_mum multiplication function.
+  *
+  *  RAPIDHASH_FAST: Normal behavior, max speed.
+  *  RAPIDHASH_PROTECTED: Extra protection against entropy loss.
+  */
+ #ifndef RAPIDHASH_PROTECTED
+ # define RAPIDHASH_FAST
+ #elif defined(RAPIDHASH_FAST)
+ # error "cannot define RAPIDHASH_PROTECTED and RAPIDHASH_FAST simultaneously."
+ #endif
+ 
+ /*
+  *  Likely and unlikely macros.
+  */
+ #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+ # define _likely_(x)  __builtin_expect(x,1)
+ # define _unlikely_(x)  __builtin_expect(x,0)
+ #else
+ # define _likely_(x) (x)
+ # define _unlikely_(x) (x)
+ #endif
+ 
+ /*
+  *  Endianness macros.
+  */
+ #ifndef RAPIDHASH_LITTLE_ENDIAN
+ # if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #   define RAPIDHASH_LITTLE_ENDIAN
+ # elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #   define RAPIDHASH_BIG_ENDIAN
+ # else
+ #   warning "could not determine endianness! Falling back to little endian."
+ #   define RAPIDHASH_LITTLE_ENDIAN
+ # endif
+ #endif
+ 
+ /*
+  *  Default secret parameters.
+  */
+   RAPIDHASH_CONSTEXPR uint64_t rapid_secret[8] = {
+     0x2d358dccaa6c78a5ull,
+     0x8bb84b93962eacc9ull,
+     0x4b33a62ed433d4a3ull,
+     0x4d5a2da51de1aa47ull,
+     0xa0761d6478bd642full,
+     0xe7037ed1a0b428dbull,
+     0x90ed1765281c388cull,
+     0xaaaaaaaaaaaaaaaaull};
+ 
+ /*
+  *  64*64 -> 128bit multiply function.
+  *
+  *  @param A  Address of 64-bit number.
+  *  @param B  Address of 64-bit number.
+  *
+  *  Calculates 128-bit C = *A * *B.
+  *
+  *  When RAPIDHASH_FAST is defined:
+  *  Overwrites A contents with C's low 64 bits.
+  *  Overwrites B contents with C's high 64 bits.
+  *
+  *  When RAPIDHASH_PROTECTED is defined:
+  *  Xors and overwrites A contents with C's low 64 bits.
+  *  Xors and overwrites B contents with C's high 64 bits.
+  */
+ RAPIDHASH_INLINE_CONSTEXPR void rapid_mum(uint64_t *A, uint64_t *B) RAPIDHASH_NOEXCEPT {
+ #if defined(__SIZEOF_INT128__)
+   __uint128_t r=*A; r*=*B;
+   #ifdef RAPIDHASH_PROTECTED
+   *A^=(uint64_t)r; *B^=(uint64_t)(r>>64);
+   #else
+   *A=(uint64_t)r; *B=(uint64_t)(r>>64);
+   #endif
+ #elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
+   #if defined(_M_X64)
+     #ifdef RAPIDHASH_PROTECTED
+     uint64_t a, b;
+     a=_umul128(*A,*B,&b);
+     *A^=a;  *B^=b;
+     #else
+     *A=_umul128(*A,*B,B);
+     #endif
+   #else
+     #ifdef RAPIDHASH_PROTECTED
+     uint64_t a, b;
+     b = __umulh(*A, *B);
+     a = *A * *B;
+     *A^=a;  *B^=b;
+     #else
+     uint64_t c = __umulh(*A, *B);
+     *A = *A * *B;
+     *B = c;
+     #endif
+   #endif
+ #else
+   uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B;
+   uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t<rl;
+   uint64_t lo=t+(rm1<<32); 
+   c+=lo<t; 
+   uint64_t hi=rh+(rm0>>32)+(rm1>>32)+c;
+   #ifdef RAPIDHASH_PROTECTED
+   *A^=lo;  *B^=hi;
+   #else
+   *A=lo;  *B=hi;
+   #endif
+ #endif
+ }
+ 
+ /*
+  *  Multiply and xor mix function.
+  *
+  *  @param A  64-bit number.
+  *  @param B  64-bit number.
+  *
+  *  Calculates 128-bit C = A * B.
+  *  Returns 64-bit xor between high and low 64 bits of C.
+  */
+  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapid_mix(uint64_t A, uint64_t B) RAPIDHASH_NOEXCEPT { rapid_mum(&A,&B); return A^B; }
+ 
+ /*
+  *  Read functions.
+  */
+ #ifdef RAPIDHASH_LITTLE_ENDIAN
+ RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return v;}
+ RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return v;}
+ #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+ RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return __builtin_bswap64(v);}
+ RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return __builtin_bswap32(v);}
+ #elif defined(_MSC_VER)
+ RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return _byteswap_uint64(v);}
+ RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return _byteswap_ulong(v);}
+ #else
+ RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT {
+   uint64_t v; memcpy(&v, p, 8);
+   return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >>  8) & 0xff000000)| ((v <<  8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000));
+ }
+ RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT {
+   uint32_t v; memcpy(&v, p, 4);
+   return (((v >> 24) & 0xff)| ((v >>  8) & 0xff00)| ((v <<  8) & 0xff0000)| ((v << 24) & 0xff000000));
+ }
+ #endif
+ 
+ /*
+  *  rapidhash main function.
+  *
+  *  @param key     Buffer to be hashed.
+  *  @param len     @key length, in bytes.
+  *  @param seed    64-bit seed used to alter the hash result predictably.
+  *  @param secret  Array of at least eight 64-bit secrets (secret[7] is read) used to alter hash result predictably.
+  *
+  *  Returns a 64-bit hash.
+  */
+RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
+  const uint8_t *p=(const uint8_t *)key;
+  seed ^= rapid_mix(seed ^ secret[2], secret[1]);
+  uint64_t a=0, b=0;
+  size_t i = len;
+  if (_likely_(len <= 16)) {
+    if (len >= 4) {
+      seed ^= len;
+      if (len >= 8) {
+        const uint8_t* plast = p + len - 8;
+        a = rapid_read64(p);
+        b = rapid_read64(plast);
+      } else {
+        const uint8_t* plast = p + len - 4;
+        a = rapid_read32(p);
+        b = rapid_read32(plast);
+      }
+    } else if (len > 0) {
+      a = (((uint64_t)p[0])<<45)|p[len-1];
+      b = p[len>>1];
+    } else
+      a = b = 0;
+  } else {
+    if (len > 112) {
+      uint64_t see1 = seed, see2 = seed;
+      uint64_t see3 = seed, see4 = seed;
+      uint64_t see5 = seed, see6 = seed;
+#ifdef RAPIDHASH_COMPACT
+      do {
+        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
+        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
+        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
+        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
+        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
+        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
+        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
+        p += 112;
+        i -= 112;
+      } while(i > 112);
+#else
+      while (i > 224) {
+        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
+        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
+        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
+        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
+        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
+        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
+        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
+        seed = rapid_mix(rapid_read64(p + 112) ^ secret[0], rapid_read64(p + 120) ^ seed);
+        see1 = rapid_mix(rapid_read64(p + 128) ^ secret[1], rapid_read64(p + 136) ^ see1);
+        see2 = rapid_mix(rapid_read64(p + 144) ^ secret[2], rapid_read64(p + 152) ^ see2);
+        see3 = rapid_mix(rapid_read64(p + 160) ^ secret[3], rapid_read64(p + 168) ^ see3);
+        see4 = rapid_mix(rapid_read64(p + 176) ^ secret[4], rapid_read64(p + 184) ^ see4);
+        see5 = rapid_mix(rapid_read64(p + 192) ^ secret[5], rapid_read64(p + 200) ^ see5);
+        see6 = rapid_mix(rapid_read64(p + 208) ^ secret[6], rapid_read64(p + 216) ^ see6);
+        p += 224;
+        i -= 224;
+      }
+      if (i > 112) {
+        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
+        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
+        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
+        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
+        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
+        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
+        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
+        p += 112;
+        i -= 112;
+      }
+#endif
+      seed ^= see1;
+      see2 ^= see3;
+      see4 ^= see5;
+      seed ^= see6;
+      see2 ^= see4;
+      seed ^= see2;
+    }
+    if (i > 16) {
+      seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
+      if (i > 32) {
+          seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
+          if (i > 48) {
+              seed = rapid_mix(rapid_read64(p + 32) ^ secret[1], rapid_read64(p + 40) ^ seed);
+              if (i > 64) {
+                  seed = rapid_mix(rapid_read64(p + 48) ^ secret[1], rapid_read64(p + 56) ^ seed);
+                  if (i > 80) {
+                      seed = rapid_mix(rapid_read64(p + 64) ^ secret[2], rapid_read64(p + 72) ^ seed);
+                      if (i > 96) {
+                          seed = rapid_mix(rapid_read64(p + 80) ^ secret[1], rapid_read64(p + 88) ^ seed);
+                      }
+                  }
+              }
+          }
+      }
+    }
+    a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
+  }
+  a ^= secret[1];
+  b ^= seed;
+  rapid_mum(&a, &b);
+  return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
+}
+
+ /*
+  *  rapidhashMicro main function.
+  *
+  *  @param key     Buffer to be hashed.
+  *  @param len     @key length, in bytes.
+  *  @param seed    64-bit seed used to alter the hash result predictably.
+  *  @param secret  Array of at least eight 64-bit secrets (secret[7] is read) used to alter hash result predictably.
+  *
+  *  Returns a 64-bit hash.
+  */
+  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
+    const uint8_t *p=(const uint8_t *)key;
+    seed ^= rapid_mix(seed ^ secret[2], secret[1]);
+    uint64_t a=0, b=0;
+    size_t i = len;
+    if (_likely_(len <= 16)) {
+      if (len >= 4) {
+        seed ^= len;
+        if (len >= 8) {
+          const uint8_t* plast = p + len - 8;
+          a = rapid_read64(p);
+          b = rapid_read64(plast);
+        } else {
+          const uint8_t* plast = p + len - 4;
+          a = rapid_read32(p);
+          b = rapid_read32(plast);
+        }
+      } else if (len > 0) {
+        a = (((uint64_t)p[0])<<45)|p[len-1];
+        b = p[len>>1];
+      } else
+        a = b = 0;
+    } else {
+      if (i > 80) {
+        uint64_t see1 = seed, see2 = seed;
+        uint64_t see3 = seed, see4 = seed;
+        do {
+          seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
+          see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
+          see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
+          see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
+          see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
+          p += 80;
+          i -= 80;
+        } while(i > 80);
+        seed ^= see1;
+        see2 ^= see3;
+        seed ^= see4;
+        seed ^= see2;
+      }
+      if (i > 16) {
+        seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
+        if (i > 32) {
+            seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
+            if (i > 48) {
+                seed = rapid_mix(rapid_read64(p + 32) ^ secret[1], rapid_read64(p + 40) ^ seed);
+                if (i > 64) {
+                    seed = rapid_mix(rapid_read64(p + 48) ^ secret[1], rapid_read64(p + 56) ^ seed);
+                }
+            }
+        }
+      }
+      a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
+    }
+    a ^= secret[1];
+    b ^= seed;
+    rapid_mum(&a, &b);
+    return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
+  }
+
+  /*
+  *  rapidhashNano main function.
+  *
+  *  @param key     Buffer to be hashed.
+  *  @param len     @key length, in bytes.
+  *  @param seed    64-bit seed used to alter the hash result predictably.
+  *  @param secret  Array of at least eight 64-bit secrets (secret[7] is read) used to alter hash result predictably.
+  *
+  *  Returns a 64-bit hash.
+  */
+  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
+    const uint8_t *p=(const uint8_t *)key;
+    seed ^= rapid_mix(seed ^ secret[2], secret[1]);
+    uint64_t a=0, b=0;
+    size_t i = len;
+    if (_likely_(len <= 16)) {
+      if (len >= 4) {
+        seed ^= len;
+        if (len >= 8) {
+          const uint8_t* plast = p + len - 8;
+          a = rapid_read64(p);
+          b = rapid_read64(plast);
+        } else {
+          const uint8_t* plast = p + len - 4;
+          a = rapid_read32(p);
+          b = rapid_read32(plast);
+        }
+      } else if (len > 0) {
+        a = (((uint64_t)p[0])<<45)|p[len-1];
+        b = p[len>>1];
+      } else
+        a = b = 0;
+    } else {
+      if (i > 48) {
+        uint64_t see1 = seed, see2 = seed;
+        do {
+          seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
+          see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
+          see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
+          p += 48;
+          i -= 48;
+        } while(i > 48);
+        seed ^= see1;
+        seed ^= see2;
+      }
+      if (i > 16) {
+        seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
+        if (i > 32) {
+            seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
+        }
+      }
+      a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
+    }
+    a ^= secret[1];
+    b ^= seed;
+    rapid_mum(&a, &b);
+    return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
+  }
+ 
+/*
+ *  rapidhash seeded hash function.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *  @param seed    64-bit seed used to alter the hash result predictably.
+ *
+ *  Calls rapidhash_internal using provided parameters and default secrets.
+ *
+ *  Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
+  return rapidhash_internal(key, len, seed, rapid_secret);
+}
+ 
+/*
+ *  rapidhash general purpose hash function.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *
+ *  Calls rapidhash_withSeed using provided parameters and the default seed.
+ *
+ *  Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
+  return rapidhash_withSeed(key, len, 0);
+}
+
+/*
+ *  rapidhashMicro seeded hash function.
+ *
+ *  Designed for HPC and server applications, where cache misses cause a noticeable performance penalty.
+ *  Clang-18+ compiles it to ~140 instructions without stack usage, both on x86-64 and aarch64.
+ *  Faster for sizes up to 512 bytes, just 15%-20% slower for inputs above 1kb.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *  @param seed    64-bit seed used to alter the hash result predictably.
+ *
+ *  Calls rapidhashMicro_internal using provided parameters and default secrets.
+ *
+ *  Returns a 64-bit hash.
+ */
+ RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
+  return rapidhashMicro_internal(key, len, seed, rapid_secret);
+}
+ 
+/*
+ *  rapidhashMicro hash function.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *
+ *  Calls rapidhashMicro_withSeed using provided parameters and the default seed (0).
+ *
+ *  Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
+  return rapidhashMicro_withSeed(key, len, 0);
+}
+
+/*
+ *  rapidhashNano seeded hash function.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *  @param seed    64-bit seed used to alter the hash result predictably.
+ *
+ *  Calls rapidhashNano_internal using provided parameters and default secrets.
+ *
+ *  Returns a 64-bit hash.
+ */
+ RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
+  return rapidhashNano_internal(key, len, seed, rapid_secret);
+}
+ 
+/*
+ *  rapidhashNano hash function.
+ *
+ *  Designed for Mobile and embedded applications, where keeping a small code size is a top priority.
+ *  Clang-18+ compiles it to less than 100 instructions without stack usage, both on x86-64 and aarch64.
+ *  The fastest for sizes up to 48 bytes, but may be considerably slower for larger inputs.
+ *
+ *  @param key     Buffer to be hashed.
+ *  @param len     @key length, in bytes.
+ *
+ *  Calls rapidhashNano_withSeed using provided parameters and the default seed (0).
+ *
+ *  Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
+  return rapidhashNano_withSeed(key, len, 0);
+}
diff --git a/thirdparty/zxc/src/lib/zxc_common.c b/thirdparty/zxc/src/lib/zxc_common.c
new file mode 100644
index 000000000000..5487cb9d637d
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_common.c
@@ -0,0 +1,931 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_common.c
+ * @brief Shared library utilities: context management, header I/O,
+ *        compress-bound calculation, and error-code name lookup.
+ *
+ * This translation unit contains the functions shared by both the buffer and
+ * streaming APIs.  It is linked into every build of libzxc.
+ */
+
+#include "../../include/zxc_buffer.h"
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/*
+ * ============================================================================
+ * CONTEXT MANAGEMENT
+ * ============================================================================
+ */
+
+/**
+ * @brief Allocates memory aligned to the specified boundary.
+ *
+ * Uses `_aligned_malloc` on Windows and `posix_memalign` elsewhere.
+ *
+ * @param[in] size      Number of bytes to allocate.
+ * @param[in] alignment Required alignment (power of two; multiple of sizeof(void*) for posix_memalign).
+ * @return Pointer to the allocated block, or @c NULL on failure.
+ */
+void* zxc_aligned_malloc(const size_t size, const size_t alignment) {
+#if defined(_WIN32)
+    return _aligned_malloc(size, alignment);
+#else
+    void* ptr = NULL;
+    if (posix_memalign(&ptr, alignment, size) != 0) return NULL;
+    return ptr;
+#endif
+}
+
+/**
+ * @brief Frees memory previously allocated by zxc_aligned_malloc().
+ *
+ * @param[in] ptr Pointer returned by zxc_aligned_malloc() (may be @c NULL).
+ */
+void zxc_aligned_free(void* ptr) {
+#if defined(_WIN32)
+    _aligned_free(ptr);
+#else
+    free(ptr);
+#endif
+}
+
+/**
+ * @brief Returns @c sizeof(zxc_compress_opts_t) for ABI-safe allocation.
+ *
+ * Public API; see @c zxc_buffer.h. Lets callers (other languages, or a
+ * different library version) size the options struct without knowing its layout.
+ *
+ * @return Size of @ref zxc_compress_opts_t in bytes.
+ */
+size_t zxc_compress_opts_size(void) { return sizeof(zxc_compress_opts_t); }
+
+/**
+ * @brief Returns @c sizeof(zxc_decompress_opts_t) for ABI-safe allocation.
+ *
+ * Public API; see @c zxc_buffer.h. Lets callers (other languages, or a
+ * different library version) size the options struct without knowing its layout.
+ *
+ * @return Size of @ref zxc_decompress_opts_t in bytes.
+ */
+size_t zxc_decompress_opts_size(void) { return sizeof(zxc_decompress_opts_t); }
+
+/*
+ * Layout of the persistent buffer carved by every cctx/dctx init: both modes
+ * (compress, decompress) compute the same offset table, used by the workspace
+ * sizer and the in-place init.
+ */
+typedef struct {
+    size_t total;
+    /* mode == 0 (decompress) */
+    size_t off_work;
+    size_t off_lit_dctx;
+    /* mode == 0 with dict only: prebuilt shared-dictionary Huffman decode table. */
+    size_t off_huf_dict;
+    /* mode == 1 (compress) */
+    size_t off_hash_pos;
+    size_t off_hash_tags;
+    size_t off_chain;
+    size_t off_seq_union;
+    size_t off_extras;
+    size_t off_lit_cctx;
+    /* meaningful only when sz_opt > 0 (level >= ZXC_LEVEL_DENSITY). */
+    size_t off_opt;
+    /* both modes: [dict | data] concat scratch, present only when dict_size > 0. */
+    size_t off_dict;
+    /* Sub-buffer sizes (re-used by the partitioning step + zero-init). */
+    size_t sz_hash_pos;
+    size_t sz_hash_tags;
+    size_t sz_opt;
+    size_t sz_dict; /* 0 = no dictionary buffer. */
+    size_t max_seq;
+} zxc_cctx_layout_t;
+
+/**
+ * @brief Computes the single-allocation memory layout for a compression /
+ *        decompression context.
+ *
+ * Walks the same partition order used by @ref zxc_cctx_init_in_workspace and
+ * records each sub-buffer's offset plus the running @c total, so the sizing
+ * query and the partitioning step share one source of truth and can never
+ * disagree.
+ *
+ * Decompress (@p mode == 0) reserves @c work_buf and @c lit_buffer (both padded
+ * for wild-copy overshoot) and, when @p dict_size > 0, the shared-dictionary
+ * Huffman decode table. Compress (@p mode == 1) reserves the LZ match-finder
+ * tables (hash positions, tags, chain), the sequence / extras / literal buffers
+ * and - only at @c level >= ZXC_LEVEL_DENSITY - the optimal-parser scratch. A
+ * @p dict_size > 0 appends the [dict | data] concat scratch in both modes.
+ *
+ * Every offset is cache-line aligned via @c ZXC_ALIGN_CL.
+ *
+ * @param[in] chunk_size  Block size in bytes.
+ * @param[in] mode        1 = compression, 0 = decompression.
+ * @param[in] level       Compression level (only consulted when @p mode == 1).
+ * @param[in] dict_size   Dictionary prefill size; when > 0 the layout includes
+ *                        the [dict | data] concat buffer (and, on decompress,
+ *                        the dictionary Huffman decode table).
+ * @return Fully populated layout; @c .total is the required workspace size.
+ */
+static zxc_cctx_layout_t compute_cctx_layout(const size_t chunk_size, const int mode,
+                                             const int level, const size_t dict_size) {
+    zxc_cctx_layout_t layout = {0};
+
+    if (mode == 0) {
+        /* Decompress: work_buf (chunk_size + ZXC_DECOMPRESS_TAIL_PAD) and lit_buffer
+         * (chunk_size + ZXC_PAD_SIZE), both padded for wild-copy overshoot.
+         * lit_buffer is provisioned regardless of level because the decoder cannot
+         * predict the per-block literal encoding (RAW / RLE / HUFFMAN). */
+        const size_t sz_work = chunk_size + ZXC_DECOMPRESS_TAIL_PAD;
+        const size_t sz_lit = chunk_size + ZXC_PAD_SIZE;
+
+        layout.off_work = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_work);
+        layout.off_lit_dctx = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_lit);
+        /* Shared-dictionary Huffman decode table: built once per context by
+         * zxc_cctx_attach_dict_huf, read by HUFFMAN_DICT literal sections. */
+        if (dict_size > 0) {
+            layout.off_huf_dict = layout.total;
+            layout.total += ZXC_ALIGN_CL(ZXC_HUF_DEC_TABLE_SIZE * sizeof(zxc_huf_dec_entry_t));
+        }
+    } else {
+        /* Compress: 6 partitions + optional opt_scratch at level >= ZXC_LEVEL_DENSITY. */
+        const uint32_t offset_bits = zxc_log2_u32((uint32_t)chunk_size);
+        layout.max_seq = chunk_size / ZXC_LZ_MIN_MATCH_LEN + 16;
+        layout.sz_hash_pos = ZXC_LZ_HASH_SIZE * sizeof(uint32_t);
+        layout.sz_hash_tags = ZXC_LZ_HASH_SIZE * sizeof(uint8_t);
+        const size_t sz_chain = ZXC_LZ_WINDOW_SIZE * sizeof(uint16_t);
+        /* buf_sequences (GHI, level <= ZXC_LEVEL_FAST) aliases buf_offsets + buf_tokens (GLO,
+         * level >= ZXC_LEVEL_DEFAULT). Mutually exclusive per block; sized for the larger. */
+        const size_t sz_seq_union = layout.max_seq * sizeof(uint32_t);
+        const size_t vbyte_len = (offset_bits + 6) / 7;
+        const size_t sz_extras = layout.max_seq * 2 * vbyte_len;
+        const size_t sz_lit = chunk_size + ZXC_PAD_SIZE;
+
+        /* opt_scratch (level >= ZXC_LEVEL_DENSITY only): DP arrays for the optimal parser, also
+         * reused transiently as the package-merge scratch for the length-limited
+         * Huffman code-length builder. Sized to the larger of the two demands.
+         * The formula must stay in sync with zxc_estimate_cctx_size() and the
+         * consumer in zxc_compress.c. */
+        if (level >= ZXC_LEVEL_DENSITY) {
+            const size_t sz_dp = ZXC_ALIGN_CL((chunk_size + 1) * sizeof(uint32_t));
+            const size_t sz_pl = ZXC_ALIGN_CL((chunk_size + 1) * sizeof(uint16_t));
+            const size_t sz_po = ZXC_ALIGN_CL((chunk_size + 1) * sizeof(uint16_t));
+            const size_t n_bm_words = ZXC_BITMAP_WORDS(chunk_size + 1);
+            const size_t sz_bm = ZXC_ALIGN_CL(n_bm_words * sizeof(uint64_t));
+            const size_t dp_needed = sz_dp + sz_pl + sz_po + sz_bm;
+            layout.sz_opt =
+                (dp_needed > ZXC_HUF_BUILD_SCRATCH_SIZE) ? dp_needed : ZXC_HUF_BUILD_SCRATCH_SIZE;
+        }
+
+        layout.off_hash_pos = layout.total;
+        layout.total += ZXC_ALIGN_CL(layout.sz_hash_pos);
+        layout.off_hash_tags = layout.total;
+        layout.total += ZXC_ALIGN_CL(layout.sz_hash_tags);
+        layout.off_chain = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_chain);
+        layout.off_seq_union = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_seq_union);
+        layout.off_extras = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_extras);
+        layout.off_lit_cctx = layout.total;
+        layout.total += ZXC_ALIGN_CL(sz_lit);
+        /* opt_scratch is appended last so it is absent for levels 1..5 (zero
+         * waste on the common path) and only inflates the workspace at level 6. */
+        if (layout.sz_opt) {
+            layout.off_opt = layout.total;
+            layout.total += ZXC_ALIGN_CL(layout.sz_opt);
+        }
+    }
+
+    /* [dict | data] concat scratch (dict only). Compress chunk_size already
+     * spans [dict | block]; decompress prepends dict to a (chunk + PAD) region. */
+    if (dict_size > 0) {
+        layout.sz_dict = (mode == 1) ? (chunk_size + ZXC_DECOMPRESS_TAIL_PAD)
+                                     : (dict_size + chunk_size + ZXC_DECOMPRESS_TAIL_PAD);
+        layout.off_dict = layout.total;
+        layout.total += ZXC_ALIGN_CL(layout.sz_dict);
+    }
+    return layout;
+}
+
+/**
+ * @brief Returns the workspace byte count required for the given parameters.
+ *
+ * Public contract documented at the declaration in @c zxc_internal.h. Thin
+ * wrapper that returns @c compute_cctx_layout(...).total, or 0 when
+ * @p chunk_size is 0.
+ *
+ * @param[in] chunk_size  Block size in bytes.
+ * @param[in] mode        1 = compression, 0 = decompression.
+ * @param[in] level       Compression level (only consulted when @p mode == 1).
+ * @param[in] dict_size   Dictionary prefill size; > 0 includes the concat buffer.
+ * @return Workspace size in bytes, or 0 if @p chunk_size is 0.
+ */
+size_t zxc_cctx_compute_workspace_size(const size_t chunk_size, const int mode, const int level,
+                                       const size_t dict_size) {
+    if (UNLIKELY(chunk_size == 0)) return 0;
+    return compute_cctx_layout(chunk_size, mode, level, dict_size).total;
+}
+
+/**
+ * @brief Partitions a caller-supplied workspace into a ready-to-use context.
+ *
+ * Public contract (alignment, lifetime, return codes) documented at the
+ * declaration in @c zxc_internal.h. Computes the layout via
+ * @ref compute_cctx_layout, rejects an undersized @p workspace, then carves the
+ * sub-buffers out of it. @c ctx->memory_block stays NULL so @ref zxc_cctx_free
+ * leaves the caller-owned workspace untouched.
+ *
+ * @param[out] ctx               Context to initialise.
+ * @param[in]  workspace         Caller-allocated, cache-line-aligned buffer.
+ * @param[in]  workspace_size    Capacity of @p workspace in bytes.
+ * @param[in]  chunk_size        Block size in bytes.
+ * @param[in]  mode              1 = compression, 0 = decompression.
+ * @param[in]  level             Compression level (ignored when @p mode == 0).
+ * @param[in]  checksum_enabled  Non-zero to enable checksum computation.
+ * @param[in]  dict_size         Dictionary prefill size; > 0 carves the concat buffer.
+ * @return @ref ZXC_OK, @ref ZXC_ERROR_NULL_INPUT, or @ref ZXC_ERROR_DST_TOO_SMALL.
+ */
+int zxc_cctx_init_in_workspace(zxc_cctx_t* RESTRICT ctx, void* RESTRICT workspace,
+                               const size_t workspace_size, const size_t chunk_size, const int mode,
+                               const int level, const int checksum_enabled,
+                               const size_t dict_size) {
+    if (UNLIKELY(!ctx || !workspace || chunk_size == 0)) return ZXC_ERROR_NULL_INPUT;
+
+    const zxc_cctx_layout_t layout = compute_cctx_layout(chunk_size, mode, level, dict_size);
+    if (UNLIKELY(workspace_size < layout.total)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    ZXC_MEMSET(ctx, 0, sizeof(zxc_cctx_t));
+    ctx->checksum_enabled = checksum_enabled;
+    ctx->chunk_size = chunk_size;
+    const uint32_t offset_bits = zxc_log2_u32((uint32_t)chunk_size);
+    ctx->offset_bits = offset_bits;
+    ctx->offset_mask = (uint32_t)((1ULL << offset_bits) - 1);
+    ctx->max_epoch = (uint32_t)(1ULL << (32 - offset_bits));
+
+    /* memory_block stays NULL on the static-init path so zxc_cctx_free does
+     * not try to free the caller's workspace.  Sub-buffer pointers carry the
+     * partition; ownership is implicit (the caller owns @p workspace). */
+    uint8_t* const mem = (uint8_t*)workspace;
+
+    /* Dictionary concat scratch (both modes); init owns dict_size now so callers
+     * no longer assign ctx->dict_size after init. */
+    ctx->dict_size = dict_size;
+    if (dict_size > 0) {
+        ctx->dict_buffer = mem + layout.off_dict;
+        ctx->dict_buffer_cap = layout.sz_dict;
+    }
+
+    if (mode == 0) {
+        ctx->work_buf = mem + layout.off_work;
+        ctx->work_buf_cap = chunk_size + ZXC_DECOMPRESS_TAIL_PAD;
+        ctx->lit_buffer = mem + layout.off_lit_dctx;
+        ctx->lit_buffer_cap = chunk_size + ZXC_PAD_SIZE;
+        if (dict_size > 0) ctx->dict_huf_table = (zxc_huf_dec_entry_t*)(mem + layout.off_huf_dict);
+        return ZXC_OK;
+    }
+
+    ctx->hash_table = (uint32_t*)(mem + layout.off_hash_pos);
+    ctx->hash_tags = mem + layout.off_hash_tags;
+    ctx->chain_table = (uint16_t*)(mem + layout.off_chain);
+    ctx->buf_sequences = (uint32_t*)(mem + layout.off_seq_union);
+    ctx->buf_offsets = (uint16_t*)(mem + layout.off_seq_union);
+    ctx->buf_tokens = mem + layout.off_seq_union + layout.max_seq * sizeof(uint16_t);
+    ctx->buf_extras = mem + layout.off_extras;
+    ctx->literals = mem + layout.off_lit_cctx;
+    if (layout.sz_opt) {
+        ctx->opt_scratch = mem + layout.off_opt;
+        ctx->opt_scratch_cap = layout.sz_opt;
+    }
+
+    ctx->compression_level = level;
+    ctx->epoch = 1;
+
+    ZXC_MEMSET(ctx->hash_table, 0, layout.sz_hash_pos);
+    ZXC_MEMSET(ctx->hash_tags, 0, layout.sz_hash_tags);
+    return ZXC_OK;
+}
+
+/**
+ * @brief Initialises a compression / decompression context, allocating the
+ *        persistent buffer with @c ZXC_ALIGNED_MALLOC.
+ *
+ * Thin wrapper around @ref zxc_cctx_init_in_workspace: sizes the buffer via
+ * @ref zxc_cctx_compute_workspace_size, allocates it, then partitions it.
+ * The pointer is stored in @c ctx->memory_block so @ref zxc_cctx_free can
+ * release it.  The static-cctx public API (see @c zxc_buffer.h) bypasses
+ * this wrapper and partitions a caller-supplied workspace directly.
+ *
+ * @param[out] ctx               Context to initialise.
+ * @param[in]  chunk_size        Block size in bytes.
+ * @param[in]  mode              1 = compression, 0 = decompression.
+ * @param[in]  level             Compression level (ignored when @p mode == 0).
+ * @param[in]  checksum_enabled  Non-zero to enable checksum computation.
+ * @param[in]  dict_size         Dictionary prefill size.
+ * @return @ref ZXC_OK, @ref ZXC_ERROR_MEMORY, or @ref ZXC_ERROR_NULL_INPUT (NULL ctx / chunk_size 0).
+ */
+int zxc_cctx_init(zxc_cctx_t* RESTRICT ctx, const size_t chunk_size, const int mode,
+                  const int level, const int checksum_enabled, const size_t dict_size) {
+    const size_t total = zxc_cctx_compute_workspace_size(chunk_size, mode, level, dict_size);
+    if (UNLIKELY(total == 0)) return ZXC_ERROR_NULL_INPUT;
+
+    uint8_t* const mem = (uint8_t*)ZXC_ALIGNED_MALLOC(total, ZXC_CACHE_LINE_SIZE);
+    if (UNLIKELY(!mem)) return ZXC_ERROR_MEMORY;
+
+    const int rc = zxc_cctx_init_in_workspace(ctx, mem, total, chunk_size, mode, level,
+                                              checksum_enabled, dict_size);
+    if (UNLIKELY(rc != ZXC_OK)) {
+        // LCOV_EXCL_START
+        ZXC_ALIGNED_FREE(mem);
+        return rc;
+        // LCOV_EXCL_STOP
+    }
+    /* Library-owned buffer: record the allocation so zxc_cctx_free frees it. */
+    ctx->memory_block = mem;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Releases all resources owned by a compression context.
+ *
+ * After this call every pointer inside @p ctx is @c NULL and the context
+ * may be safely re-initialised with zxc_cctx_init().
+ *
+ * @param[in,out] ctx Context to tear down; must not be @c NULL (it is dereferenced unchecked).
+ */
+void zxc_cctx_free(zxc_cctx_t* ctx) {
+    if (ctx->memory_block) {
+        ZXC_ALIGNED_FREE(ctx->memory_block);
+        ctx->memory_block = NULL;
+    }
+
+    ctx->lit_buffer = NULL;
+    ctx->hash_table = NULL;
+    ctx->hash_tags = NULL;
+    ctx->chain_table = NULL;
+    ctx->buf_sequences = NULL;
+    ctx->buf_tokens = NULL;
+    ctx->buf_offsets = NULL;
+    ctx->buf_extras = NULL;
+    ctx->literals = NULL;
+    ctx->work_buf = NULL;
+    ctx->opt_scratch = NULL;
+    ctx->dict_buffer = NULL;
+
+    ctx->epoch = 0;
+    ctx->lit_buffer_cap = 0;
+    ctx->work_buf_cap = 0;
+    ctx->opt_scratch_cap = 0;
+    ctx->dict_buffer_cap = 0;
+    ctx->dict_size = 0;
+    ctx->dict_huf_lengths = NULL;
+    ctx->dict_huf_table = NULL;
+    ctx->lit_freq_acc = NULL;
+}
+
+/**
+ * @brief Attach the shared dictionary literal Huffman table to a context.
+ *
+ * Stores @p lengths (128-byte packed code-lengths header, caller-owned, must
+ * outlive the context) and, on decompression contexts created with
+ * @c dict_size > 0, builds the decode table once into the workspace-carved
+ * @c dict_huf_table. A NULL @p lengths clears the stored pointer; no table is built.
+ *
+ * @param[in,out] ctx      Initialised context to attach the table to.
+ * @param[in]     lengths  128-byte packed code lengths, or NULL to detach.
+ * @return @ref ZXC_OK on success, @ref ZXC_ERROR_NULL_INPUT if @p ctx is NULL, or
+ *         @ref ZXC_ERROR_CORRUPT_DATA if @p lengths is invalid (bad nibble, Kraft inequality).
+ */
+int zxc_cctx_attach_dict_huf(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT lengths) {
+    if (UNLIKELY(!ctx)) return ZXC_ERROR_NULL_INPUT;
+    ctx->dict_huf_lengths = lengths;
+    if (lengths == NULL || ctx->dict_huf_table == NULL) return ZXC_OK;
+
+    /* Empty (all-zero) table from a low-entropy corpus: treat it as "no shared table". */
+    int empty = 1;
+    for (size_t i = 0; i < ZXC_HUF_TABLE_SIZE; i++) {
+        if (lengths[i]) {
+            empty = 0;
+            break;
+        }
+    }
+    if (empty) {
+        ctx->dict_huf_lengths = NULL;
+        ctx->dict_huf_table = NULL;
+        return ZXC_OK;
+    }
+
+    uint8_t code_len[ZXC_HUF_NUM_SYMBOLS];
+    int rc = zxc_huf_unpack_lengths(lengths, code_len);
+    if (LIKELY(rc == ZXC_OK)) rc = zxc_huf_build_dec_table(code_len, ctx->dict_huf_table);
+    if (UNLIKELY(rc != ZXC_OK)) ctx->dict_huf_table = NULL; /* invalid table: refuse decode */
+    return rc;
+}
+
+/*
+ * ============================================================================
+ * HEADER I/O
+ * ============================================================================
+ */
+
+/**
+ * @brief Serialises a ZXC file header into @p dst.
+ *
+ * Layout (16 bytes): Magic (4) | Version (1) | Chunk log2 (1) | Flags (1) |
+ * Reserved (7; bytes 7-10 hold dict_id when non-zero) | CRC-16 (2).
+ *
+ * @param[out] dst          Destination buffer (>= @ref ZXC_FILE_HEADER_SIZE bytes).
+ * @param[in]  dst_capacity Capacity of @p dst.
+ * @param[in]  chunk_size   Block size (stored as its log2 exponent).
+ * @param[in]  has_checksum Non-zero to set the checksum flag.
+ * @param[in]  dict_id      Dictionary id; when non-zero, sets the dictionary flag
+ *                          and is stored in the header.
+ * @return Number of bytes written (@ref ZXC_FILE_HEADER_SIZE) on success,
+ *         or a negative @ref zxc_error_t code.
+ */
+int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size,
+                          const int has_checksum, const uint32_t dict_id) {
+    if (UNLIKELY(dst_capacity < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    zxc_store_le32(dst, ZXC_MAGIC_WORD);
+    dst[4] = ZXC_FILE_FORMAT_VERSION;
+
+    // Block size stored as log2 exponent (e.g. 18 = 256 KB)
+    dst[5] = (uint8_t)zxc_log2_u32((uint32_t)chunk_size);
+
+    // Flags are at offset 6
+    uint8_t flags = has_checksum ? (ZXC_FILE_FLAG_HAS_CHECKSUM | ZXC_CHECKSUM_RAPIDHASH) : 0;
+    if (dict_id != 0) flags |= ZXC_FILE_FLAG_HAS_DICTIONARY;
+    dst[6] = flags;
+
+    // Bytes 7-13: Reserved / dict_id
+    ZXC_MEMSET(dst + 7, 0, 7);
+    if (dict_id != 0) zxc_store_le32(dst + 7, dict_id);
+
+    // Bytes 14-15: CRC (16-bit)
+    zxc_store_le16(dst + 14, 0);  // Zero out before hashing
+    const uint16_t crc = zxc_hash16(dst);
+    zxc_store_le16(dst + 14, crc);
+
+    return ZXC_FILE_HEADER_SIZE;
+}
+
+/**
+ * @brief Parses and validates a ZXC file header from @p src.
+ *
+ * Checks the magic word, format version, CRC-16, and checksum-algorithm id (flags bits 0-3).
+ *
+ * @param[in]  src              Source buffer (>= @ref ZXC_FILE_HEADER_SIZE bytes).
+ * @param[in]  src_size         Size of @p src.
+ * @param[out] out_block_size   Receives the block size; if @c NULL the size-code range check is skipped.
+ * @param[out] out_has_checksum Receives 1 if checksums are present, 0 otherwise
+ *                              (may be @c NULL).
+ * @param[out] out_dict_id      Receives the dictionary id, or 0 if none
+ *                              (may be @c NULL).
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size,
+                         size_t* RESTRICT out_block_size, int* RESTRICT out_has_checksum,
+                         uint32_t* RESTRICT out_dict_id) {
+    if (UNLIKELY(src_size < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+    if (UNLIKELY(zxc_le32(src) != ZXC_MAGIC_WORD)) return ZXC_ERROR_BAD_MAGIC;
+    if (UNLIKELY(src[4] != ZXC_FILE_FORMAT_VERSION)) return ZXC_ERROR_BAD_VERSION;
+
+    uint8_t temp[ZXC_FILE_HEADER_SIZE];
+    ZXC_MEMCPY(temp, src, ZXC_FILE_HEADER_SIZE);
+    // Zero out CRC bytes (14-15) before hash check
+    temp[14] = 0;
+    temp[15] = 0;
+    // Header CRC16 (integrity), then the checksum-algorithm id in flags bits 0-3
+    // (only 0 = RapidHash is defined). CRC is checked first via short-circuit.
+    if (UNLIKELY(zxc_le16(src + 14) != zxc_hash16(temp) ||
+                 (src[6] & 0x0FU) != ZXC_CHECKSUM_RAPIDHASH))
+        return ZXC_ERROR_BAD_HEADER;
+
+    if (out_block_size) {
+        const uint8_t code = src[5];
+        if (UNLIKELY(code < ZXC_BLOCK_SIZE_MIN_LOG2 || code > ZXC_BLOCK_SIZE_MAX_LOG2))
+            return ZXC_ERROR_BAD_BLOCK_SIZE;
+        // Exponent encoding: block_size = 2^code  (4 KB - 2 MB)
+        *out_block_size = (size_t)1U << code;
+    }
+    // Flags are at offset 6
+    if (out_has_checksum) *out_has_checksum = (src[6] & ZXC_FILE_FLAG_HAS_CHECKSUM) ? 1 : 0;
+    if (out_dict_id) *out_dict_id = (src[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) ? zxc_le32(src + 7) : 0;
+
+    return ZXC_OK;
+}
+
+/**
+ * @brief Serialises a block header (8 bytes) into @p dst.
+ *
+ * @param[out] dst          Destination buffer (>= @ref ZXC_BLOCK_HEADER_SIZE bytes).
+ * @param[in]  dst_capacity Capacity of @p dst.
+ * @param[in]  bh           Populated block header descriptor.
+ * @return Number of bytes written (@ref ZXC_BLOCK_HEADER_SIZE) on success,
+ *         or a negative @ref zxc_error_t code.
+ */
+int zxc_write_block_header(uint8_t* RESTRICT dst, const size_t dst_capacity,
+                           const zxc_block_header_t* RESTRICT bh) {
+    if (UNLIKELY(dst_capacity < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    dst[0] = bh->block_type;
+    dst[1] = 0;  // Flags not used currently
+    dst[2] = 0;  // Reserved
+    zxc_store_le32(dst + 3, bh->comp_size);
+    dst[7] = 0;               // Zero before hashing
+    dst[7] = zxc_hash8(dst);  // Checksum at the end
+
+    return ZXC_BLOCK_HEADER_SIZE;
+}
+
+/**
+ * @brief Parses and validates a block header from @p src.
+ *
+ * Validates the 8-bit CRC embedded in the header.
+ *
+ * @param[in]  src      Source buffer (>= @ref ZXC_BLOCK_HEADER_SIZE bytes).
+ * @param[in]  src_size Size of @p src.
+ * @param[out] bh       Receives the decoded block header fields.
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_read_block_header(const uint8_t* RESTRICT src, const size_t src_size,
+                          zxc_block_header_t* RESTRICT bh) {
+    if (UNLIKELY(src_size < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    uint8_t temp[ZXC_BLOCK_HEADER_SIZE];
+    ZXC_MEMCPY(temp, src, ZXC_BLOCK_HEADER_SIZE);
+    temp[7] = 0;  // Zero out checksum byte before hashing
+    if (UNLIKELY(src[7] != zxc_hash8(temp))) return ZXC_ERROR_BAD_HEADER;
+
+    bh->block_type = src[0];
+    bh->block_flags = 0;  // Flags not used currently
+    bh->reserved = src[2];
+    bh->comp_size = zxc_le32(src + 3);
+    bh->header_crc = src[7];
+
+    return ZXC_OK;
+}
+
+/**
+ * @brief Writes the 12-byte file footer (source size + global checksum).
+ *
+ * @param[out] dst              Destination buffer (>= @ref ZXC_FILE_FOOTER_SIZE bytes).
+ * @param[in]  dst_capacity     Capacity of @p dst.
+ * @param[in]  src_size         Original uncompressed size in bytes.
+ * @param[in]  global_hash      Accumulated global checksum value.
+ * @param[in]  checksum_enabled Non-zero to write the checksum; zero to zero-fill.
+ * @return Number of bytes written (@ref ZXC_FILE_FOOTER_SIZE) on success,
+ *         or a negative @ref zxc_error_t code.
+ */
+int zxc_write_file_footer(uint8_t* RESTRICT dst, const size_t dst_capacity, const uint64_t src_size,
+                          const uint32_t global_hash, const int checksum_enabled) {
+    if (UNLIKELY(dst_capacity < ZXC_FILE_FOOTER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    zxc_store_le64(dst, src_size);
+
+    if (checksum_enabled) {
+        zxc_store_le32(dst + sizeof(uint64_t), global_hash);
+    } else {
+        ZXC_MEMSET(dst + sizeof(uint64_t), 0, sizeof(uint32_t));
+    }
+
+    return ZXC_FILE_FOOTER_SIZE;
+}
+
+/**
+ * @brief Serialises a GLO block header followed by its section descriptors.
+ *
+ * @param[out] dst  Destination buffer.
+ * @param[in]  rem  Remaining capacity of @p dst.
+ * @param[in]  gh   Populated GLO header descriptor.
+ * @param[in]  desc Array of @ref ZXC_GLO_SECTIONS section descriptors.
+ * @return Total bytes written on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_write_glo_header_and_desc(uint8_t* RESTRICT dst, const size_t rem,
+                                  const zxc_gnr_header_t* RESTRICT gh,
+                                  const zxc_section_desc_t desc[ZXC_GLO_SECTIONS]) {
+    const size_t needed =
+        ZXC_GLO_HEADER_BINARY_SIZE + ZXC_GLO_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+
+    if (UNLIKELY(rem < needed)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    zxc_store_le32(dst, gh->n_sequences);
+    zxc_store_le32(dst + 4, gh->n_literals);
+
+    dst[8] = gh->enc_lit;
+    dst[9] = gh->enc_litlen;
+    dst[10] = gh->enc_mlen;
+    dst[11] = gh->enc_off;
+
+    zxc_store_le32(dst + 12, 0);
+    uint8_t* p = dst + ZXC_GLO_HEADER_BINARY_SIZE;
+
+    for (int i = 0; i < ZXC_GLO_SECTIONS; i++) {
+        zxc_store_le64(p, desc[i].sizes);
+        p += ZXC_SECTION_DESC_BINARY_SIZE;
+    }
+
+    return (int)needed;
+}
+
+/**
+ * @brief Parses a GLO block header and its section descriptors from @p src.
+ *
+ * @param[in]  src  Source buffer.
+ * @param[in]  len  Size of @p src.
+ * @param[out] gh   Receives the decoded GLO header.
+ * @param[out] desc Receives @ref ZXC_GLO_SECTIONS decoded section descriptors.
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_read_glo_header_and_desc(const uint8_t* RESTRICT src, const size_t len,
+                                 zxc_gnr_header_t* RESTRICT gh,
+                                 zxc_section_desc_t desc[ZXC_GLO_SECTIONS]) {
+    const size_t needed =
+        ZXC_GLO_HEADER_BINARY_SIZE + ZXC_GLO_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+
+    if (UNLIKELY(len < needed)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    gh->n_sequences = zxc_le32(src);
+    gh->n_literals = zxc_le32(src + 4);
+    gh->enc_lit = src[8];
+    gh->enc_litlen = src[9];
+    gh->enc_mlen = src[10];
+    gh->enc_off = src[11];
+
+    const uint8_t* p = src + ZXC_GLO_HEADER_BINARY_SIZE;
+
+    for (int i = 0; i < ZXC_GLO_SECTIONS; i++) {
+        desc[i].sizes = zxc_le64(p);
+        p += ZXC_SECTION_DESC_BINARY_SIZE;
+    }
+    return ZXC_OK;
+}
+
+/**
+ * @brief Serialises a GHI block header followed by its section descriptors.
+ *
+ * @param[out] dst  Destination buffer.
+ * @param[in]  rem  Remaining capacity of @p dst.
+ * @param[in]  gh   Populated GHI header descriptor.
+ * @param[in]  desc Array of @ref ZXC_GHI_SECTIONS section descriptors.
+ * @return Total bytes written on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_write_ghi_header_and_desc(uint8_t* RESTRICT dst, const size_t rem,
+                                  const zxc_gnr_header_t* RESTRICT gh,
+                                  const zxc_section_desc_t desc[ZXC_GHI_SECTIONS]) {
+    const size_t needed =
+        ZXC_GHI_HEADER_BINARY_SIZE + ZXC_GHI_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+
+    if (UNLIKELY(rem < needed)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    zxc_store_le32(dst, gh->n_sequences);
+    zxc_store_le32(dst + 4, gh->n_literals);
+
+    dst[8] = gh->enc_lit;
+    dst[9] = gh->enc_litlen;
+    dst[10] = gh->enc_mlen;
+    dst[11] = gh->enc_off;
+
+    zxc_store_le32(dst + 12, 0);
+    uint8_t* p = dst + ZXC_GHI_HEADER_BINARY_SIZE;
+
+    for (int i = 0; i < ZXC_GHI_SECTIONS; i++) {
+        zxc_store_le64(p, desc[i].sizes);
+        p += ZXC_SECTION_DESC_BINARY_SIZE;
+    }
+
+    return (int)needed;
+}
+
+/**
+ * @brief Parses a GHI block header and its section descriptors from @p src.
+ *
+ * @param[in]  src  Source buffer.
+ * @param[in]  len  Size of @p src.
+ * @param[out] gh   Receives the decoded GHI header.
+ * @param[out] desc Receives @ref ZXC_GHI_SECTIONS decoded section descriptors.
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+int zxc_read_ghi_header_and_desc(const uint8_t* RESTRICT src, const size_t len,
+                                 zxc_gnr_header_t* RESTRICT gh,
+                                 zxc_section_desc_t desc[ZXC_GHI_SECTIONS]) {
+    const size_t needed =
+        ZXC_GHI_HEADER_BINARY_SIZE + ZXC_GHI_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+
+    if (UNLIKELY(len < needed)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    gh->n_sequences = zxc_le32(src);
+    gh->n_literals = zxc_le32(src + 4);
+    gh->enc_lit = src[8];
+    gh->enc_litlen = src[9];
+    gh->enc_mlen = src[10];
+    gh->enc_off = src[11];
+
+    const uint8_t* p = src + ZXC_GHI_HEADER_BINARY_SIZE;
+
+    for (int i = 0; i < ZXC_GHI_SECTIONS; i++) {
+        desc[i].sizes = zxc_le64(p);
+        p += ZXC_SECTION_DESC_BINARY_SIZE;
+    }
+    return ZXC_OK;
+}
+
+/*
+ * ============================================================================
+ * COMPRESS BOUND CALCULATION
+ * ============================================================================
+ */
+/**
+ * @brief Upper bound on the compressed size of @p input_size bytes of input.
+ *
+ * The sum covers the file header, one block header, checksum and format
+ * overhead per block, the payload itself (worst-case expansion), the EOF
+ * block, the seekable SEK block header plus one entry per block, and the footer.
+ *
+ * Blocks are counted at @ref ZXC_BLOCK_SIZE_MIN (4 KB) granularity so that the
+ * bound holds for every valid block size, with or without seekable mode.
+ *
+ * @param[in] input_size Uncompressed input size in bytes.
+ * @return Upper bound on compressed size, or 0 if @p input_size would overflow.
+ */
+uint64_t zxc_compress_bound(const size_t input_size) {
+    // Guard intended to keep the per-block overhead added below from wrapping the 64-bit sum.
+    if (UNLIKELY(input_size > (SIZE_MAX - (SIZE_MAX >> 8)))) return 0;
+    const uint64_t raw = ((uint64_t)input_size + ZXC_BLOCK_SIZE_MIN - 1) / ZXC_BLOCK_SIZE_MIN;
+    const uint64_t n = (raw != 0) ? raw : 1;  // an empty input still counts as one block
+    const uint64_t per_block =
+        ZXC_BLOCK_HEADER_SIZE + ZXC_BLOCK_CHECKSUM_SIZE + ZXC_BLOCK_FORMAT_OVERHEAD;
+    const uint64_t body = ZXC_FILE_HEADER_SIZE + (n * per_block) + (uint64_t)input_size;
+    return body + ZXC_BLOCK_HEADER_SIZE + /* EOF block */
+           ZXC_BLOCK_HEADER_SIZE +        /* SEK block header (seekable) */
+           (n * ZXC_SEEK_ENTRY_SIZE) +    /* SEK entries, one per block */
+           ZXC_FILE_FOOTER_SIZE;
+}
+
+/**
+ * @brief Upper bound on the compressed size of one block, without file framing.
+ *
+ * @param[in] input_size Uncompressed block size in bytes; the valid range is
+ *                       [1, @ref ZXC_BLOCK_SIZE_MAX].
+ * @return Block header + @p input_size + format overhead + checksum size, or 0
+ *         when @p input_size is 0 or exceeds ZXC_BLOCK_SIZE_MAX, i.e. when the
+ *         Block API would not accept the size in the first place.
+ */
+uint64_t zxc_compress_block_bound(const size_t input_size) {
+    // The Block API accepts src_size only in [1, ZXC_BLOCK_SIZE_MAX]; outside it
+    // zxc_compress_block fails (NULL_INPUT for 0, BAD_BLOCK_SIZE above the max),
+    // so no bound is defined and 0 flags the size as unusable upfront. The cap
+    // also keeps the addition below trivially free of overflow.
+    const int in_range = (input_size != 0) && (input_size <= ZXC_BLOCK_SIZE_MAX);
+    if (UNLIKELY(!in_range)) return 0;
+    // Worst case is an incompressible payload stored raw, plus the fixed per-block costs.
+    const uint64_t fixed_cost =
+        (uint64_t)ZXC_BLOCK_HEADER_SIZE + ZXC_BLOCK_FORMAT_OVERHEAD + ZXC_BLOCK_CHECKSUM_SIZE;
+    return fixed_cost + (uint64_t)input_size;
+}
+
+/**
+ * @brief Minimum @c dst_capacity to pass to zxc_decompress_block().
+ *
+ * The decoder's fast path performs speculative wild-copy writes. Sizing the
+ * destination to @p uncompressed_size + @c ZXC_DECOMPRESS_TAIL_PAD keeps that
+ * fast path reachable and stops the tail bounds checks from spuriously
+ * rejecting the last literals of a valid block.
+ *
+ * Returns 0 if @p uncompressed_size exceeds ZXC_BLOCK_SIZE_MAX (the Block API
+ * limit); that cap is the only range check performed.
+ *
+ * @param[in] uncompressed_size  Exact decompressed size of the block.
+ * @return Minimum @c dst_capacity in bytes, or 0 if @p uncompressed_size exceeds
+ *         @c ZXC_BLOCK_SIZE_MAX.
+ */
+uint64_t zxc_decompress_block_bound(const size_t uncompressed_size) {
+    const uint64_t padded = (uint64_t)uncompressed_size + ZXC_DECOMPRESS_TAIL_PAD;
+    return UNLIKELY(uncompressed_size > ZXC_BLOCK_SIZE_MAX) ? 0 : padded;
+}
+
+/**
+ * @brief Estimates the buffer bytes a compression context allocates for a block.
+ *
+ * Forwards to @ref zxc_cctx_compute_workspace_size with @c mode == 1
+ * (compress) after rounding @p src_size up to a valid block size through
+ * @ref zxc_block_size_ceil.  The fixed overhead (< 128 B) of the opaque
+ * wrapper struct allocated by @ref zxc_create_cctx is deliberately left out,
+ * being negligible next to the per-chunk buffers.
+ *
+ * At @p level >= 6 the estimate also counts the optimal-parser scratch
+ * (@c opt_scratch, ~8.125 bytes per chunk_size byte), which the optimal
+ * parser uses and the Huffman code-length builder reuses as transient
+ * package-merge scratch.
+ *
+ * @param[in] src_size  Input size; rounded up to a valid block size.
+ * @param[in] level     Compression level (>= 6 includes the optimal-parser scratch).
+ * @return Estimated context buffer size in bytes, or 0 if @p src_size is 0.
+ */
+uint64_t zxc_estimate_cctx_size(const size_t src_size, const int level) {
+    if (UNLIKELY(!src_size)) return 0;  // no input, no workspace to report
+    const size_t rounded = zxc_block_size_ceil(src_size);
+    return (uint64_t)zxc_cctx_compute_workspace_size(rounded, /*mode=*/1, level, 0);
+}
+
+/*
+ * ============================================================================
+ * ERROR CODE UTILITIES
+ * ============================================================================
+ */
+
+/**
+ * @brief Maps an error code to the spelling of its @ref zxc_error_t enumerator.
+ *
+ * @param[in] code An error code from @ref zxc_error_t (or @ref ZXC_OK).
+ * @return A string literal such as @c "ZXC_OK" or @c "ZXC_ERROR_MEMORY" (never
+ *         NULL); @c "ZXC_UNKNOWN_ERROR" for codes without a case below.
+ */
+const char* zxc_error_name(const int code) {
+    switch ((zxc_error_t)code) {
+        case ZXC_OK:
+            return "ZXC_OK";
+        case ZXC_ERROR_MEMORY:
+            return "ZXC_ERROR_MEMORY";
+        case ZXC_ERROR_DST_TOO_SMALL:
+            return "ZXC_ERROR_DST_TOO_SMALL";
+        case ZXC_ERROR_SRC_TOO_SMALL:
+            return "ZXC_ERROR_SRC_TOO_SMALL";
+        case ZXC_ERROR_BAD_MAGIC:
+            return "ZXC_ERROR_BAD_MAGIC";
+        case ZXC_ERROR_BAD_VERSION:
+            return "ZXC_ERROR_BAD_VERSION";
+        case ZXC_ERROR_BAD_HEADER:
+            return "ZXC_ERROR_BAD_HEADER";
+        case ZXC_ERROR_BAD_CHECKSUM:
+            return "ZXC_ERROR_BAD_CHECKSUM";
+        case ZXC_ERROR_CORRUPT_DATA:
+            return "ZXC_ERROR_CORRUPT_DATA";
+        case ZXC_ERROR_BAD_OFFSET:
+            return "ZXC_ERROR_BAD_OFFSET";
+        case ZXC_ERROR_OVERFLOW:
+            return "ZXC_ERROR_OVERFLOW";
+        case ZXC_ERROR_IO:
+            return "ZXC_ERROR_IO";
+        case ZXC_ERROR_NULL_INPUT:
+            return "ZXC_ERROR_NULL_INPUT";
+        case ZXC_ERROR_BAD_BLOCK_TYPE:
+            return "ZXC_ERROR_BAD_BLOCK_TYPE";
+        case ZXC_ERROR_BAD_BLOCK_SIZE:
+            return "ZXC_ERROR_BAD_BLOCK_SIZE";
+        case ZXC_ERROR_DICT_REQUIRED:
+            return "ZXC_ERROR_DICT_REQUIRED";
+        case ZXC_ERROR_DICT_MISMATCH:
+            return "ZXC_ERROR_DICT_MISMATCH";
+        case ZXC_ERROR_DICT_TOO_LARGE:
+            return "ZXC_ERROR_DICT_TOO_LARGE";
+        default:
+            return "ZXC_UNKNOWN_ERROR";
+    }
+}
+
+/*
+ * ============================================================================
+ * LIBRARY INFORMATION
+ * ============================================================================
+ */
+
+/**
+ * @brief Returns the minimum supported compression level.
+ *
+ * Lets integrators discover the level range at runtime instead of relying on
+ * compile-time macros alone.
+ * @return The value of @c ZXC_LEVEL_FASTEST (currently 1).
+ */
+int zxc_min_level(void) { return ZXC_LEVEL_FASTEST; }
+
+/**
+ * @brief Returns the maximum supported compression level.
+ *
+ * @return The value of @c ZXC_LEVEL_DENSITY (currently 6).
+ */
+int zxc_max_level(void) { return ZXC_LEVEL_DENSITY; }
+
+/**
+ * @brief Returns the default compression level.
+ *
+ * @return The value of @c ZXC_LEVEL_DEFAULT (currently 3).
+ */
+int zxc_default_level(void) { return ZXC_LEVEL_DEFAULT; }
+
+/**
+ * @brief Returns the human-readable library version string.
+ *
+ * @return @c ZXC_LIB_VERSION_STR, a compile-time constant that must not be freed.
+ *         Format: "MAJOR.MINOR.PATCH" (e.g. "0.12.0").
+ */
+const char* zxc_version_string(void) { return ZXC_LIB_VERSION_STR; }
diff --git a/thirdparty/zxc/src/lib/zxc_compress.c b/thirdparty/zxc/src/lib/zxc_compress.c
new file mode 100644
index 000000000000..52d9eb8d1e78
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_compress.c
@@ -0,0 +1,2137 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_compress.c
+ * @brief Block-level compression: LZ77 parsing, GLO / GHI / RAW encoding,
+ *        and the chunk-wrapper entry point.
+ *
+ * Compiled multiple times with different @c ZXC_FUNCTION_SUFFIX values to
+ * produce AVX2, AVX-512, NEON, and scalar variants dispatched at runtime
+ * by @ref zxc_dispatch.c.
+ */
+
+/*
+ * Function Multi-Versioning Support
+ * If ZXC_FUNCTION_SUFFIX is defined (e.g. _avx2, _neon), rename the public
+ * entry point AND the Huffman entry points consumed by this TU. The defines
+ * precede zxc_internal.h so its prototypes get the same suffix. ZXC_CAT goes
+ * through ZXC_CAT_IMPL so ZXC_FUNCTION_SUFFIX is expanded before ## pastes it.
+ */
+#ifdef ZXC_FUNCTION_SUFFIX
+#define ZXC_CAT_IMPL(x, y) x##y
+#define ZXC_CAT(x, y) ZXC_CAT_IMPL(x, y)
+#define zxc_compress_chunk_wrapper ZXC_CAT(zxc_compress_chunk_wrapper, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_build_code_lengths ZXC_CAT(zxc_huf_build_code_lengths, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_encode_section ZXC_CAT(zxc_huf_encode_section, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_encode_section_dict ZXC_CAT(zxc_huf_encode_section_dict, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_unpack_lengths ZXC_CAT(zxc_huf_unpack_lengths, ZXC_FUNCTION_SUFFIX)
+#endif
+
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/**
+ * @brief Hashes a 64-bit input word into a match-table index (two variants).
+ * @param[in] val       64-bit word read from the input stream at the probe position.
+ * @param[in] use_hash5 Non-zero: multiply the low 40 bits (5 bytes) by ZXC_LZ_HASH_PRIME2.
+ *                      Zero: XOR-fold @c val with @c val >> 15, truncate to 32 bits (so bits
+ *                      32..46 also contribute), then multiply by ZXC_LZ_HASH_PRIME1.
+ * @return uint32_t A hash value suitable for indexing the match table.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_hash_func(const uint64_t val, const int use_hash5) {
+    if (!use_hash5) {
+        const uint32_t folded = (uint32_t)(val ^ (val >> 15));  // 4-byte path
+        return (folded * ZXC_LZ_HASH_PRIME1) >> (32 - ZXC_LZ_HASH_BITS);
+    }
+    // 5-byte path: hash only the low 40 bits of the loaded word.
+    const uint64_t low40 = val & 0xFFFFFFFFFFULL;
+    return (uint32_t)((low40 * ZXC_LZ_HASH_PRIME2) >> (64 - ZXC_LZ_HASH_BITS));
+}
+
+#if defined(ZXC_USE_AVX2)
+/**
+ * @brief Horizontal maximum of the eight unsigned 32-bit lanes of a 256-bit
+ * vector.
+ *
+ * The vector is folded in three steps (8 -> 4 -> 2 -> 1 candidates), each one an
+ * element-wise unsigned max against a rearranged copy of the previous result.
+ *
+ * @param[in] v The 256-bit vector containing 8 unsigned 32-bit integers.
+ * @return The maximum unsigned 32-bit integer found in the vector.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_AVX2 is defined
+static ZXC_ALWAYS_INLINE uint32_t zxc_mm256_reduce_max_epu32(__m256i v) {
+    // 8 -> 4 lanes: max of the low and high 128-bit halves.
+    const __m128i half_lo = _mm256_castsi256_si128(v);
+    const __m128i half_hi = _mm256_extracti128_si256(v, 1);
+    const __m128i max4 = _mm_max_epu32(half_lo, half_hi);
+    // 4 -> 2 lanes: compare against a copy with the 64-bit halves swapped.
+    const __m128i max2 = _mm_max_epu32(max4, _mm_shuffle_epi32(max4, _MM_SHUFFLE(1, 0, 3, 2)));
+    // 2 -> 1 lane: compare against a copy with adjacent 32-bit lanes swapped.
+    const __m128i max1 = _mm_max_epu32(max2, _mm_shuffle_epi32(max2, _MM_SHUFFLE(2, 3, 0, 1)));
+    return (uint32_t)_mm_cvtsi128_si32(max1);  // lane 0 now holds the overall maximum
+}
+#endif
+
+#if defined(ZXC_USE_SSE2)
+/**
+ * @brief SSE2 stand-in for SSE4.1 @c _mm_blendv_epi8.
+ *
+ * Plain bitwise select: result bits come from @p b where @p mask is 1, else from
+ * @p a. This matches the real instruction (which tests each mask byte's high bit)
+ * because every call site passes full-width compare results as @p mask.
+ *
+ * @param[in] a     Lanes chosen where @p mask is clear.
+ * @param[in] b     Lanes chosen where @p mask is set.
+ * @param[in] mask  Per-byte selector (a full-width compare result).
+ * @return The blended 128-bit vector.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask) {
+    return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(a, b)));
+}
+
+/**
+ * @brief SSE2 stand-in for SSE4.1 @c _mm_packus_epi32 (u32 -> u16 narrowing).
+ *
+ * SSE2 offers only the signed pack @c _mm_packs_epi32, which clamps to int16.
+ * Each 32-bit lane is lowered by 0x8000 so that [0, 0xFFFF] lands on the int16
+ * range and packs losslessly; 0x8000 is then added back per 16-bit lane. Exact
+ * for inputs in [0, 0xFFFF] (all call sites pass match lengths < 2^16).
+ *
+ * @param[in] a  Four u32 lanes forming the low half of the result.
+ * @param[in] b  Four u32 lanes forming the high half of the result.
+ * @return The eight u16 lanes packed from @p a then @p b.
+ */
+// codeql[cpp/unused-static-function] : Used conditionally when ZXC_USE_SSE2 is defined
+static ZXC_ALWAYS_INLINE __m128i zxc_mm_packus_epi32_sse2(__m128i a, __m128i b) {
+    const __m128i shift32 = _mm_set1_epi32(0x8000);
+    // Signed pack of the re-centred lanes; no clamping for inputs in [0, 0xFFFF].
+    const __m128i centred =
+        _mm_packs_epi32(_mm_sub_epi32(a, shift32), _mm_sub_epi32(b, shift32));
+    return _mm_add_epi16(centred, _mm_set1_epi16((short)0x8000));  // undo the bias
+}
+#endif
+
+/**
+ * @brief Encodes a value as a Prefix Varint into a buffer.
+ *
+ * The count of leading 1 bits in the first byte gives the number of bytes
+ * that follow it, so a decoder learns the total length from that byte alone.
+ * This encoder emits 1 to 3 bytes; @p val above @c ZXC_MAX_VARINT_VALUE is
+ * refused and nothing is written.
+ *
+ * Format:
+ * - 0xxxxxxx                    (1 byte,  7 payload bits)
+ * - 10xxxxxx xxxxxxxx           (2 bytes, 14 payload bits)
+ * - 110xxxxx xxxxxxxx xxxxxxxx  (3 bytes, 21 payload bits)
+ * The prefix byte carries the lowest payload bits; later bytes go upwards.
+ *
+ * @param[out] dst Destination buffer; must have room for the bytes written (up to 3).
+ * @param[in] val The 32-bit unsigned integer value to encode.
+ * @return Number of bytes written (1-3), or 0 if @p val is out of range.
+ */
+static ZXC_ALWAYS_INLINE size_t zxc_write_varint(uint8_t* RESTRICT dst, const uint32_t val) {
+    // Defense in depth: values above ZXC_MAX_VARINT_VALUE are out-of-spec
+    // (block_size_max is 2^21, so every legitimate varint is strictly smaller)
+    // and the decoder would reject them. Inputs that respect the Block API
+    // limit (src_size <= ZXC_BLOCK_SIZE_MAX) never take this branch. Nothing
+    // is written; callers must treat the 0 return as an encoding error.
+    if (UNLIKELY(val > ZXC_MAX_VARINT_VALUE)) return 0;
+
+    // Pick the shortest form that can hold the value.
+    size_t n_bytes;
+    if (LIKELY(val < (1U << 7))) {
+        // 0xxxxxxx: the value itself (7 bits, val < 128).
+        dst[0] = (uint8_t)val;
+        n_bytes = 1;
+    } else if (LIKELY(val < (1U << 14))) {
+        // 10xxxxxx xxxxxxxx: low 6 bits in the prefix byte, next 8 bits after it.
+        dst[0] = (uint8_t)(0x80 | (val & 0x3F));
+        dst[1] = (uint8_t)(val >> 6);
+        n_bytes = 2;
+    } else {
+        // 110xxxxx xxxxxxxx xxxxxxxx: low 5 bits in the prefix byte, 16 more
+        // bits after it. 21 bits is the widest form emitted, matching
+        // ZXC_MAX_VARINT_VALUE = ZXC_BLOCK_SIZE_MAX - 1.
+        dst[0] = (uint8_t)(0xC0 | (val & 0x1F));
+        dst[1] = (uint8_t)(val >> 5);
+        dst[2] = (uint8_t)(val >> 13);
+        n_bytes = 3;
+    }
+
+    return n_bytes;
+}
+
+/**
+ * @brief Result of a match search, as returned by zxc_lz77_find_best_match().
+ *
+ * @c ref is NULL when no usable match was found. After backward extension
+ * @c ref and @c len describe the extended match.
+ *
+ * @param ref       Start of the earlier occurrence in the source buffer, or NULL.
+ * @param len       Match length in bytes, including any backward extension.
+ * @param backtrack Number of bytes the match was extended backwards before @c ip.
+ */
+typedef struct {
+    const uint8_t* ref;
+    uint32_t len;
+    uint32_t backtrack;
+} zxc_match_t;
+
+/**
+ * @brief Finds the best LZ77 match at @p ip and records @p ip in the match tables.
+ *
+ * Uses a split hash table layout:
+ * - hash_table[h]  : uint32_t position + epoch (128 KB for 15-bit hash)
+ * - hash_tags[h]   : uint8_t tag for fast rejection (32 KB, L1-resident)
+ *
+ * @param[in] src Pointer to the start of the source buffer.
+ * @param[in] ip Current input position pointer.
+ * @param[in] iend One past the last input byte; match extension never reads at or beyond it.
+ * @param[in] search_limit Lazy probes at ip+1 / ip+2 run only while they stay below this pointer.
+ * @param[in] anchor Lower bound for backward extension of a found match.
+ * @param[in,out] hash_table Pointer to the position table for match finding.
+ * @param[in,out] hash_tags Pointer to the tag table for fast rejection.
+ * @param[in,out] chain_table Pointer to the chain table for collision handling.
+ * @param[in] epoch_mark Current epoch marker for hash table invalidation.
+ * @param[in] offset_mask Mask isolating the position bits in chain/table entries.
+ * @param[in] level Compression level (picks the hash variant, tag-first filter, ip+2 probe).
+ * @param[in] p LZ77 parameters: search_depth, sufficient_len, use_lazy and the lazy limits.
+ * @param[in] last_off Most recently accepted match offset, used as a repeat-offset
+ *            seed probed before the hash-chain walk. Pass 0 to disable (all callers
+ *            except the level-6 optimal parser do so).
+ * @return zxc_match_t Best match (ref, len, backtrack); @c ref is NULL when nothing was
+ *         found or when a lazy probe at ip+1 / ip+2 found a sufficiently longer match.
+ */
+static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match(
+    const uint8_t* src, const uint8_t* ip, const uint8_t* iend, const uint8_t* search_limit,
+    const uint8_t* anchor, uint32_t* RESTRICT hash_table, uint8_t* RESTRICT hash_tags,
+    uint16_t* RESTRICT chain_table, const uint32_t epoch_mark, const uint32_t offset_mask,
+    const int level, const zxc_lz77_params_t p, const uint32_t last_off) {
+    const int use_hash5 = (level >= 3);
+    // Track the best match found so far.
+    //  ref is the pointer to the start of the match in the history buffer,
+    //  len is the match length, and backtrack is the distance from ip to ref.
+    //  Start with a sentinel length just below the minimum so any valid match will replace it.
+    zxc_match_t best = (zxc_match_t){NULL, ZXC_LZ_MIN_MATCH_LEN - 1, 0};
+
+    // Load the 8-byte sequence at the current position.
+    uint64_t cur_val8 = zxc_le64(ip);
+    uint32_t cur_val = (uint32_t)cur_val8;
+    uint32_t h = zxc_hash_func(cur_val8, use_hash5);
+
+    // 8-bit tag: XOR fold of first 4 bytes for fast rejection
+    const uint8_t cur_tag = (uint8_t)(cur_val ^ (cur_val >> 16));
+
+    // Current position in the input buffer expressed as a 32-bit index.
+    const uint32_t cur_pos = (uint32_t)(ip - src);
+
+    // Tag-first filter on fast levels.
+    const uint8_t stored_tag = hash_tags[h];
+    uint32_t match_idx;
+    if (level <= ZXC_LEVEL_FAST && stored_tag != cur_tag) {
+        match_idx = 0;
+    } else {
+        const uint32_t raw_head = hash_table[h];
+        match_idx = ((raw_head & ~offset_mask) == epoch_mark) ? (raw_head & offset_mask) : 0;
+    }
+
+    // skip_head still drives the chain walk on level >= 3 (advances past the
+    // mismatched head without comparing). On level <= 2 it is always 0 here:
+    // either match_idx == 0 (filter-skip) or stored_tag == cur_tag.
+    const int skip_head = (match_idx != 0) & (stored_tag != cur_tag);
+
+    // Split table writes
+    hash_table[h] = epoch_mark | cur_pos;
+    hash_tags[h] = cur_tag;
+
+    // Branchless chain table update
+    const uint32_t dist = cur_pos - match_idx;
+    const uint32_t valid_mask = -((int32_t)((match_idx != 0) & (dist < ZXC_LZ_WINDOW_SIZE)));
+    chain_table[cur_pos & ZXC_LZ_WINDOW_MASK] = (uint16_t)(dist & valid_mask);
+
+    int attempts = p.search_depth;
+
+    /* Repeat-offset seed (level-6 parser passes last_off; others pass 0).
+     * Probing the previous offset first often finds the longest match right
+     * away, speeding up the chain walk. It only raises best.len, never lowers
+     * it, so the result is unchanged - this is purely a speed optimization. */
+    if (last_off != 0U && last_off <= (uint32_t)ZXC_LZ_MAX_DIST && last_off <= cur_pos) {
+        const uint8_t* const rep_ref = src + (cur_pos - last_off);
+        if (zxc_le32(rep_ref) == cur_val) {
+            uint32_t mlen = sizeof(uint32_t);
+            const uint8_t* const limit_8 = iend - sizeof(uint64_t);
+            while (ip + mlen < limit_8) {
+                const uint64_t diff = zxc_le64(ip + mlen) ^ zxc_le64(rep_ref + mlen);
+                if (diff == 0) {
+                    mlen += sizeof(uint64_t);
+                } else {
+                    mlen += (uint32_t)(zxc_ctz64(diff) >> 3);
+                    goto _rep_done;
+                }
+            }
+            while (ip + mlen < iend && rep_ref[mlen] == ip[mlen]) mlen++;
+        _rep_done:;
+            best.len = mlen;
+            best.ref = rep_ref;
+            if (UNLIKELY(best.len >= (uint32_t)p.sufficient_len || ip + best.len >= iend))
+                goto _finalize_match;
+        }
+    }
+
+    if (match_idx == 0) goto _finalize_match;
+
+    // Optimization: If head tag doesn't match, advance immediately without loading the first
+    // mismatch.
+    if (skip_head) {
+        const uint16_t delta = chain_table[match_idx & ZXC_LZ_WINDOW_MASK];
+        const uint32_t next_idx = match_idx - delta;
+        match_idx = (delta != 0) ? next_idx : 0;
+        attempts--;
+    }
+
+    while (match_idx > 0) {
+        if (UNLIKELY(attempts-- < 0 || cur_pos - match_idx > ZXC_LZ_MAX_DIST)) break;
+        const uint8_t* ref = src + match_idx;
+
+        // Load the next chain link early (before the compare) so its address
+        // resolves while we prefetch.
+        const uint16_t delta = chain_table[match_idx & ZXC_LZ_WINDOW_MASK];
+        const uint32_t next_idx = match_idx - delta;
+        ZXC_PREFETCH_READ(src + next_idx);
+
+        const uint32_t ref_val = zxc_le32(ref);
+        const int tag_match = (ref_val == cur_val);
+        // Cheap gate: 4-byte tag match, then check the byte past the current
+        // best (the && skips that load unless the tag already matched).
+        const int should_compare = tag_match && (ref[best.len] == ip[best.len]);
+
+        if (should_compare) {
+            uint32_t mlen = sizeof(uint32_t);  // We already know the first 4 bytes match
+
+            // Fast path: Scalar 64-bit comparison for short matches (<= 64 bytes)
+            // Most matches are short, so this avoids SIMD overhead for common cases
+            const uint8_t* limit_8 = iend - sizeof(uint64_t);
+            const uint8_t* scalar_limit = ip + mlen + 64;
+            if (scalar_limit > limit_8) scalar_limit = limit_8;
+
+            while (ip + mlen < scalar_limit) {
+                uint64_t diff = zxc_le64(ip + mlen) ^ zxc_le64(ref + mlen);
+                if (diff == 0)
+                    mlen += sizeof(uint64_t);
+                else {
+                    mlen += (zxc_ctz64(diff) >> 3);
+                    goto _match_len_done;
+                }
+            }
+
+            // Long match path: Use SIMD for matches exceeding 64 bytes
+#if defined(ZXC_USE_AVX512)
+            const uint8_t* limit_64 = iend - 64;
+            while (ip + mlen < limit_64) {
+                const __m512i v_src = _mm512_loadu_si512((const void*)(ip + mlen));
+                const __m512i v_ref = _mm512_loadu_si512((const void*)(ref + mlen));
+                const __mmask64 mask = _mm512_cmpeq_epi8_mask(v_src, v_ref);
+                if (mask == 0xFFFFFFFFFFFFFFFF)
+                    mlen += 64;
+                else {
+                    mlen += (uint32_t)zxc_ctz64(~mask);
+                    goto _match_len_done;
+                }
+            }
+#elif defined(ZXC_USE_AVX2)
+            const uint8_t* limit_32 = iend - 32;
+            while (ip + mlen < limit_32) {
+                const __m256i v_src = _mm256_loadu_si256((const __m256i*)(ip + mlen));
+                const __m256i v_ref = _mm256_loadu_si256((const __m256i*)(ref + mlen));
+                const __m256i v_cmp = _mm256_cmpeq_epi8(v_src, v_ref);
+                const uint32_t mask = (uint32_t)_mm256_movemask_epi8(v_cmp);
+                if (mask == 0xFFFFFFFF)
+                    mlen += 32;
+                else {
+                    mlen += zxc_ctz32(~mask);
+                    goto _match_len_done;
+                }
+            }
+#elif defined(ZXC_USE_SSE2)
+            const uint8_t* limit_16 = iend - 16;
+            while (ip + mlen < limit_16) {
+                const __m128i v_src = _mm_loadu_si128((const __m128i*)(ip + mlen));
+                const __m128i v_ref = _mm_loadu_si128((const __m128i*)(ref + mlen));
+                const __m128i v_cmp = _mm_cmpeq_epi8(v_src, v_ref);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(v_cmp);
+                if (mask == 0xFFFFU)
+                    mlen += 16;
+                else {
+                    // mask != 0xFFFF => a differing byte exists in bits 0..15,
+                    // so the lowest set bit of ~mask lies in that range.
+                    mlen += zxc_ctz32(~mask);
+                    goto _match_len_done;
+                }
+            }
+#elif defined(ZXC_USE_NEON64)
+            {
+                const uint8_t* limit_32 = iend - 32;
+                while (ip + mlen < limit_32) {
+                    const uint8x16_t s0 = vld1q_u8(ip + mlen);
+                    const uint8x16_t r0 = vld1q_u8(ref + mlen);
+                    const uint8x16_t c0 = vceqq_u8(s0, r0);
+                    const uint64_t m0 = vget_lane_u64(
+                        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(c0), 4)), 0);
+                    if (UNLIKELY(m0 != ~(uint64_t)0)) {
+                        mlen += (uint32_t)(zxc_ctz64(~m0) >> 2);
+                        goto _match_len_done;
+                    }
+                    const uint8x16_t s1 = vld1q_u8(ip + mlen + 16);
+                    const uint8x16_t r1 = vld1q_u8(ref + mlen + 16);
+                    const uint8x16_t c1 = vceqq_u8(s1, r1);
+                    const uint64_t m1 = vget_lane_u64(
+                        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(c1), 4)), 0);
+                    if (UNLIKELY(m1 != ~(uint64_t)0)) {
+                        mlen += 16 + (uint32_t)(zxc_ctz64(~m1) >> 2);
+                        goto _match_len_done;
+                    }
+                    mlen += 32;
+                }
+                if (ip + mlen < iend - 16) {
+                    const uint8x16_t v_src = vld1q_u8(ip + mlen);
+                    const uint8x16_t v_ref = vld1q_u8(ref + mlen);
+                    const uint8x16_t v_cmp = vceqq_u8(v_src, v_ref);
+                    const uint64_t mask = vget_lane_u64(
+                        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(v_cmp), 4)), 0);
+                    if (LIKELY(mask == ~(uint64_t)0))
+                        mlen += 16;
+                    else {
+                        mlen += (uint32_t)(zxc_ctz64(~mask) >> 2);
+                        goto _match_len_done;
+                    }
+                }
+            }
+#elif defined(ZXC_USE_NEON32)
+            {
+                const uint8_t* limit_16 = iend - 16;
+                while (ip + mlen < limit_16) {
+                    const uint8x16_t v_src = vld1q_u8(ip + mlen);
+                    const uint8x16_t v_ref = vld1q_u8(ref + mlen);
+                    const uint8x16_t v_cmp = vceqq_u8(v_src, v_ref);
+                    uint8x8_t p1 = vpmin_u8(vget_low_u8(v_cmp), vget_high_u8(v_cmp));
+                    uint8x8_t p2 = vpmin_u8(p1, p1);
+                    uint8x8_t p3 = vpmin_u8(p2, p2);
+                    uint8x8_t p4 = vpmin_u8(p3, p3);
+                    uint8_t min_val = vget_lane_u8(p4, 0);
+                    if (min_val == 0xFF)
+                        mlen += 16;
+                    else {
+                        uint8x16_t v_diff = vmvnq_u8(v_cmp);
+                        uint64_t lo =
+                            (uint64_t)vgetq_lane_u32(vreinterpretq_u32_u8(v_diff), 0) |
+                            ((uint64_t)vgetq_lane_u32(vreinterpretq_u32_u8(v_diff), 1) << 32);
+                        if (lo != 0)
+                            mlen += (zxc_ctz64(lo) >> 3);
+                        else
+                            mlen +=
+                                8 + (zxc_ctz64(
+                                         (uint64_t)vgetq_lane_u32(vreinterpretq_u32_u8(v_diff), 2) |
+                                         ((uint64_t)vgetq_lane_u32(vreinterpretq_u32_u8(v_diff), 3)
+                                          << 32)) >>
+                                     3);
+                        goto _match_len_done;
+                    }
+                }
+            }
+#endif
+            while (ip + mlen < limit_8) {
+                const uint64_t diff = zxc_le64(ip + mlen) ^ zxc_le64(ref + mlen);
+                if (diff == 0)
+                    mlen += sizeof(uint64_t);
+                else {
+                    mlen += (zxc_ctz64(diff) >> 3);
+                    goto _match_len_done;
+                }
+            }
+            while (ip + mlen < iend && ref[mlen] == ip[mlen]) mlen++;
+
+        _match_len_done:;
+            const int better = (mlen > best.len);
+            best.len = better ? mlen : best.len;
+            best.ref = better ? ref : best.ref;
+
+            if (UNLIKELY(best.len >= (uint32_t)p.sufficient_len || ip + best.len >= iend)) break;
+        }
+
+        match_idx = (delta != 0) ? next_idx : 0;
+    }
+
+_finalize_match:
+    if (best.ref) {
+        // Extend the match backwards, never past anchor or the start of src
+        const uint8_t* b_ip = ip;
+        const uint8_t* b_ref = best.ref;
+        while (b_ip > anchor && b_ref > src && b_ip[-1] == b_ref[-1]) {
+            b_ip--;
+            b_ref--;
+            best.len++;
+            best.backtrack++;
+        }
+        best.ref = b_ref;
+    }
+
+    if (p.use_lazy && best.ref && best.len < (uint32_t)p.lazy_len_threshold &&
+        ip + 1 < search_limit) {
+        // --- Lazy evaluation at ip+1 ---
+        const uint64_t next_val8 = zxc_le64(ip + 1);
+        const uint32_t next_val = (uint32_t)next_val8;
+        const uint32_t h2 = zxc_hash_func(next_val8, use_hash5);
+        const uint8_t next_stored_tag = hash_tags[h2];
+        const uint32_t next_head = hash_table[h2];
+        uint32_t next_idx =
+            (next_head & ~offset_mask) == epoch_mark ? (next_head & offset_mask) : 0;
+        const uint8_t next_tag = (uint8_t)(next_val ^ (next_val >> 16));
+        const int skip_lazy_head = (next_idx > 0 && next_stored_tag != next_tag);
+        uint32_t max_lazy2 = 0;
+        int lazy_att = p.lazy_attempts;
+        int is_lazy_first = 1;
+
+        while (next_idx > 0) {
+            if (UNLIKELY(lazy_att-- <= 0 || (uint32_t)(ip + 1 - src) - next_idx > ZXC_LZ_MAX_DIST))
+                break;
+            const uint8_t* ref2 = src + next_idx;
+
+            if ((!is_lazy_first || !skip_lazy_head) && zxc_le32(ref2) == next_val) {
+                uint32_t l2 = sizeof(uint32_t);
+                const uint8_t* limit = iend - sizeof(uint64_t);
+
+                while (ip + 1 + l2 < limit) {
+                    const uint64_t v1 = zxc_le64(ip + 1 + l2);
+                    const uint64_t v2 = zxc_le64(ref2 + l2);
+                    if (v1 != v2) {
+                        l2 += (uint32_t)(zxc_ctz64(v1 ^ v2) >> 3);
+                        goto lazy2_done;
+                    }
+                    l2 += sizeof(uint64_t);
+                }
+                while (ip + 1 + l2 < iend && ref2[l2] == ip[1 + l2]) l2++;
+            lazy2_done:
+                max_lazy2 = l2 > max_lazy2 ? l2 : max_lazy2;
+            }
+
+            const uint16_t delta = chain_table[next_idx & ZXC_LZ_WINDOW_MASK];
+            if (UNLIKELY(delta == 0)) break;
+            next_idx -= delta;
+            is_lazy_first = 0;
+        }
+
+        // --- Lazy evaluation at ip+2 (computed in parallel, no dependency on lazy 1) ---
+        uint32_t max_lazy3 = 0;
+        if (level >= ZXC_LEVEL_BALANCED && ip + 2 < search_limit) {
+            const uint64_t val3_8 = zxc_le64(ip + 2);
+            const uint32_t val3 = (uint32_t)val3_8;
+            const uint32_t h3 = zxc_hash_func(val3_8, use_hash5);
+            const uint8_t tag3 = hash_tags[h3];
+            const uint32_t head3 = hash_table[h3];
+            uint32_t idx3 = (head3 & ~offset_mask) == epoch_mark ? (head3 & offset_mask) : 0;
+            const uint8_t cur_tag3 = (uint8_t)(val3 ^ (val3 >> 16));
+            const int skip_head3 = (idx3 > 0 && tag3 != cur_tag3);
+
+            int is_first3 = 1;
+            lazy_att = p.lazy_attempts;
+            while (idx3 > 0) {
+                if (UNLIKELY(lazy_att-- <= 0 || (uint32_t)(ip + 2 - src) - idx3 > ZXC_LZ_MAX_DIST))
+                    break;
+
+                const uint8_t* ref3 = src + idx3;
+                if ((!is_first3 || !skip_head3) && zxc_le32(ref3) == val3) {
+                    uint32_t l3 = sizeof(uint32_t);
+                    const uint8_t* limit = iend - sizeof(uint64_t);
+
+                    while (ip + 2 + l3 < limit) {
+                        const uint64_t v1 = zxc_le64(ip + 2 + l3);
+                        const uint64_t v2 = zxc_le64(ref3 + l3);
+                        if (v1 != v2) {
+                            l3 += (uint32_t)(zxc_ctz64(v1 ^ v2) >> 3);
+                            goto lazy3_done;
+                        }
+                        l3 += sizeof(uint64_t);
+                    }
+                    while (ip + 2 + l3 < iend && ref3[l3] == ip[2 + l3]) l3++;
+                lazy3_done:
+                    max_lazy3 = l3 > max_lazy3 ? l3 : max_lazy3;
+                }
+
+                const uint16_t delta = chain_table[idx3 & ZXC_LZ_WINDOW_MASK];
+                if (UNLIKELY(delta == 0)) break;
+                idx3 -= delta;
+                is_first3 = 0;
+            }
+        }
+
+        // Single decision: drop the match if ip+1 beats it by > 1 byte or ip+2 by > 2
+        if (max_lazy2 > best.len + 1 || max_lazy3 > best.len + 2) best.ref = NULL;
+    }
+
+    return best;
+}
+
+/**
+ * @brief Relax dp[p + L .. p + L_end) with one constant transition cost,
+ *        vectorized where the target ISA allows.
+ *
+ * For each length in [L, L_end), if @p nxt is strictly less than the
+ * current dp[p + length], dp/parent_len/parent_off are rewritten in
+ * lockstep, exactly like the scalar tail at the end of this function.
+ * Caller guarantees @p nxt does not vary across the requested span.
+ *
+ * Vectorized prologue per ISA, falling through to a scalar tail:
+ *   - AVX-512 BW + VL : 16-wide via vpcmpud + masked stores. Compiled only
+ *                       when __AVX512VL__ is defined; else the next branch.
+ *   - AVX2            : 8-wide via sign-biased vpcmpgtd + vpblendvb; the
+ *                       16-bit arrays use a packed 8x16 mask + 128-bit blend.
+ *   - NEON64 / NEON32 : 4-wide via vcgtq_u32 + vbslq_u32, with vmovn_u32
+ *                       to narrow the mask for the 4x16 16-bit updates.
+ *   - SSE2            : 4-wide, same bias trick as AVX2; blend and unsigned
+ *                       pack go through the zxc_mm_*_sse2 helpers.
+ *
+ * @param[in,out] dp         DP cost array; dp[p + L] is relaxed when
+ *                           @p nxt < dp[p + L].
+ * @param[in,out] parent_len Backtrack length array, written in lockstep
+ *                           with @p dp; receives the L of the relaxing
+ *                           transition.
+ * @param[in,out] parent_off Backtrack offset array, written in lockstep;
+ *                           receives @p off_biased on relaxation.
+ * @param[in]     p          Source DP position the transitions originate
+ *                           from. Indexing into the three arrays is
+ *                           `p + L`.
+ * @param[in]     L          Initial L value (start of the span, inclusive).
+ * @param[in]     L_end      End of the span (exclusive). Must satisfy
+ *                           @p L_end <= UINT16_MAX so every written length
+ *                           fits in @c parent_len's @c uint16_t cells.
+ * @param[in]     nxt        Constant successor cost `dp[p] + transition`,
+ *                           shared across the [L, L_end) span.
+ * @param[in]     off_biased Match offset minus ::ZXC_LZ_OFFSET_BIAS, the
+ *                           value stored when a transition wins.
+ * @return First L not processed: @p L_end, or the incoming @p L if the span is empty.
+ */
+// codeql[cpp/unused-static-function]: false positive
+static ZXC_ALWAYS_INLINE size_t zxc_opt_dp_update_const_cost(
+    uint32_t* RESTRICT dp, uint16_t* RESTRICT parent_len, uint16_t* RESTRICT parent_off,
+    const size_t p, size_t L, const size_t L_end, const uint32_t nxt, const uint16_t off_biased) {
+#if defined(ZXC_USE_AVX512) && defined(__AVX512VL__)
+    if (L + 16 <= L_end) {
+        const __m512i v_inc =
+            _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        const __m512i v_nxt = _mm512_set1_epi32((int)nxt);
+        const __m256i v_off = _mm256_set1_epi16((int16_t)off_biased);
+        for (; L + 16 <= L_end; L += 16) {
+            const __m512i v_L_lanes = _mm512_add_epi32(v_inc, _mm512_set1_epi32((int)L));
+            const __m512i v_dp = _mm512_loadu_si512((const void*)&dp[p + L]);
+            const __mmask16 m = _mm512_cmplt_epu32_mask(v_nxt, v_dp);
+            _mm512_mask_storeu_epi32(&dp[p + L], m, v_nxt);
+            const __m256i v_L_u16 = _mm512_cvtusepi32_epi16(v_L_lanes);
+            _mm256_mask_storeu_epi16((void*)&parent_len[p + L], m, v_L_u16);
+            _mm256_mask_storeu_epi16((void*)&parent_off[p + L], m, v_off);
+        }
+    }
+#elif defined(ZXC_USE_AVX2)
+    if (L + 8 <= L_end) {
+        const __m256i v_inc = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        const __m256i v_nxt = _mm256_set1_epi32((int)nxt);
+        const __m256i v_bias = _mm256_set1_epi32((int)0x80000000);
+        const __m256i v_nxt_b = _mm256_xor_si256(v_nxt, v_bias);
+        const __m128i v_off = _mm_set1_epi16((int16_t)off_biased);
+        for (; L + 8 <= L_end; L += 8) {
+            const __m256i v_L_lanes = _mm256_add_epi32(v_inc, _mm256_set1_epi32((int)L));
+            const __m256i v_dp = _mm256_loadu_si256((const __m256i*)&dp[p + L]);
+            /* Unsigned-compare-via-bias trick:
+             *   (dp ^ 0x80000000) > (nxt ^ 0x80000000)  iff  dp > nxt
+             * because XOR with the sign bit maps unsigned ordering to
+             * signed ordering. AVX2 only has signed cmpgt for 32-bit. */
+            const __m256i v_dp_b = _mm256_xor_si256(v_dp, v_bias);
+            const __m256i v_mask = _mm256_cmpgt_epi32(v_dp_b, v_nxt_b);
+            const __m256i v_dp_new = _mm256_blendv_epi8(v_dp, v_nxt, v_mask);
+            _mm256_storeu_si256((__m256i*)&dp[p + L], v_dp_new);
+            /* Pack 8x int32 mask -> 8x int16 mask with signed saturation:
+             * 0xFFFFFFFF -> 0xFFFF, 0x00000000 -> 0x0000. */
+            const __m128i v_mask16 = _mm_packs_epi32(_mm256_castsi256_si128(v_mask),
+                                                     _mm256_extracti128_si256(v_mask, 1));
+            const __m128i v_L_u16 = _mm_packus_epi32(_mm256_castsi256_si128(v_L_lanes),
+                                                     _mm256_extracti128_si256(v_L_lanes, 1));
+            const __m128i v_pl = _mm_loadu_si128((const __m128i*)&parent_len[p + L]);
+            const __m128i v_pl_new = _mm_blendv_epi8(v_pl, v_L_u16, v_mask16);
+            _mm_storeu_si128((__m128i*)&parent_len[p + L], v_pl_new);
+            const __m128i v_po = _mm_loadu_si128((const __m128i*)&parent_off[p + L]);
+            const __m128i v_po_new = _mm_blendv_epi8(v_po, v_off, v_mask16);
+            _mm_storeu_si128((__m128i*)&parent_off[p + L], v_po_new);
+        }
+    }
+#elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
+    if (L + 4 <= L_end) {
+        static const uint32_t k_inc_array[4] = {0, 1, 2, 3};
+        const uint32x4_t v_inc = vld1q_u32(k_inc_array);
+        const uint32x4_t v_nxt = vdupq_n_u32(nxt);
+        const uint16x4_t v_off = vdup_n_u16(off_biased);
+        for (; L + 4 <= L_end; L += 4) {
+            const uint32x4_t v_L_lanes = vaddq_u32(v_inc, vdupq_n_u32((uint32_t)L));
+            const uint32x4_t v_dp = vld1q_u32(&dp[p + L]);
+            const uint32x4_t v_mask = vcgtq_u32(v_dp, v_nxt);
+            vst1q_u32(&dp[p + L], vbslq_u32(v_mask, v_nxt, v_dp));
+            const uint16x4_t v_mask16 = vmovn_u32(v_mask);
+            const uint16x4_t v_L_u16 = vqmovn_u32(v_L_lanes);
+            const uint16x4_t v_pl = vld1_u16(&parent_len[p + L]);
+            vst1_u16(&parent_len[p + L], vbsl_u16(v_mask16, v_L_u16, v_pl));
+            const uint16x4_t v_po = vld1_u16(&parent_off[p + L]);
+            vst1_u16(&parent_off[p + L], vbsl_u16(v_mask16, v_off, v_po));
+        }
+    }
+#elif defined(ZXC_USE_SSE2)
+    if (L + 4 <= L_end) {
+        const __m128i v_inc = _mm_setr_epi32(0, 1, 2, 3);
+        const __m128i v_nxt = _mm_set1_epi32((int)nxt);
+        const __m128i v_bias = _mm_set1_epi32((int)0x80000000);
+        const __m128i v_nxt_b = _mm_xor_si128(v_nxt, v_bias);
+        const __m128i v_off = _mm_set1_epi16((short)off_biased);
+        for (; L + 4 <= L_end; L += 4) {
+            const __m128i v_L_lanes = _mm_add_epi32(v_inc, _mm_set1_epi32((int)L));
+            const __m128i v_dp = _mm_loadu_si128((const __m128i*)&dp[p + L]);
+            /* Unsigned compare via sign-bit bias (SSE2 cmpgt is signed only):
+             *   (dp ^ 0x80000000) > (nxt ^ 0x80000000)  iff  dp > nxt. */
+            const __m128i v_dp_b = _mm_xor_si128(v_dp, v_bias);
+            const __m128i v_mask = _mm_cmpgt_epi32(v_dp_b, v_nxt_b);
+            const __m128i v_dp_new = zxc_mm_blendv_epi8_sse2(v_dp, v_nxt, v_mask);
+            _mm_storeu_si128((__m128i*)&dp[p + L], v_dp_new);
+            /* Narrow the 4x int32 mask / length lanes to 4x int16 (low 64 bits).
+             * packs: 0xFFFFFFFF -> 0xFFFF, 0 -> 0; packus (SSE2-emulated): u32->u16. */
+            const __m128i v_mask16 = _mm_packs_epi32(v_mask, v_mask);
+            const __m128i v_L_u16 = zxc_mm_packus_epi32_sse2(v_L_lanes, v_L_lanes);
+            __m128i v_pl = _mm_loadl_epi64((const __m128i*)&parent_len[p + L]);
+            v_pl = zxc_mm_blendv_epi8_sse2(v_pl, v_L_u16, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_len[p + L], v_pl);
+            __m128i v_po = _mm_loadl_epi64((const __m128i*)&parent_off[p + L]);
+            v_po = zxc_mm_blendv_epi8_sse2(v_po, v_off, v_mask16);
+            _mm_storel_epi64((__m128i*)&parent_off[p + L], v_po);
+        }
+    }
+#endif
+    /* Scalar tail (and full path on archs without SIMD).
+     * L < L_end <= UINT16_MAX (caller precondition), so the cast is lossless. */
+    for (; L < L_end; L++) {
+        if (nxt < dp[p + L]) {
+            dp[p + L] = nxt;
+            parent_len[p + L] = (uint16_t)L;
+            parent_off[p + L] = off_biased;
+        }
+    }
+    return L;
+}
+
+/**
+ * @brief Predict the per-byte literal cost of a block by feeding a sampled
+ *        byte histogram to the length-limited Huffman code-length builder.
+ *
+ * The block is sampled at a fixed stride (`src_sz >> 12` once the block
+ * exceeds 4096 bytes, otherwise every byte). The resulting histogram goes
+ * through ::zxc_huf_build_code_lengths and the sample-weighted mean code
+ * length, rounded up, is reported as the bits/byte estimate. No calibration
+ * constants or per-corpus tuning are involved. The result is capped at
+ * CHAR_BIT because RAW literals are always available at that price, so a
+ * Huffman estimate that is not cheaper would never be selected.
+ *
+ * @param[in] src     Block bytes to sample.
+ * @param[in] src_sz  Number of bytes in @p src.
+ * @param[in] scratch Working memory forwarded to the code-length builder
+ *                    (pre-allocated in the cctx for level >= 6). May be
+ *                    `NULL`, in which case the builder allocates its own.
+ * @return Estimated literal cost in bits per byte, capped at CHAR_BIT.
+ */
+// codeql[cpp/unused-static-function]: false positive
+static uint32_t zxc_opt_estimate_lit_bits(const uint8_t* RESTRICT src, const size_t src_sz,
+                                          void* RESTRICT scratch) {
+    /* Below the sampling threshold the RAW price is charged as-is. */
+    if (UNLIKELY(src_sz < ZXC_OPT_LIT_SAMPLE_MIN)) return CHAR_BIT;
+
+    /* Strided sampling: one histogram bump per visited byte. */
+    const size_t stride = (src_sz > 4096) ? (src_sz >> 12) : 1U;
+    uint32_t freq[ZXC_HUF_NUM_SYMBOLS] = {0};
+    size_t n_samples = 0;
+    for (size_t pos = 0; pos < src_sz; pos += stride, n_samples++) freq[src[pos]]++;
+
+    uint8_t lengths[ZXC_HUF_NUM_SYMBOLS];
+    if (UNLIKELY(zxc_huf_build_code_lengths(freq, lengths, scratch) != ZXC_OK)) return CHAR_BIT;
+
+    /* The sum of (count x code length) is the predicted Huffman bit total
+     * for the sample. The DP prices in whole bits, so the mean is rounded
+     * up; that is the conservative direction and leans slightly toward
+     * matches over fractional-cost literals. */
+    uint64_t bit_sum = 0;
+    for (int sym = 0; sym < ZXC_HUF_NUM_SYMBOLS; sym++) {
+        bit_sum += (uint64_t)freq[sym] * (uint64_t)lengths[sym];
+    }
+    const uint32_t mean_bits = (uint32_t)((bit_sum + n_samples - 1) / n_samples);
+
+    /* RAW always costs CHAR_BIT bits/byte: when Huffman cannot beat it on
+     * the sample the encoder picks RAW, so never report more than that. */
+    if (mean_bits >= CHAR_BIT) return CHAR_BIT;
+    return mean_bits;
+}
+
+/**
+ * @brief Static price-based optimal LZ77 parser for level 6.
+ *
+ * Forward DP over the block's positions: `dp[p]` = min bit cost to encode
+ * `src[0..p)`. Per-position transitions are
+ *   - literal: `dp[p+1] = min(dp[p+1], dp[p] + lit_cost)`
+ *   - match  : `dp[p+L] = min(dp[p+L], dp[p] + match_cost(L))` for L in
+ *              `[MIN_MATCH, max_L]`
+ * where `max_L` is the longest match found by ::zxc_lz77_find_best_match at
+ * `p` (with lazy disabled, the DP itself handles position-based
+ * optimization). Positions are relative to the block start, so backtracking
+ * from `dp[block_sz]` reconstructs the optimal token sequence.
+ *
+ * Complexity guard: ::ZXC_OPT_LONG_MATCH_SKIP causes ::zxc_lz77_find_best_match
+ * to be skipped at positions strictly inside a long match. Without this
+ * guard, highly repetitive data (e.g. Lorem-loop with multi-MB matches at
+ * every offset) makes the parser quadratic and unit tests run for minutes.
+ * The inner sub-length update loop visits every L from `MIN_MATCH` to
+ * `max_L`; the skip threshold means each long-match region only pays its
+ * O(L) cost once at the starting position, keeping total work O(N).
+ *
+ * @param[in,out] ctx           Compression context. Its `opt_scratch` buffer
+ *                              provides the DP arrays; this function never
+ *                              allocates, so the buffer must already hold
+ *                              the `needed` bytes computed in the body.
+ * @param[in]  src              Source buffer: [dict prefix | block data].
+ * @param[in]  src_sz           Length of @p src in bytes (dict prefix included).
+ * @param[in,out] hash_table    LZ77 hash table (epoch | position entries).
+ * @param[in,out] hash_tags     8-bit fast-rejection tags paired with @p hash_table.
+ * @param[in,out] chain_table   Hash-chain link table (ring buffer).
+ * @param[in]  epoch_mark       Current epoch shifted into the high bits.
+ * @param[in]  offset_mask      Mask isolating the position bits in chain entries.
+ * @param[in]  level            Compression level (used to size the matcher).
+ * @param[out] literals         Buffer receiving the gathered literal bytes.
+ * @param[out] buf_tokens       Buffer receiving the per-sequence token bytes.
+ * @param[out] buf_offsets      Buffer receiving the per-sequence offsets.
+ * @param[out] buf_extras       Buffer receiving variable-length overflow data.
+ * @param[out] seq_c_out        Number of emitted sequences.
+ * @param[out] lit_c_out        Number of literal bytes written into @p literals.
+ * @param[out] extras_sz_out    Number of bytes written into @p buf_extras.
+ * @param[out] max_offset_out   Largest biased offset emitted (used by the caller
+ *                              to choose 1-byte vs 2-byte offset encoding).
+ * @return @c ZXC_OK on success, or a negative @ref zxc_error_t.
+ */
+static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint32_t* RESTRICT hash_table,
+                                      uint8_t* RESTRICT hash_tags, uint16_t* RESTRICT chain_table,
+                                      const uint32_t epoch_mark, const uint32_t offset_mask,
+                                      const int level, uint8_t* RESTRICT literals,
+                                      uint8_t* RESTRICT buf_tokens, uint16_t* RESTRICT buf_offsets,
+                                      uint8_t* RESTRICT buf_extras, uint32_t* RESTRICT seq_c_out,
+                                      size_t* RESTRICT lit_c_out, size_t* RESTRICT extras_sz_out,
+                                      uint16_t* RESTRICT max_offset_out) {
+    zxc_lz77_params_t lzp_opt = zxc_get_lz77_params(level);
+    lzp_opt.use_lazy = 0;  // guard
+
+    /* When a dictionary is active, src = [dict | block_data]. DP arrays are
+     * indexed relative to the block start (position dict_sz in src). The
+     * variable src_base points to the first block byte for literal copies,
+     * while src remains the base for the match finder (absolute positions). */
+    const size_t dict_sz = ctx->dict_size;
+    const size_t block_sz = src_sz - dict_sz;
+    const uint8_t* const src_base = src + dict_sz;
+    const uint8_t* const iend = src + src_sz;
+
+    /* Block too small for any match: emit all as literals. */
+    if (UNLIKELY(block_sz < ZXC_LZ_SEARCH_MARGIN + 1)) {
+        if (block_sz > 0) ZXC_MEMCPY(literals, src_base, block_sz);
+        *lit_c_out = block_sz;
+        *seq_c_out = 0;
+        *extras_sz_out = 0;
+        *max_offset_out = 0;
+        return 0;
+    }
+
+    const size_t search_limit_pos = block_sz - ZXC_LZ_SEARCH_MARGIN;
+    /* Absolute limit for the matcher: ip is src_base + p, so the limit is
+     * anchored at src_base too (anchoring at src lands dict_sz bytes early). */
+    const uint8_t* const search_limit = src_base + search_limit_pos;
+
+    /* DP arrays carved from ctx->opt_scratch, a single buffer reused across
+     * blocks. Each sub-buffer is cache-line padded so the next one starts
+     * on a 64 B boundary. The total `needed` must match
+     * zxc_estimate_cctx_size(); keep the formulas in sync.
+     *
+     *   dp             : (chunk+1) x uint32_t: min cost to reach position p.
+     *   parent_len     : (chunk+1) x uint16_t: 0 = literal, >= MIN_MATCH = match.
+     *   parent_off     : (chunk+1) x uint16_t: biased match offset (distance-1).
+     *   match_end_bits : ceil((chunk+1)/64) x uint64_t: 1 bit per position,
+     *                                                  set when that position
+     *                                                  is the end of a match
+     *                                                  on the chosen DP path.
+     *                                                  Replaces a forward-order
+     *                                                  actions[] stack at 1/64
+     *                                                  the cost.
+     *
+     * The same buffer is reused as transient scratch for the length-limited
+     * Huffman code-length builder (see zxc_opt_estimate_lit_bits below and
+     * the Huffman selection in zxc_encode_block_glo): the package-merge
+     * scratch is needed before the DP runs and again after the parse has
+     * been read out, so the lifetimes never overlap. The capacity is the
+     * larger of the two demands. */
+    const size_t chunk = ctx->chunk_size;
+    const size_t sz_dp = ZXC_ALIGN_CL((chunk + 1) * sizeof(uint32_t));
+    const size_t sz_pl = ZXC_ALIGN_CL((chunk + 1) * sizeof(uint16_t));
+    const size_t sz_po = ZXC_ALIGN_CL((chunk + 1) * sizeof(uint16_t));
+    const size_t n_bm_words = ZXC_BITMAP_WORDS(chunk + 1);
+    const size_t sz_bm = ZXC_ALIGN_CL(n_bm_words * sizeof(uint64_t));
+    const size_t dp_needed = sz_dp + sz_pl + sz_po + sz_bm;
+    const size_t needed =
+        (dp_needed > ZXC_HUF_BUILD_SCRATCH_SIZE) ? dp_needed : ZXC_HUF_BUILD_SCRATCH_SIZE;
+
+    /* opt_scratch is pre-allocated inside ctx->memory_block by zxc_cctx_init
+     * when level >= ZXC_LEVEL_DENSITY; the formula above must stay in sync
+     * with zxc_cctx_init() and zxc_estimate_cctx_size(). */
+    (void)needed;
+
+    /* Per-block literal cost (sample only block data, not dict prefix): */
+    const uint32_t lit_cost = zxc_opt_estimate_lit_bits(src_base, block_sz, ctx->opt_scratch);
+
+    uint32_t* const dp = (uint32_t*)ctx->opt_scratch;
+    uint16_t* const parent_len = (uint16_t*)(ctx->opt_scratch + sz_dp);
+    uint16_t* const parent_off = (uint16_t*)(ctx->opt_scratch + sz_dp + sz_pl);
+    uint64_t* const match_end_bits = (uint64_t*)(ctx->opt_scratch + sz_dp + sz_pl + sz_po);
+
+    dp[0] = 0;
+    ZXC_MEMSET(dp + 1, 0xFF, block_sz * sizeof(uint32_t));
+    ZXC_MEMSET(parent_len, 0, sz_pl + sz_po + sz_bm);
+
+    /* Forward DP: visit every position, update reachable successors.
+     * `skip_until` skips find_best_match at positions strictly inside the
+     * last long match: the DP transition from the start of the match
+     * already covers dp[p+1..p+L], and re-searching at every intra-match
+     * position is what makes the parser quadratic on repetitive inputs. */
+    size_t skip_until = 0;
+    /* Rolling repeat-offset seed for find_best_match */
+    uint32_t last_off = 0;
+    for (size_t p = 0; p < search_limit_pos; p++) {
+        if (UNLIKELY(dp[p] == UINT32_MAX)) continue;
+
+        /* Literal transition. */
+        const uint32_t lit_next = dp[p] + lit_cost;
+        if (lit_next < dp[p + 1]) {
+            dp[p + 1] = lit_next;
+            parent_len[p + 1] = 0;
+        }
+
+        if (p < skip_until) continue;
+
+        /* Match transition: call find_best_match (no lazy, no backtrack via
+         * anchor=ip). Iterate sub-lengths since any L <= max_L matches at the
+         * same offset and may end at a more useful DP position.
+         * ip uses absolute position (src + dict_sz + p) so match finder
+         * resolves dict references correctly via src as base. */
+        const uint8_t* ip = src_base + p;
+        const zxc_match_t m = zxc_lz77_find_best_match(
+            src, ip, iend, search_limit, /*anchor=*/ip, hash_table, hash_tags, chain_table,
+            epoch_mark, offset_mask, level, lzp_opt, last_off);
+
+        if (m.ref) {
+            const uint32_t off = (uint32_t)(ip - m.ref);
+            if (off > 0 && off <= ZXC_LZ_WINDOW_SIZE) {
+                last_off = off;
+                const size_t L_max_raw = (m.len > block_sz - p) ? (block_sz - p) : (size_t)m.len;
+                const size_t L_max = (L_max_raw > UINT16_MAX) ? UINT16_MAX : L_max_raw;
+
+                /* The L-iteration cost function is piecewise constant in
+                 * varint segments. Split the [MIN_MATCH, L_max] span into:
+                 *   1. cheap   : v < ML_MASK            -> cost = base
+                 *   2. varint1 : v in [ML_MASK, ML_MASK + 128) -> cost = base + 8
+                 *   3. varint2+: v >= ML_MASK + 128     -> cost = base + 16, +24, ...
+                 *
+                 * Steps 1 and 2 use constant nxt and are vectorized via
+                 * the helper. Step 3 is rare (typical matches are short)
+                 * and stays scalar. */
+                const uint16_t off_biased = (uint16_t)(off - ZXC_LZ_OFFSET_BIAS);
+                const size_t L_max_plus = L_max + 1;
+                size_t L = ZXC_LZ_MIN_MATCH_LEN;
+
+                /* 1. Cheap range. */
+                {
+                    const size_t L_cheap_end = ZXC_LZ_MIN_MATCH_LEN + ZXC_TOKEN_ML_MASK;
+                    const size_t L_end = (L_max_plus < L_cheap_end) ? L_max_plus : L_cheap_end;
+                    const uint32_t nxt = dp[p] + ZXC_OPT_MATCH_COST_BASE;
+                    L = zxc_opt_dp_update_const_cost(dp, parent_len, parent_off, p, L, L_end, nxt,
+                                                     off_biased);
+                }
+
+                /* 2. First varint level (1-byte extension). */
+                if (L < L_max_plus) {
+                    const size_t L_v1_end = ZXC_LZ_MIN_MATCH_LEN + ZXC_TOKEN_ML_MASK + 128;
+                    const size_t L_end = (L_max_plus < L_v1_end) ? L_max_plus : L_v1_end;
+                    const uint32_t nxt = dp[p] + ZXC_OPT_MATCH_COST_BASE + CHAR_BIT;
+                    L = zxc_opt_dp_update_const_cost(dp, parent_len, parent_off, p, L, L_end, nxt,
+                                                     off_biased);
+                }
+
+                /* 3. Higher varint levels: variable cost, kept scalar.
+                 * Reached only by L >= ML_MASK + 128 + MIN_MATCH, so the
+                 * v >= ML_MASK guard from the original loop is implied. */
+                for (; L < L_max_plus; L++) {
+                    uint32_t cost = ZXC_OPT_MATCH_COST_BASE;
+                    uint32_t v = (uint32_t)(L - ZXC_LZ_MIN_MATCH_LEN) - ZXC_TOKEN_ML_MASK;
+                    cost += CHAR_BIT;
+                    while (v >= 128) {
+                        v >>= 7;
+                        cost += CHAR_BIT;
+                    }
+                    const uint32_t nxt = dp[p] + cost;
+                    if (nxt < dp[p + L]) {
+                        dp[p + L] = nxt;
+                        parent_len[p + L] = (uint16_t)L;
+                        parent_off[p + L] = off_biased;
+                    }
+                }
+                if (UNLIKELY(L_max >= ZXC_OPT_LONG_MATCH_SKIP)) skip_until = p + L_max - 1;
+            }
+        }
+    }
+
+    /* Tail (last ZXC_LZ_SEARCH_MARGIN bytes) can only be literals: the match finder
+     * stops at search_limit so its 8-byte probe reads stay in bounds. */
+    for (size_t p = search_limit_pos; p < block_sz; p++) {
+        if (UNLIKELY(dp[p] == UINT32_MAX)) continue;
+        const uint32_t lit_next = dp[p] + lit_cost;
+        if (lit_next < dp[p + 1]) {
+            dp[p + 1] = lit_next;
+            parent_len[p + 1] = 0;
+        }
+    }
+
+    /* Backtrack from block_sz to 0: only match endpoints are recorded (one bit
+     * per position in match_end_bits). Literals between matches are implicit
+     * runs of unmarked positions and are reconstructed during forward emission
+     * via lit_start tracking, so they need no backtrack storage. */
+    {
+        size_t pos = block_sz;
+        while (pos > 0) {
+            const uint32_t L = parent_len[pos];
+            if (L == 0) {
+                pos -= 1;
+            } else {
+                match_end_bits[pos >> 6] |= (uint64_t)1 << (pos & 63);
+                pos -= L;
+            }
+        }
+    }
+
+    /* Forward emission: walk match_end_bits word-by-word, peeling set bits
+     * with ctzll. Each set bit gives a match endpoint; parent_len/parent_off
+     * at that position recover (length, offset). */
+    uint32_t seq_c = 0;
+    size_t lit_c = 0;
+    size_t extras_sz = 0;
+    uint16_t max_offset = 0;
+    size_t lit_start = 0;
+
+    for (size_t word_idx = 0; word_idx < n_bm_words; word_idx++) {
+        uint64_t w = match_end_bits[word_idx];
+        while (w) {
+            const size_t pos = (word_idx << 6) + (size_t)zxc_ctz64(w);
+            w &= w - 1;
+            const uint32_t L = parent_len[pos];
+            const uint16_t off_biased = parent_off[pos];
+            const size_t match_start = pos - L;
+
+            const size_t LL = match_start - lit_start;
+            if (LL > 0) {
+                ZXC_MEMCPY(literals + lit_c, src_base + lit_start, LL);
+                lit_c += LL;
+            }
+            const uint32_t ll = (uint32_t)LL;
+            const uint32_t ml = L - ZXC_LZ_MIN_MATCH_LEN;
+            const uint8_t ll_code = (ll >= ZXC_TOKEN_LL_MASK) ? ZXC_TOKEN_LL_MASK : (uint8_t)ll;
+            const uint8_t ml_code = (ml >= ZXC_TOKEN_ML_MASK) ? ZXC_TOKEN_ML_MASK : (uint8_t)ml;
+            buf_tokens[seq_c] = (ll_code << ZXC_TOKEN_LIT_BITS) | ml_code;
+            buf_offsets[seq_c] = off_biased;
+            if (off_biased > max_offset) max_offset = off_biased;
+
+            if (UNLIKELY(ll >= ZXC_TOKEN_LL_MASK)) {
+                const size_t n = zxc_write_varint(buf_extras + extras_sz, ll - ZXC_TOKEN_LL_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_sz += n;
+            }
+            if (UNLIKELY(ml >= ZXC_TOKEN_ML_MASK)) {
+                const size_t n = zxc_write_varint(buf_extras + extras_sz, ml - ZXC_TOKEN_ML_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_sz += n;
+            }
+
+            seq_c++;
+            lit_start = pos;
+        }
+    }
+
+    /* Tail literals after the last match (or all literals if no match). */
+    if (lit_start < block_sz) {
+        const size_t tail = block_sz - lit_start;
+        ZXC_MEMCPY(literals + lit_c, src_base + lit_start, tail);
+        lit_c += tail;
+    }
+
+    *seq_c_out = seq_c;
+    *lit_c_out = lit_c;
+    *extras_sz_out = extras_sz;
+    *max_offset_out = max_offset;
+    return 0;
+}
+
+/**
+ * @brief Pre-populates the LZ77 hash/chain tables with dictionary positions.
+ *
+ * With a dictionary active, @p src holds [dict_content | block_data]. Entries
+ * for dictionary positions are inserted here so that block encoding can find
+ * matches that start inside the dictionary prefix.
+ *
+ * @param[in]     src         Buffer whose first @p dict_size bytes are the dictionary.
+ * @param[in]     dict_size   Dictionary prefix length in bytes.
+ * @param[in,out] hash_table  Hash heads (epoch | position), updated per seeded position.
+ * @param[in,out] hash_tags   8-bit rejection tags written alongside @p hash_table.
+ * @param[in,out] chain_table Chain deltas, indexed by position & ZXC_LZ_WINDOW_MASK.
+ * @param[in]     epoch_mark  Epoch bits OR-ed into every stored hash head.
+ * @param[in]     offset_mask Mask selecting the position bits of a hash head.
+ * @param[in]     level       Compression level; levels >= 3 select the hash5 variant.
+ */
+static void zxc_lz_seed_dict(const uint8_t* RESTRICT src, const size_t dict_size,
+                             uint32_t* RESTRICT hash_table, uint8_t* RESTRICT hash_tags,
+                             uint16_t* RESTRICT chain_table, const uint32_t epoch_mark,
+                             const uint32_t offset_mask, const int level) {
+    if (UNLIKELY(dict_size < ZXC_LZ_MIN_MATCH_LEN)) return;
+
+    const int hash5 = (level >= 3);
+    const size_t seed_end = dict_size - (ZXC_LZ_MIN_MATCH_LEN - 1);
+    const size_t dense_start = seed_end / 2;
+
+    /* First half: sparse pass, every 4th position. Each entry replaces the
+     * hash head and starts a fresh chain (delta 0). */
+    size_t pos = 0;
+    while (pos < dense_start) {
+        const uint64_t word = zxc_le64(src + pos);
+        const uint32_t lo32 = (uint32_t)word;
+        const uint32_t slot = zxc_hash_func(word, hash5);
+        hash_table[slot] = epoch_mark | (uint32_t)pos;
+        hash_tags[slot] = (uint8_t)(lo32 ^ (lo32 >> 16));
+        chain_table[(uint32_t)pos & ZXC_LZ_WINDOW_MASK] = 0;
+        pos += 4;
+    }
+
+    /* Second half: dense pass. Positions near the end of the dictionary give
+     * the shortest offsets and are the likeliest match sources, so every one
+     * is inserted and linked to the previous head of its bucket. */
+    for (pos = dense_start; pos < seed_end; pos++) {
+        const uint64_t word = zxc_le64(src + pos);
+        const uint32_t lo32 = (uint32_t)word;
+        const uint32_t slot = zxc_hash_func(word, hash5);
+        const uint32_t here = (uint32_t)pos;
+        const uint32_t head = hash_table[slot];
+        const uint32_t prev = ((head & ~offset_mask) == epoch_mark) ? (head & offset_mask) : 0;
+
+        hash_table[slot] = epoch_mark | here;
+        hash_tags[slot] = (uint8_t)(lo32 ^ (lo32 >> 16));
+        const uint32_t gap = here - prev;
+        const uint32_t linked = (uint32_t)((prev != 0) & (gap < ZXC_LZ_WINDOW_SIZE));
+        chain_table[here & ZXC_LZ_WINDOW_MASK] = (uint16_t)(gap & (0U - linked));
+    }
+}
+
+/**
+ * @brief Encodes a data block using the General (GLO) compression format.
+ *
+ * This function implements the core LZ77 compression logic. It dynamically
+ * adjusts compression parameters (search depth, lazy matching strategy, and
+ * step skipping) based on the compression level configured in the context.
+ * The per-level values come from zxc_get_lz77_params() and are consumed by
+ * zxc_lz77_find_best_match(); both are defined outside this function.
+ *
+ * **LZ77 Implementation Details:**
+ * 1. **Hash Chain:** Uses a hash table (`ctx->hash_table`) to find potential
+ *    match positions. Collisions are handled via a `chain_table`, allowing us
+ *    to search deeper into the history for a better match. Each hash entry
+ *    carries the current epoch above its position bits, so entries written
+ *    for an earlier block are ignored without clearing the table; the hash
+ *    table and tags are only zeroed when the epoch reaches `ctx->max_epoch`.
+ * 2. **Dictionary:** When `ctx->dict_size` is non-zero, the first `dict_size`
+ *    bytes of `src` are indexed with zxc_lz_seed_dict() and parsing starts
+ *    right after them.
+ * 3. **Lazy Matching:** If a match is found, we check the *next* byte to see
+ *    if it produces a longer match. If so, we output a literal and take the
+ *    better match. This is enabled for levels >= 3. (This logic is not in
+ *    this function's body; confirm it in zxc_lz77_find_best_match().)
+ * 4. **Step Skipping:** When no match is found the cursor advances by
+ *    `lzp.step_base + (distance since the last sequence >> lzp.step_shift)`,
+ *    falling back to 1 close to the search limit. For lower levels (1-3), we
+ *    skip bytes to increase speed (`step > 1`). For levels 4+, we process
+ *    every byte to maximize compression ratio. (Level thresholds are set by
+ *    zxc_get_lz77_params(); confirm there.)
+ * 5. **SIMD Match Finding:** Uses AVX2/AVX512/NEON to compare 32/64 bytes at a
+ *    time during match length calculation, significantly speeding up long
+ *    match verification (done by the match finder, not in this body).
+ * 6. **Optimal Parse:** For `level >= ZXC_LEVEL_DENSITY` the greedy loop is
+ *    replaced by zxc_lz77_optimal_parse_glo(), which fills the same outputs.
+ * 7. **Literal Encoding Selection:** The collected literals are sized under
+ *    RLE and, for `level >= ZXC_LEVEL_DENSITY`, under Huffman and (when
+ *    `ctx->dict_huf_lengths` is set) dictionary Huffman. A candidate replaces
+ *    the current choice only if it is smaller by more than 1/32 (~3%) of the
+ *    current size; the two Huffman variants are compared directly.
+ *
+ * The encoding process consists of:
+ * 1. **LZ77 Parsing**: The function iterates through the source data,
+ *    maintaining a hash chain to find repeated patterns (matches).
+ * 2. **Sequence Storage**: Matches are converted into sequences consisting of
+ *    literal lengths, match lengths, and offsets. Each sequence yields one
+ *    token byte (literal-length code and match-length code), one 16-bit
+ *    offset biased by ZXC_LZ_OFFSET_BIAS, and a varint in the extras buffer
+ *    for every length that saturates its token field.
+ * 3. **Serialization**: Writes the GLO header and section descriptors, then
+ *    the literals (RAW, RLE, Huffman or dictionary Huffman), the token
+ *    bytes, the offsets (one byte each when every biased offset is <= 255,
+ *    otherwise two bytes little-endian) and the extras bytes, and finally
+ *    the block header holding the payload size. `enc_litlen` and `enc_mlen`
+ *    are always written as 0 here.
+ *
+ * @param[in,out] ctx   Pointer to the compression context containing hash
+ *                      tables, scratch buffers and configuration.
+ * @param[in] src       Pointer to the input source data (dictionary bytes
+ *                      first when `ctx->dict_size` > 0).
+ * @param[in] src_sz    Size of the input data in bytes.
+ * @param[out] dst      Pointer to the destination buffer where compressed
+ *                      data will be written.
+ * @param[in] dst_cap   Maximum capacity of the destination buffer.
+ * @param[out] out_sz   Pointer to a variable that will receive the total size
+ *                      of the compressed output (block header + payload).
+ *
+ * @return ZXC_OK on success, or a negative zxc_error_t code:
+ *         ZXC_ERROR_DST_TOO_SMALL when a section does not fit in the remaining
+ *         capacity or a Huffman encoder returns a size different from the
+ *         precomputed one; ZXC_ERROR_OVERFLOW when zxc_write_varint() reports
+ *         0 bytes written; otherwise the error returned by the optimal
+ *         parser, the header writers or the Huffman encoders.
+ */
+static int zxc_encode_block_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                const size_t src_sz, uint8_t* RESTRICT dst, size_t dst_cap,
+                                size_t* RESTRICT out_sz) {
+    const int level = ctx->compression_level;
+    const size_t dict_sz = ctx->dict_size;
+
+    const zxc_lz77_params_t lzp = zxc_get_lz77_params(level);
+
+    ctx->epoch++;  // new epoch: hash entries stamped by earlier blocks no longer match
+    if (UNLIKELY(ctx->epoch >= ctx->max_epoch)) {  // epoch range exhausted: wipe and restart
+        ZXC_MEMSET(ctx->hash_table, 0, ZXC_LZ_HASH_SIZE * sizeof(uint32_t));
+        ZXC_MEMSET(ctx->hash_tags, 0, ZXC_LZ_HASH_SIZE * sizeof(uint8_t));
+        ctx->epoch = 1;
+    }
+    const uint32_t offset_bits = ctx->offset_bits;
+    const uint32_t offset_mask = ctx->offset_mask;
+    const uint32_t epoch_mark = ctx->epoch << offset_bits;  // epoch tag above the position bits
+
+    if (dict_sz > 0)  // index the dictionary bytes at the start of src before parsing
+        zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table,
+                         epoch_mark, offset_mask, level);
+
+    const uint8_t* ip = src + dict_sz;  // parsing starts right after the dictionary bytes
+    const uint8_t* iend = src + src_sz;
+    const uint8_t* anchor = ip;  // first byte not yet emitted as literal or match
+    const uint8_t* search_limit = iend - ZXC_LZ_SEARCH_MARGIN;  // no match search past this
+
+    uint32_t* const hash_table = ctx->hash_table;
+    uint8_t* const hash_tags = ctx->hash_tags;
+    uint16_t* const chain_table = ctx->chain_table;
+    uint8_t* const literals = ctx->literals;
+    uint8_t* const buf_tokens = ctx->buf_tokens;
+    uint16_t* const buf_offsets = ctx->buf_offsets;
+    uint8_t* const buf_extras = ctx->buf_extras;
+
+    uint32_t seq_c = 0;    // number of sequences emitted
+    size_t lit_c = 0;      // number of literal bytes collected
+    size_t extras_sz = 0;  // bytes used in buf_extras
+    uint16_t max_offset = 0;  // Track max offset for 1-byte/2-byte mode decision
+
+    /* Level 6+: price-based optimal parser (fills outputs and skips the
+     * lazy loop + last_lits handling below via `goto parse_done`). */
+    if (level >= ZXC_LEVEL_DENSITY) {
+        const int rc = zxc_lz77_optimal_parse_glo(
+            ctx, src, src_sz, hash_table, hash_tags, chain_table, epoch_mark, offset_mask, level,
+            literals, buf_tokens, buf_offsets, buf_extras, &seq_c, &lit_c, &extras_sz, &max_offset);
+        if (UNLIKELY(rc != 0)) return rc;
+        goto parse_done;
+    }
+
+    while (LIKELY(ip < search_limit)) {
+        const size_t dist = (size_t)(ip - anchor);
+        size_t step = lzp.step_base + (dist >> lzp.step_shift);  // stride grows with literal run
+        if (UNLIKELY(ip + step >= search_limit)) step = 1;
+
+        if (LIKELY(ip + step + sizeof(uint64_t) <= iend)) {  // prefetch slots for the next probe
+            const uint64_t v_next = zxc_le64(ip + step);
+            // cppcheck-suppress unreadVariable
+            const uint32_t h_next = zxc_hash_func(v_next, 1);
+            ZXC_PREFETCH_READ(&hash_tags[h_next]);
+            ZXC_PREFETCH_READ(&hash_table[h_next]);
+        }
+
+        const zxc_match_t m =
+            zxc_lz77_find_best_match(src, ip, iend, search_limit, anchor, hash_table, hash_tags,
+                                     chain_table, epoch_mark, offset_mask, level, lzp,
+                                     /*last_off=*/0U);
+
+        if (m.ref) {
+            ip -= m.backtrack;  // rewind by the amount reported by the match finder
+            const uint32_t ll = (uint32_t)(ip - anchor);         // literal run length
+            const uint32_t ml = m.len - ZXC_LZ_MIN_MATCH_LEN;    // match length minus minimum
+            const uint32_t off = (uint32_t)(ip - m.ref);         // backward distance to the match
+
+            if (ll > 0) {
+                if (LIKELY(anchor + ZXC_PAD_SIZE <= iend)) {  // a full padded read stays in src
+                    zxc_copy32(literals + lit_c, anchor);  // presumably fixed-size; ll bytes kept
+                    if (UNLIKELY(ll > ZXC_PAD_SIZE)) {
+                        ZXC_MEMCPY(literals + lit_c + ZXC_PAD_SIZE, anchor + ZXC_PAD_SIZE,
+                                   ll - ZXC_PAD_SIZE);
+                    }
+                } else {
+                    ZXC_MEMCPY(literals + lit_c, anchor, ll);
+                }
+                lit_c += ll;
+            }
+
+            /* Token byte: literal-length code above the match-length code. A
+             * code equal to its mask means "saturated"; the excess length is
+             * appended to buf_extras as a varint below. */
+            const uint8_t ll_code = (ll >= ZXC_TOKEN_LL_MASK) ? ZXC_TOKEN_LL_MASK : (uint8_t)ll;
+            const uint8_t ml_code = (ml >= ZXC_TOKEN_ML_MASK) ? ZXC_TOKEN_ML_MASK : (uint8_t)ml;
+            buf_tokens[seq_c] = (ll_code << ZXC_TOKEN_LIT_BITS) | ml_code;
+            buf_offsets[seq_c] = (uint16_t)(off - ZXC_LZ_OFFSET_BIAS);  // offsets stored biased
+            if ((off - ZXC_LZ_OFFSET_BIAS) > max_offset)
+                max_offset = (uint16_t)(off - ZXC_LZ_OFFSET_BIAS);
+
+            if (ll >= ZXC_TOKEN_LL_MASK) {
+                const size_t n = zxc_write_varint(buf_extras + extras_sz, ll - ZXC_TOKEN_LL_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_sz += n;
+            }
+            if (ml >= ZXC_TOKEN_ML_MASK) {
+                const size_t n = zxc_write_varint(buf_extras + extras_sz, ml - ZXC_TOKEN_ML_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_sz += n;
+            }
+
+            seq_c++;
+
+            if (m.len > 2 && level > ZXC_LEVEL_BALANCED) {  // index position match_end - 2
+                const uint8_t* match_end = ip + m.len;
+                if (match_end < iend - 7) {  // keeps the 8-byte load below inside src
+                    const uint32_t pos_u = (uint32_t)((match_end - 2) - src);
+                    const uint64_t val_u8 = zxc_le64(match_end - 2);
+                    const uint32_t val_u = (uint32_t)val_u8;
+                    const uint32_t h_u = zxc_hash_func(val_u8, 1);
+                    const uint32_t prev_head = hash_table[h_u];
+                    const uint32_t prev_idx =
+                        (prev_head & ~offset_mask) == epoch_mark ? (prev_head & offset_mask) : 0;
+                    hash_table[h_u] = epoch_mark | pos_u;
+                    hash_tags[h_u] = (uint8_t)(val_u ^ (val_u >> 16));
+                    chain_table[pos_u & ZXC_LZ_WINDOW_MASK] =
+                        (prev_idx > 0 && (pos_u - prev_idx) < ZXC_LZ_WINDOW_SIZE)
+                            ? (uint16_t)(pos_u - prev_idx)
+                            : 0;
+                }
+            }
+
+            ip += m.len;
+            anchor = ip;
+        } else {
+            ip += step;
+        }
+    }
+
+    const size_t last_lits = iend - anchor;  // unmatched tail is stored as trailing literals
+    if (last_lits > 0) {
+        ZXC_MEMCPY(literals + lit_c, anchor, last_lits);
+        lit_c += last_lits;
+    }
+
+parse_done:;
+    /* Dictionary-table trainer hook: accumulate the REAL post-LZ literal
+     * frequencies (see zxc_train_dict_huf). Cold path, NULL outside training. */
+    if (UNLIKELY(ctx->lit_freq_acc != NULL)) {
+        for (size_t i = 0; i < lit_c; i++) ctx->lit_freq_acc[literals[i]]++;
+    }
+
+    // --- RLE ANALYSIS --- (sizing pass only: rle_size mirrors what the RLE writer below emits)
+    size_t rle_size = 0;
+    int enc_lit = ZXC_SECTION_ENCODING_RAW;
+
+    if (lit_c > 0) {
+        const uint8_t* p = literals;
+        const uint8_t* const p_end = literals + lit_c;
+        const uint8_t* const p_end_4 = p_end - 3;  // Safe limit for 4-byte lookahead
+
+        while (LIKELY(p < p_end)) {
+            const uint8_t b = *p;
+            const uint8_t* run_start = p++;
+
+            // Fast run counting with early SIMD exit
+#if defined(ZXC_USE_AVX512)
+            const __m512i vb = _mm512_set1_epi8((char)b);
+            while (p <= p_end - 64) {
+                const __m512i v = _mm512_loadu_si512((const void*)p);
+                const __mmask64 mask = _mm512_cmpeq_epi8_mask(v, vb);
+                if (mask != 0xFFFFFFFFFFFFFFFFULL) {
+                    p += (size_t)zxc_ctz64(~mask);
+                    goto _run_done;
+                }
+                p += 64;
+            }
+#elif defined(ZXC_USE_AVX2)
+            const __m256i vb = _mm256_set1_epi8((char)b);
+            while (p <= p_end - 32) {
+                const __m256i v = _mm256_loadu_si256((const __m256i*)p);
+                const uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, vb));
+                if (mask != 0xFFFFFFFF) {
+                    p += zxc_ctz32(~mask);
+                    goto _run_done;
+                }
+                p += 32;
+            }
+#elif defined(ZXC_USE_SSE2)
+            const __m128i vb = _mm_set1_epi8((char)b);
+            while (p <= p_end - 16) {
+                const __m128i v = _mm_loadu_si128((const __m128i*)p);
+                const uint32_t mask = (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vb));
+                if (mask != 0xFFFFU) {
+                    p += zxc_ctz32(~mask);
+                    goto _run_done;
+                }
+                p += 16;
+            }
+#elif defined(ZXC_USE_NEON64)
+            const uint8x16_t vb = vdupq_n_u8(b);
+            while (p <= p_end - 16) {
+                const uint8x16_t v = vld1q_u8(p);
+                const uint8x16_t eq = vceqq_u8(v, vb);
+                /* SHRN nibble-mask: see find_best_match above for rationale.
+                 * Each input byte maps to 4 mask bits, hence the >> 2 below. */
+                const uint64_t mask =
+                    vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(eq), 4)), 0);
+                if (LIKELY(mask == ~(uint64_t)0)) {
+                    p += 16;
+                } else {
+                    p += (size_t)(zxc_ctz64(~mask) >> 2);
+                    goto _run_done;
+                }
+            }
+#elif defined(ZXC_USE_NEON32)
+            uint8x16_t vb = vdupq_n_u8(b);
+            while (p <= p_end - 16) {
+                uint8x16_t v = vld1q_u8(p);
+                uint8x16_t eq = vceqq_u8(v, vb);
+                uint8x16_t not_eq = vmvnq_u8(eq);
+
+                // 32-bit ARM NEON doesn't always support vgetq_lane_u64 / vreinterpretq_u64_u8 so
+                // we treat the 128-bit vector as 4 x 32-bit lanes (8 mask bits per byte: >> 3)
+                const uint32x4_t neq32 = vreinterpretq_u32_u8(not_eq);
+                const uint32_t l0 = vgetq_lane_u32(neq32, 0);
+                const uint32_t l1 = vgetq_lane_u32(neq32, 1);
+
+                const uint64_t lo = ((uint64_t)l1 << 32) | l0;
+                if (lo != 0) {
+                    p += (size_t)(zxc_ctz64(lo) >> 3);
+                    goto _run_done;
+                }
+
+                const uint32_t h0 = vgetq_lane_u32(neq32, 2);
+                const uint32_t h1 = vgetq_lane_u32(neq32, 3);
+                const uint64_t hi = ((uint64_t)h1 << 32) | h0;
+
+                if (hi != 0) {
+                    p += 8 + (zxc_ctz64(hi) >> 3);
+                    goto _run_done;
+                }
+                p += 16;
+            }
+#endif
+            while (p < p_end && *p == b) p++;  // scalar tail (and the only path without SIMD)
+
+#if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
+        _run_done:;
+#endif
+            const size_t run = (size_t)(p - run_start);
+
+            if (run >= 4) {
+                // RLE run: 2 bytes per 131 values, then remainder
+                // Branchless: full_chunks * 2 + remainder handling
+                const size_t full_chunks = run / 131;
+                const size_t rem = run - full_chunks * 131;  // Avoid modulo
+                rle_size += full_chunks * 2;
+                // Remainder: if >= 4 -> 2 bytes (RLE), else 1 + rem (literal)
+                if (rem >= 4)
+                    rle_size += 2;
+                else if (rem > 0)
+                    rle_size += 1 + rem;
+            } else {
+                // Literal run: scan ahead with fast SIMD lookahead
+                const uint8_t* lit_start = run_start;
+
+#if defined(ZXC_USE_AVX512)
+                while (p <= p_end_4 - 64) {
+                    const __m512i v0 = _mm512_loadu_si512((const void*)p);
+                    const __m512i v1 = _mm512_loadu_si512((const void*)(p + 1));
+                    const __m512i v2 = _mm512_loadu_si512((const void*)(p + 2));
+                    const __m512i v3 = _mm512_loadu_si512((const void*)(p + 3));
+                    const __mmask64 mask = _mm512_cmpeq_epi8_mask(v0, v1) &
+                                           _mm512_cmpeq_epi8_mask(v1, v2) &
+                                           _mm512_cmpeq_epi8_mask(v2, v3);
+                    if (mask != 0) {
+                        p += (size_t)zxc_ctz64(mask);
+                        goto _lit_done;
+                    }
+                    p += 64;
+                }
+#elif defined(ZXC_USE_AVX2)
+                while (p <= p_end_4 - 32) {
+                    __m256i v0 = _mm256_loadu_si256((const __m256i*)p);
+                    __m256i v1 = _mm256_loadu_si256((const __m256i*)(p + 1));
+                    __m256i v2 = _mm256_loadu_si256((const __m256i*)(p + 2));
+                    __m256i v3 = _mm256_loadu_si256((const __m256i*)(p + 3));
+                    __m256i vend = _mm256_and_si256(
+                        _mm256_cmpeq_epi8(v0, v1),
+                        _mm256_and_si256(_mm256_cmpeq_epi8(v1, v2), _mm256_cmpeq_epi8(v2, v3)));
+                    uint32_t mask = (uint32_t)_mm256_movemask_epi8(vend);
+                    if (mask != 0) {
+                        p += zxc_ctz32(mask);
+                        goto _lit_done;
+                    }
+                    p += 32;
+                }
+#elif defined(ZXC_USE_SSE2)
+                while (p <= p_end_4 - 16) {
+                    __m128i v0 = _mm_loadu_si128((const __m128i*)p);
+                    __m128i v1 = _mm_loadu_si128((const __m128i*)(p + 1));
+                    __m128i v2 = _mm_loadu_si128((const __m128i*)(p + 2));
+                    __m128i v3 = _mm_loadu_si128((const __m128i*)(p + 3));
+                    __m128i vend = _mm_and_si128(
+                        _mm_cmpeq_epi8(v0, v1),
+                        _mm_and_si128(_mm_cmpeq_epi8(v1, v2), _mm_cmpeq_epi8(v2, v3)));
+                    uint32_t mask = (uint32_t)_mm_movemask_epi8(vend);
+                    if (mask != 0) {
+                        p += zxc_ctz32(mask);
+                        goto _lit_done;
+                    }
+                    p += 16;
+                }
+#elif defined(ZXC_USE_NEON64)
+                while (p <= p_end_4 - 16) {
+                    uint8x16_t v0 = vld1q_u8(p);
+                    uint8x16_t v1 = vld1q_u8(p + 1);
+                    uint8x16_t v2 = vld1q_u8(p + 2);
+                    uint8x16_t v3 = vld1q_u8(p + 3);
+                    uint8x16_t eq =
+                        vandq_u8(vceqq_u8(v0, v1), vandq_u8(vceqq_u8(v1, v2), vceqq_u8(v2, v3)));
+                    /* Dual of the run scan: searching for the FIRST set
+                     * nibble (a position where 4 consecutive bytes match).
+                     * mask == 0 means no break found in this 16-byte
+                     * window. Same SHRN compression as elsewhere. */
+                    const uint64_t mask = vget_lane_u64(
+                        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(eq), 4)), 0);
+                    if (LIKELY(mask == 0)) {
+                        p += 16;
+                    } else {
+                        p += (size_t)(zxc_ctz64(mask) >> 2);
+                        goto _lit_done;
+                    }
+                }
+#elif defined(ZXC_USE_NEON32)
+                while (p <= p_end_4 - 16) {
+                    uint8x16_t v0 = vld1q_u8(p);
+                    uint8x16_t v1 = vld1q_u8(p + 1);
+                    uint8x16_t v2 = vld1q_u8(p + 2);
+                    uint8x16_t v3 = vld1q_u8(p + 3);
+                    uint8x16_t eq =
+                        vandq_u8(vceqq_u8(v0, v1), vandq_u8(vceqq_u8(v1, v2), vceqq_u8(v2, v3)));
+
+                    uint32x4_t eq32 = vreinterpretq_u32_u8(eq);
+                    uint32_t l0 = vgetq_lane_u32(eq32, 0);
+                    uint32_t l1 = vgetq_lane_u32(eq32, 1);
+                    uint64_t lo = ((uint64_t)l1 << 32) | l0;
+
+                    if (lo != 0) {
+                        p += (zxc_ctz64(lo) >> 3);
+                        goto _lit_done;
+                    }
+
+                    uint32_t h0 = vgetq_lane_u32(eq32, 2);
+                    uint32_t h1 = vgetq_lane_u32(eq32, 3);
+                    uint64_t hi = ((uint64_t)h1 << 32) | h0;
+
+                    if (hi != 0) {
+                        p += 8 + (zxc_ctz64(hi) >> 3);
+                        goto _lit_done;
+                    }
+                    p += 16;
+                }
+#endif
+                while (p < p_end_4) {
+                    // Check for RLE opportunity (4 identical bytes)
+                    if (UNLIKELY(p[0] == p[1] && p[1] == p[2] && p[2] == p[3])) break;
+                    p++;
+                }
+                // Handle remaining bytes near end
+                while (p < p_end) {
+                    if (UNLIKELY(p + 3 < p_end && p[0] == p[1] && p[1] == p[2] && p[2] == p[3]))
+                        break;
+                    p++;
+                }
+
+#if defined(ZXC_USE_AVX512) || defined(ZXC_USE_AVX2) || defined(ZXC_USE_NEON64) || \
+    defined(ZXC_USE_NEON32) || defined(ZXC_USE_SSE2)
+            _lit_done:;
+#endif
+                const size_t lit_run = (size_t)(p - lit_start);
+                // 1 header per 128 bytes + all data bytes
+                // lit_run + ceil(lit_run / 128)
+                rle_size += lit_run + ((lit_run + 127) >> 7);
+            }
+        }
+
+        // Threshold: ~3% savings using integer math (97% ~= 1 - 1/32)
+        if (rle_size < lit_c - (lit_c >> 5)) enc_lit = ZXC_SECTION_ENCODING_RLE;
+    }
+
+    /* Level >= 6: also evaluate Huffman as a 3rd literal-encoding candidate.
+     * Build a histogram and length-limited canonical code lengths, compute the
+     * exact byte size of the 4-way interleaved bitstream + 134-byte header,
+     * and switch to HUFFMAN if it beats the current choice by >= 3%.
+     * The histogram is accumulated in four partial tables (freq0..freq3)
+     * by an unrolled loop and summed into freq afterwards. */
+    uint8_t huf_code_len[ZXC_HUF_NUM_SYMBOLS];
+    size_t huf_total_size = SIZE_MAX;
+    if (level >= ZXC_LEVEL_DENSITY && lit_c >= ZXC_HUF_MIN_LITERALS) {
+        uint32_t freq0[ZXC_HUF_NUM_SYMBOLS] = {0};
+        uint32_t freq1[ZXC_HUF_NUM_SYMBOLS] = {0};
+        uint32_t freq2[ZXC_HUF_NUM_SYMBOLS] = {0};
+        uint32_t freq3[ZXC_HUF_NUM_SYMBOLS] = {0};
+        {
+            size_t i = 0;
+            for (; i + 4 <= lit_c; i += 4) {
+                freq0[literals[i + 0]]++;
+                freq1[literals[i + 1]]++;
+                freq2[literals[i + 2]]++;
+                freq3[literals[i + 3]]++;
+            }
+            for (; i < lit_c; i++) freq0[literals[i]]++;
+        }
+        uint32_t freq[ZXC_HUF_NUM_SYMBOLS];
+        for (int k = 0; k < ZXC_HUF_NUM_SYMBOLS; k++) {
+            freq[k] = freq0[k] + freq1[k] + freq2[k] + freq3[k];
+        }
+
+        if (zxc_huf_build_code_lengths(freq, huf_code_len, ctx->opt_scratch) == ZXC_OK) {
+            const size_t Q = (lit_c + ZXC_HUF_NUM_STREAMS - 1) / ZXC_HUF_NUM_STREAMS;  // per stream
+            size_t streams_bytes = 0;
+            for (int s = 0; s < ZXC_HUF_NUM_STREAMS; s++) {
+                size_t start = (size_t)s * Q;
+                size_t stop = start + Q;
+                if (start > lit_c) start = lit_c;
+                if (stop > lit_c) stop = lit_c;
+                uint64_t b0 = 0;
+                uint64_t b1 = 0;
+                uint64_t b2 = 0;
+                uint64_t b3 = 0;
+                size_t i = start;
+
+                for (; i + 4 <= stop; i += 4) {
+                    b0 += huf_code_len[literals[i + 0]];
+                    b1 += huf_code_len[literals[i + 1]];
+                    b2 += huf_code_len[literals[i + 2]];
+                    b3 += huf_code_len[literals[i + 3]];
+                }
+                uint64_t bits = b0 + b1 + b2 + b3;
+                for (; i < stop; i++) bits += huf_code_len[literals[i]];
+                streams_bytes += (size_t)((bits + 7) / 8);  // each stream is rounded up to bytes
+            }
+            huf_total_size = ZXC_HUF_HEADER_SIZE + streams_bytes;
+            const size_t baseline = (enc_lit == ZXC_SECTION_ENCODING_RLE) ? rle_size : lit_c;
+            /* Threshold: 3% savings (1/32) over the chosen RAW/RLE baseline.
+             * Same heuristic as the RAW/RLE switch above. */
+            if (huf_total_size < baseline - (baseline >> 5)) {
+                enc_lit = ZXC_SECTION_ENCODING_HUFFMAN;
+            }
+        }
+    }
+
+    /* Shared dictionary table candidate (level >= 6, dict with table attached):
+     * same bitstream as HUFFMAN but no 128-byte lengths header, so it stays
+     * viable on literal sections far below ZXC_HUF_MIN_LITERALS. Exact size
+     * accounting; invalid (a literal byte without a code) drops the candidate. */
+    size_t huf_dict_total_size = SIZE_MAX;
+    uint8_t dict_code_len[ZXC_HUF_NUM_SYMBOLS];
+    if (level >= ZXC_LEVEL_DENSITY && ctx->dict_huf_lengths != NULL && lit_c > 0 &&
+        zxc_huf_unpack_lengths(ctx->dict_huf_lengths, dict_code_len) == ZXC_OK) {
+        const size_t Q = (lit_c + ZXC_HUF_NUM_STREAMS - 1) / ZXC_HUF_NUM_STREAMS;
+        size_t streams_bytes = 0;
+        int valid = 1;
+        for (int s = 0; s < ZXC_HUF_NUM_STREAMS && valid; s++) {
+            size_t start = (size_t)s * Q;
+            size_t stop = start + Q;
+            if (start > lit_c) start = lit_c;
+            if (stop > lit_c) stop = lit_c;
+            uint64_t bits = 0;
+            for (size_t i = start; i < stop; i++) {
+                const int len = dict_code_len[literals[i]];
+                if (UNLIKELY(len == 0)) {  // this literal has no code in the dictionary table
+                    valid = 0;
+                    break;
+                }
+                bits += (uint64_t)len;
+            }
+            streams_bytes += (size_t)((bits + 7) / 8);
+        }
+        if (valid) {
+            huf_dict_total_size = (size_t)ZXC_HUF_STREAM_SIZES_HEADER_SIZE + streams_bytes;
+            if (enc_lit == ZXC_SECTION_ENCODING_HUFFMAN) {
+                /* Both candidates are Huffman bitstreams: pick the smaller. */
+                if (huf_dict_total_size < huf_total_size)
+                    enc_lit = ZXC_SECTION_ENCODING_HUFFMAN_DICT;
+            } else {
+                const size_t baseline = (enc_lit == ZXC_SECTION_ENCODING_RLE) ? rle_size : lit_c;
+                /* Same 3% (1/32) margin as the other encoding switches. */
+                if (huf_dict_total_size < baseline - (baseline >> 5))
+                    enc_lit = ZXC_SECTION_ENCODING_HUFFMAN_DICT;
+            }
+        }
+    }
+
+    zxc_block_header_t bh = {.block_type = ZXC_BLOCK_GLO};
+    uint8_t* const p = dst + ZXC_BLOCK_HEADER_SIZE;  // payload starts after the block header
+    size_t rem = dst_cap - ZXC_BLOCK_HEADER_SIZE;
+
+    // Decide offset encoding mode: 1-byte if all offsets <= 255
+    const int use_8bit_off = (max_offset <= 255) ? 1 : 0;
+    const size_t off_stream_size = use_8bit_off ? seq_c : (seq_c * 2);
+
+    const zxc_gnr_header_t gh = {.n_sequences = seq_c,
+                                 .n_literals = (uint32_t)lit_c,
+                                 .enc_lit = enc_lit,
+                                 .enc_litlen = 0,
+                                 .enc_mlen = 0,
+                                 .enc_off = (uint8_t)use_8bit_off};
+
+    zxc_section_desc_t desc[ZXC_GLO_SECTIONS] = {0};
+    const size_t lit_section_size = (enc_lit == ZXC_SECTION_ENCODING_RLE)       ? rle_size
+                                    : (enc_lit == ZXC_SECTION_ENCODING_HUFFMAN) ? huf_total_size
+                                    : (enc_lit == ZXC_SECTION_ENCODING_HUFFMAN_DICT)
+                                        ? huf_dict_total_size
+                                        : lit_c;
+    /* Each descriptor packs two 32-bit sizes: the low half is the number of
+     * bytes written for the section (read back below through
+     * ZXC_SECTION_SIZE_MASK). Only the literal section differs between the
+     * halves: its high half holds lit_c. */
+    desc[0].sizes = (uint64_t)lit_section_size | ((uint64_t)lit_c << 32);
+    desc[1].sizes = (uint64_t)seq_c | ((uint64_t)seq_c << 32);
+    desc[2].sizes = (uint64_t)off_stream_size | ((uint64_t)off_stream_size << 32);
+    desc[3].sizes = (uint64_t)extras_sz | ((uint64_t)extras_sz << 32);
+
+    const int ghs = zxc_write_glo_header_and_desc(p, rem, &gh, desc);
+    if (UNLIKELY(ghs < 0)) return ghs;
+
+    uint8_t* p_curr = p + ghs;
+    rem -= ghs;
+
+    // Extract stream sizes once
+    const size_t sz_lit = (size_t)(desc[0].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_tok = (size_t)(desc[1].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_off = (size_t)(desc[2].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_ext = (size_t)(desc[3].sizes & ZXC_SECTION_SIZE_MASK);
+
+    if (UNLIKELY(rem < sz_lit)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    if (enc_lit == ZXC_SECTION_ENCODING_HUFFMAN) {
+        const int written = zxc_huf_encode_section(literals, lit_c, huf_code_len, p_curr, rem);
+        if (UNLIKELY(written < 0)) return written;
+        if (UNLIKELY((size_t)written != huf_total_size)) return ZXC_ERROR_DST_TOO_SMALL;
+        p_curr += written;  // must equal the size already recorded in desc[0]
+    } else if (enc_lit == ZXC_SECTION_ENCODING_HUFFMAN_DICT) {
+        const int written =
+            zxc_huf_encode_section_dict(literals, lit_c, dict_code_len, p_curr, rem);
+        if (UNLIKELY(written < 0)) return written;
+        if (UNLIKELY((size_t)written != huf_dict_total_size)) return ZXC_ERROR_DST_TOO_SMALL;
+        p_curr += written;  // must equal the size already recorded in desc[0]
+    } else if (enc_lit == ZXC_SECTION_ENCODING_RLE) {
+        // Write RLE - optimized single-pass encoding
+        const uint8_t* lit_ptr = literals;
+        const uint8_t* const lit_end = literals + lit_c;
+
+        while (lit_ptr < lit_end) {
+            uint8_t b = *lit_ptr;
+            const uint8_t* run_start = lit_ptr++;
+
+            // Count run length
+            while (lit_ptr < lit_end && *lit_ptr == b) lit_ptr++;
+            size_t run = (size_t)(lit_ptr - run_start);
+
+            if (run >= 4) {
+                // RLE runs: emit 2-byte tokens (header + value)
+                while (run >= 4) {
+                    size_t chunk = (run > 131) ? 131 : run;
+                    *p_curr++ = (uint8_t)(ZXC_LIT_RLE_FLAG | (chunk - 4));  // flag + (length - 4)
+                    *p_curr++ = b;
+                    run -= chunk;
+                }
+                // Leftover < 4 bytes: emit as literal
+                if (run > 0) {
+                    *p_curr++ = (uint8_t)(run - 1);  // literal header stores length - 1
+                    ZXC_MEMCPY(p_curr, lit_ptr - run, run);
+                    p_curr += run;
+                }
+            } else {
+                // Literal run: scan ahead to find next RLE opportunity
+                const uint8_t* lit_run_start = run_start;
+
+                while (lit_ptr < lit_end) {
+                    // Quick check: need 4 identical bytes to break
+                    if (UNLIKELY(lit_ptr + 3 < lit_end && lit_ptr[0] == lit_ptr[1] &&
+                                 lit_ptr[1] == lit_ptr[2] && lit_ptr[2] == lit_ptr[3])) {
+                        break;
+                    }
+                    lit_ptr++;
+                }
+
+                size_t lit_run = (size_t)(lit_ptr - lit_run_start);
+                const uint8_t* src_ptr = lit_run_start;
+
+                // Emit literal chunks (max 128 bytes each)
+                while (lit_run > 0) {
+                    size_t chunk = (lit_run > 128) ? 128 : lit_run;
+                    *p_curr++ = (uint8_t)(chunk - 1);  // literal header stores length - 1
+                    ZXC_MEMCPY(p_curr, src_ptr, chunk);
+                    p_curr += chunk;
+                    src_ptr += chunk;
+                    lit_run -= chunk;
+                }
+            }
+        }
+    } else {
+        ZXC_MEMCPY(p_curr, literals, lit_c);  // RAW: literals stored verbatim
+        p_curr += lit_c;
+    }
+    rem -= sz_lit;
+
+    if (UNLIKELY(rem < sz_tok)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    ZXC_MEMCPY(p_curr, buf_tokens, seq_c);  // one token byte per sequence
+    p_curr += seq_c;
+    rem -= sz_tok;
+
+    if (UNLIKELY(rem < sz_off)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    if (use_8bit_off) {
+        // Write 1-byte offsets - unroll for better throughput
+        uint32_t i = 0;
+        for (; i + 8 <= seq_c; i += 8) {
+            p_curr[0] = (uint8_t)buf_offsets[i + 0];
+            p_curr[1] = (uint8_t)buf_offsets[i + 1];
+            p_curr[2] = (uint8_t)buf_offsets[i + 2];
+            p_curr[3] = (uint8_t)buf_offsets[i + 3];
+            p_curr[4] = (uint8_t)buf_offsets[i + 4];
+            p_curr[5] = (uint8_t)buf_offsets[i + 5];
+            p_curr[6] = (uint8_t)buf_offsets[i + 6];
+            p_curr[7] = (uint8_t)buf_offsets[i + 7];
+            p_curr += 8;
+        }
+        for (; i < seq_c; i++) {
+            *p_curr++ = (uint8_t)buf_offsets[i];
+        }
+    } else {
+        // Write 2-byte offsets in little-endian order
+#ifdef ZXC_BIG_ENDIAN
+        for (uint32_t i = 0; i < seq_c; i++) {
+            zxc_store_le16(p_curr, buf_offsets[i]);
+            p_curr += sizeof(uint16_t);
+        }
+#else
+        ZXC_MEMCPY(p_curr, buf_offsets, seq_c * sizeof(uint16_t));
+        p_curr += seq_c * sizeof(uint16_t);
+#endif
+    }
+    rem -= sz_off;
+
+    if (UNLIKELY(rem < sz_ext)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    ZXC_MEMCPY(p_curr, buf_extras, extras_sz);
+    p_curr += extras_sz;
+
+    bh.comp_size = (uint32_t)(p_curr - (dst + ZXC_BLOCK_HEADER_SIZE));  // payload bytes only
+    const int hw = zxc_write_block_header(dst, dst_cap, &bh);
+    if (UNLIKELY(hw < 0)) return hw;
+
+    // Checksum will be appended by the wrapper
+    *out_sz = ZXC_BLOCK_HEADER_SIZE + bh.comp_size;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Encodes a data block using the General High Velocity (GHI) compression format.
+ *
+ * 1. Compression Strategy
+ * It uses an LZ77-based algorithm with a sliding window (64KB) and a hash table/chain table
+ * mechanism.
+ *
+ * 2. Token Format (Fixed-Width)
+ * Unlike the standard GLO block which uses 1-byte tokens (4-bit literal length / 4-bit match
+ * length), GHI uses 4-byte (32-bit) sequence records for better performance on long runs:
+ * Literal Length (LL): 8 bits (stores 0-254; 255 indicates overflow).
+ * Match Length (ML): 8 bits (length minus ZXC_LZ_MIN_MATCH_LEN, 0-254; 255 indicates overflow).
+ * Offset: 16 bits (distance minus ZXC_LZ_OFFSET_BIAS; supports the full 64KB window).
+ * Overflowing lengths store their excess as a varint in the extras stream, so the decoder only
+ * pays for VByte reads on sequences whose lengths do not fit the 8-bit fields.
+ *
+ * @param[in,out] ctx   Pointer to the compression context containing hash tables
+ * and configuration.
+ * @param[in] src       Input bytes; the first ctx->dict_size bytes only seed the match tables.
+ * @param[in] src_sz    Size of @p src in bytes (dictionary prefix included).
+ * @param[out] dst      Pointer to the destination buffer where compressed data will
+ * be written.
+ * @param[in] dst_cap   Maximum capacity of the destination buffer.
+ * @param[out] out_sz   Pointer to a variable that will receive the total size
+ * of the compressed output.
+ *
+ * @return ZXC_OK on success, or a negative zxc_error_t code: ZXC_ERROR_DST_TOO_SMALL when @p dst
+ * cannot hold the block (header included), ZXC_ERROR_OVERFLOW when a length varint fails.
+ */
+static int zxc_encode_block_ghi(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap,
+                                size_t* RESTRICT const out_sz) {
+    const int level = ctx->compression_level;
+    const size_t dict_sz = ctx->dict_size;
+
+    const zxc_lz77_params_t lzp = zxc_get_lz77_params(level);
+
+    ctx->epoch++;
+    if (UNLIKELY(ctx->epoch >= ctx->max_epoch)) {
+        ZXC_MEMSET(ctx->hash_table, 0, ZXC_LZ_HASH_SIZE * sizeof(uint32_t));
+        ZXC_MEMSET(ctx->hash_tags, 0, ZXC_LZ_HASH_SIZE * sizeof(uint8_t));
+        ctx->epoch = 1;
+    }
+    const uint32_t offset_bits = ctx->offset_bits;
+    const uint32_t offset_mask = ctx->offset_mask;
+    const uint32_t epoch_mark = ctx->epoch << offset_bits;
+
+    if (dict_sz > 0)
+        zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table,
+                         epoch_mark, offset_mask, level);
+
+    const uint8_t* ip = src + dict_sz;
+    const uint8_t* iend = src + src_sz;
+    const uint8_t* anchor = ip;
+    const uint8_t* search_limit = iend - ZXC_LZ_SEARCH_MARGIN;
+
+    uint32_t* const hash_table = ctx->hash_table;
+    uint8_t* const hash_tags = ctx->hash_tags;
+    uint8_t* const buf_extras = ctx->buf_extras;
+    uint16_t* const chain_table = ctx->chain_table;
+    uint8_t* const literals = ctx->literals;
+    uint32_t* const buf_sequences = ctx->buf_sequences;
+
+    uint32_t seq_c = 0;
+    size_t extras_c = 0;
+    size_t lit_c = 0;
+    uint16_t max_offset = 0;
+
+    while (LIKELY(ip < search_limit)) {
+        size_t dist = (size_t)(ip - anchor);
+        size_t step = lzp.step_base + (dist >> lzp.step_shift);
+        if (UNLIKELY(ip + step >= search_limit)) step = 1;
+
+        ZXC_PREFETCH_READ(ip + step * 4 + ZXC_CACHE_LINE_SIZE);
+
+        if (LIKELY(ip + step + sizeof(uint64_t) <= iend)) {
+            const uint64_t v_next = zxc_le64(ip + step);
+            // cppcheck-suppress unreadVariable
+            const uint32_t h_next = zxc_hash_func(v_next, 0);
+            ZXC_PREFETCH_READ(&hash_tags[h_next]);
+            ZXC_PREFETCH_READ(&hash_table[h_next]);
+        }
+
+        const zxc_match_t m =
+            zxc_lz77_find_best_match(src, ip, iend, search_limit, anchor, hash_table, hash_tags,
+                                     chain_table, epoch_mark, offset_mask, level, lzp,
+                                     /*last_off=*/0U);
+
+        if (m.ref) {
+            ip -= m.backtrack;
+            const uint32_t ll = (uint32_t)(ip - anchor);
+            const uint32_t ml = m.len - ZXC_LZ_MIN_MATCH_LEN;
+            const uint32_t off = (uint32_t)(ip - m.ref);
+
+            if (ll > 0) {
+                if (LIKELY(anchor + ZXC_PAD_SIZE <= iend)) {
+                    zxc_copy32(literals + lit_c, anchor);
+                    if (UNLIKELY(ll > ZXC_PAD_SIZE)) {
+                        ZXC_MEMCPY(literals + lit_c + ZXC_PAD_SIZE, anchor + ZXC_PAD_SIZE,
+                                   ll - ZXC_PAD_SIZE);
+                    }
+                } else {
+                    ZXC_MEMCPY(literals + lit_c, anchor, ll);
+                }
+                lit_c += ll;
+            }
+
+            const uint32_t ll_write = (ll >= ZXC_SEQ_LL_MASK) ? 255U : ll;
+            const uint32_t ml_write = (ml >= ZXC_SEQ_ML_MASK) ? 255U : ml;
+            const uint32_t seq_val = (ll_write << (ZXC_SEQ_ML_BITS + ZXC_SEQ_OFF_BITS)) |
+                                     (ml_write << ZXC_SEQ_OFF_BITS) |
+                                     ((off - ZXC_LZ_OFFSET_BIAS) & ZXC_SEQ_OFF_MASK);
+            if ((off - ZXC_LZ_OFFSET_BIAS) > max_offset)
+                max_offset = (uint16_t)(off - ZXC_LZ_OFFSET_BIAS);
+            buf_sequences[seq_c] = seq_val;
+            seq_c++;
+
+            if (ll >= ZXC_SEQ_LL_MASK) {
+                const size_t n = zxc_write_varint(buf_extras + extras_c, ll - ZXC_SEQ_LL_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_c += n;
+            }
+            if (ml >= ZXC_SEQ_ML_MASK) {
+                const size_t n = zxc_write_varint(buf_extras + extras_c, ml - ZXC_SEQ_ML_MASK);
+                if (UNLIKELY(n == 0)) return ZXC_ERROR_OVERFLOW;
+                extras_c += n;
+            }
+
+            ip += m.len;
+            anchor = ip;
+        } else {
+            ip += step;
+        }
+    }
+
+    const size_t last_lits = iend - anchor;
+    if (last_lits > 0) {
+        ZXC_MEMCPY(literals + lit_c, anchor, last_lits);
+        lit_c += last_lits;
+    }
+
+    // Guard the subtraction below: a dst_cap smaller than the header would wrap `rem`.
+    if (UNLIKELY(dst_cap < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL;
+    zxc_block_header_t bh = {.block_type = ZXC_BLOCK_GHI};
+    uint8_t* const p = dst + ZXC_BLOCK_HEADER_SIZE;
+    size_t rem = dst_cap - ZXC_BLOCK_HEADER_SIZE;
+
+    // Decide offset encoding mode (header flag only; the records below keep 16-bit offsets)
+    const zxc_gnr_header_t gh = {.n_sequences = seq_c,
+                                 .n_literals = (uint32_t)lit_c,
+                                 .enc_lit = ZXC_SECTION_ENCODING_RAW,
+                                 .enc_litlen = 0,
+                                 .enc_mlen = 0,
+                                 .enc_off = (uint8_t)((max_offset <= 255) ? 1 : 0)};
+
+    zxc_section_desc_t desc[ZXC_GHI_SECTIONS] = {0};
+    desc[0].sizes = (uint64_t)lit_c | ((uint64_t)lit_c << 32);
+    size_t sz_seqs = seq_c * sizeof(uint32_t);
+    desc[1].sizes = (uint64_t)sz_seqs | ((uint64_t)sz_seqs << 32);
+    desc[2].sizes = (uint64_t)extras_c | ((uint64_t)extras_c << 32);
+
+    const int ghs = zxc_write_ghi_header_and_desc(p, rem, &gh, desc);
+    if (UNLIKELY(ghs < 0)) return ghs;
+
+    uint8_t* p_curr = p + ghs;
+    rem -= ghs;
+
+    // Extract stream sizes once
+    const size_t sz_lit = (size_t)(desc[0].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_seq = (size_t)(desc[1].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_ext = (size_t)(desc[2].sizes & ZXC_SECTION_SIZE_MASK);
+
+    if (UNLIKELY(rem < sz_lit + sz_seq + sz_ext)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    ZXC_MEMCPY(p_curr, literals, lit_c);
+    p_curr += lit_c;
+
+    // Write sequences in little-endian order
+#ifdef ZXC_BIG_ENDIAN
+    for (uint32_t i = 0; i < seq_c; i++) {
+        zxc_store_le32(p_curr, buf_sequences[i]);
+        p_curr += sizeof(uint32_t);
+    }
+#else
+    ZXC_MEMCPY(p_curr, buf_sequences, sz_seq);
+    p_curr += sz_seq;
+#endif
+
+    // --- WRITE EXTRAS ---
+    ZXC_MEMCPY(p_curr, buf_extras, sz_ext);
+    p_curr += sz_ext;
+
+    bh.comp_size = (uint32_t)(p_curr - (dst + ZXC_BLOCK_HEADER_SIZE));
+    const int hw = zxc_write_block_header(dst, dst_cap, &bh);
+    if (UNLIKELY(hw < 0)) return hw;
+
+    // Checksum will be appended by the wrapper
+    *out_sz = ZXC_BLOCK_HEADER_SIZE + bh.comp_size;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Emits @p src verbatim as an uncompressed (RAW) block.
+ *
+ * Writes the block header at @p dst and copies the payload right after it.
+ * No checksum is produced here; the calling wrapper appends it when enabled.
+ *
+ * @param[in]  src     Bytes to store.
+ * @param[in]  src_sz  Number of bytes in @p src.
+ * @param[out] dst     Output buffer receiving header + payload.
+ * @param[in]  dst_cap Capacity of @p dst in bytes.
+ * @param[out] out_sz  Receives the total number of bytes written
+ *                     (ZXC_BLOCK_HEADER_SIZE + @p src_sz).
+ *
+ * @return ZXC_OK on success, ZXC_ERROR_DST_TOO_SMALL when @p dst cannot hold the
+ *         header plus payload, or the negative code reported by
+ *         zxc_write_block_header().
+ */
+static int zxc_encode_block_raw(const uint8_t* RESTRICT src, const size_t src_sz,
+                                uint8_t* RESTRICT const dst, const size_t dst_cap,
+                                size_t* RESTRICT const out_sz) {
+    const size_t total_sz = ZXC_BLOCK_HEADER_SIZE + src_sz;
+    if (UNLIKELY(total_sz > dst_cap)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    // RAW header: no flags (the checksum flag lives in the file header).
+    zxc_block_header_t hdr;
+    hdr.comp_size = (uint32_t)src_sz;
+    hdr.reserved = 0;
+    hdr.block_flags = 0;
+    hdr.block_type = ZXC_BLOCK_RAW;
+
+    const int hdr_rc = zxc_write_block_header(dst, dst_cap, &hdr);
+    if (UNLIKELY(hdr_rc < 0)) return hdr_rc;
+
+    // Payload follows the header; the wrapper appends any checksum afterwards.
+    ZXC_MEMCPY(dst + ZXC_BLOCK_HEADER_SIZE, src, src_sz);
+    *out_sz = total_sz;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Encodes one chunk as a single ZXC block (the compression hot path).
+ *
+ * Uses the GHI encoder at level <= 2 and GLO otherwise. If the encoder fails or its
+ * output is not smaller than the block payload, the payload is stored as a RAW block.
+ * With @c ctx->dict_size > 0, @p chunk is [dict | block] and only the block part is
+ * measured and stored raw. The per-block checksum is appended when enabled.
+ *
+ * @param[in,out] ctx     Compression context (level, dict_size, checksum, buffers).
+ * @param[in]     chunk   Source bytes ([dict | block] when a dictionary is active).
+ * @param[in]     src_sz  Length of @p chunk in bytes (includes any dict prefix).
+ * @param[out]    dst     Destination block buffer.
+ * @param[in]     dst_cap Capacity of @p dst in bytes.
+ * @return Compressed block size in bytes on success, or a negative @ref zxc_error_t.
+ */
+// cppcheck-suppress unusedFunction
+int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT chunk,
+                               const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) {
+    const size_t prefix_sz = ctx->dict_size;
+    const size_t payload_sz = src_sz - prefix_sz;
+    size_t out_len = 0;
+
+    // Level <= 2 selects the GHI encoder, anything above uses GLO.
+    int rc = (ctx->compression_level <= 2)
+                 ? zxc_encode_block_ghi(ctx, chunk, src_sz, dst, dst_cap, &out_len)
+                 : zxc_encode_block_glo(ctx, chunk, src_sz, dst, dst_cap, &out_len);
+
+    // Fall back to a RAW block when encoding failed or did not beat the payload size
+    // (the dictionary prefix is excluded from both the comparison and the raw copy).
+    const int keep_encoded = (rc == ZXC_OK) && (out_len < payload_sz);
+    if (UNLIKELY(!keep_encoded)) {
+        rc = zxc_encode_block_raw(chunk + prefix_sz, payload_sz, dst, dst_cap, &out_len);
+        if (UNLIKELY(rc != ZXC_OK)) return rc;
+    }
+
+    if (!ctx->checksum_enabled) return (int)out_len;
+
+    // out_len covers header + payload only; the checksum is computed over the payload
+    // (bytes after the block header) and stored little-endian right behind it.
+    if (UNLIKELY(out_len < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_OVERFLOW;
+    if (UNLIKELY(out_len + ZXC_BLOCK_CHECKSUM_SIZE > dst_cap)) return ZXC_ERROR_OVERFLOW;
+
+    const uint32_t body_len = (uint32_t)(out_len - ZXC_BLOCK_HEADER_SIZE);
+    const uint32_t digest =
+        zxc_checksum(dst + ZXC_BLOCK_HEADER_SIZE, body_len, ZXC_CHECKSUM_RAPIDHASH);
+    zxc_store_le32(dst + out_len, digest);
+    out_len += ZXC_BLOCK_CHECKSUM_SIZE;
+
+    return (int)out_len;
+}
diff --git a/thirdparty/zxc/src/lib/zxc_decompress.c b/thirdparty/zxc/src/lib/zxc_decompress.c
new file mode 100644
index 000000000000..88bd9f7b5536
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_decompress.c
@@ -0,0 +1,2096 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_decompress.c
+ * @brief Block-level decompression: GLO / GHI / RAW decoding with
+ *        SIMD-accelerated bit-unpacking and overlapping copies.
+ *
+ * Like @ref zxc_compress.c, this file is compiled multiple times with
+ * @c ZXC_FUNCTION_SUFFIX to produce per-ISA variants.
+ */
+
+/*
+ * Function Multi-Versioning Support
+ * If ZXC_FUNCTION_SUFFIX is defined (e.g. _avx2, _neon), rename the chunk-wrapper
+ * entry points (plain, _dict, _safe) AND both Huffman section decoders consumed by
+ * this TU. The defines sit before zxc_internal.h so that the prototypes the header
+ * declares are also rewritten with the suffix, keeping callers and callees consistent.
+ */
+#ifdef ZXC_FUNCTION_SUFFIX
+#define ZXC_CAT_IMPL(x, y) x##y
+#define ZXC_CAT(x, y) ZXC_CAT_IMPL(x, y) /* extra level so the suffix macro expands before ## */
+#define zxc_decompress_chunk_wrapper ZXC_CAT(zxc_decompress_chunk_wrapper, ZXC_FUNCTION_SUFFIX)
+#define zxc_decompress_chunk_wrapper_dict \
+    ZXC_CAT(zxc_decompress_chunk_wrapper_dict, ZXC_FUNCTION_SUFFIX)
+#define zxc_decompress_chunk_wrapper_safe \
+    ZXC_CAT(zxc_decompress_chunk_wrapper_safe, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_decode_section ZXC_CAT(zxc_huf_decode_section, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_decode_section_dict ZXC_CAT(zxc_huf_decode_section_dict, ZXC_FUNCTION_SUFFIX)
+#endif
+
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/**
+ * @brief Decodes one Prefix Varint value and advances the stream cursor.
+ *
+ * The leading bits of the first byte select the encoded length (1-3 bytes);
+ * the remaining bits of that byte are the low bits of the value and the
+ * following bytes supply the higher bits.
+ *
+ * Format:
+ * - 1 byte  (0xxxxxxx):  7-bit payload (val < 2^7  = 128)
+ * - 2 bytes (10xxxxxx): 14-bit payload (val < 2^14 = 16384)
+ * - 3 bytes (110xxxxx): 21-bit payload (val < 2^21 = 2097152)
+ *
+ * @param[in,out] ptr Stream cursor; moved past the value, or set to @p end when the
+ *                    encoding is truncated or out-of-spec.
+ * @param[in] end One past the last readable byte (used for bounds checking).
+ * @return The decoded value, or 0 when nothing could be decoded (safe default).
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_read_varint(const uint8_t** ptr, const uint8_t* end) {
+    const uint8_t* const cur = *ptr;
+    if (UNLIKELY(cur >= end)) return 0;  // nothing to read: cursor is left untouched
+
+    const size_t avail = (size_t)(end - cur);
+    const uint32_t lead = cur[0];
+
+    // 0xxxxxxx: the byte is the value itself (7 bits).
+    if (LIKELY(lead < 0x80)) {
+        *ptr = cur + 1;
+        return lead;
+    }
+
+    // 10xxxxxx + 1 byte: 6 low bits in the lead byte, 8 high bits after it.
+    if (LIKELY(lead < 0xC0)) {
+        if (UNLIKELY(avail < 2)) {
+            *ptr = end;
+            return 0;
+        }
+        const uint32_t hi = (uint32_t)cur[1] << 6;
+        *ptr = cur + 2;
+        return (lead & 0x3F) | hi;
+    }
+
+    // 110xxxxx + 2 bytes: 5 low bits in the lead byte, 16 high bits after it.
+    // Largest legitimate form: encoded lengths stay below block_size_max (2^21).
+    if (LIKELY(lead < 0xE0)) {
+        if (UNLIKELY(avail < 3)) {
+            *ptr = end;
+            return 0;
+        }
+        const uint32_t mid = (uint32_t)cur[1] << 5;
+        const uint32_t hi = (uint32_t)cur[2] << 13;
+        *ptr = cur + 3;
+        return (lead & 0x1F) | mid | hi;
+    }
+
+    *ptr = end;  // longer prefixes are out-of-spec for the current format: reject
+    return 0;
+}
+
+/**
+ * @brief Byte-shuffle index rows for overlapping copies with small offsets (0-15).
+ *
+ * Shared between the ARM NEON and x86 (PSHUFB) paths. Row @c off holds the
+ * indices `i % off`, i.e. how to repeat @c off source bytes across 16 lanes.
+ */
+#if defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32) || defined(ZXC_USE_AVX2) || \
+    defined(ZXC_USE_AVX512)
+/**
+ * @brief Precomputed shuffle controls used by @ref zxc_decode_copy_overlap_run.
+ *
+ * This 16x16 lookup table is indexed by the back-reference distance; the
+ * selected row is fed to the table-lookup/shuffle instruction to expand the
+ * @c off bytes preceding the output cursor into a 16-byte periodic pattern.
+ *
+ * The 16-byte alignment lets the x86 path fetch a row with an aligned load
+ * (_mm_load_si128); the NEON paths read rows with vld1 loads.
+ */
+static const ZXC_ALIGN(16) uint8_t zxc_overlap_masks[16][16] = {
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},      // off=0 (unused)
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},      // off=1 (RLE handled separately)
+    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},      // off=2
+    {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},      // off=3
+    {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},      // off=4
+    {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},      // off=5
+    {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},      // off=6
+    {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},      // off=7
+    {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},      // off=8
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6},      // off=9
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5},      // off=10
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4},     // off=11
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3},    // off=12
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2},   // off=13
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1},  // off=14
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0}  // off=15
+};
+#endif
+
+/**
+ * @brief Per-offset store stride for periodic overlap runs: the largest
+ *        multiple of @c off that fits in 16 bytes, i.e. `16 - (16 % off)`
+ *        (e.g. off=3 -> 15, off=6 -> 12, off=9 -> 9).
+ * Advancing the output cursor by a multiple of @c off keeps the 16-byte
+ * pattern vector phase-aligned, so the run is emitted with pure stores of a
+ * single register. Entries 0 and 1 are placeholders (16): off == 1 takes the fill-run path.
+ */
+static const uint8_t zxc_overlap_strides[16] = {16, 16, 16, 15, 16, 15, 12, 14,
+                                                16, 9,  10, 11, 12, 13, 14, 15};
+
+/**
+ * @brief Copies an @p ml-byte LZ run whose pattern repeats with period @p off (2..15).
+ *
+ * Builds the 16-byte periodic pattern `out[i] = dst[-off + (i % off)]` once
+ * (one shuffle on the NEON / x86 PSHUFB paths, a wrap-counter byte loop otherwise), then
+ * emits 16-byte stores advancing by @ref zxc_overlap_strides so the pattern never needs
+ * re-shuffling. At least one store is always issued and the last may overshoot up to
+ * 15 bytes past @p ml; the caller must guarantee @ref ZXC_PAD_SIZE bytes of headroom.
+ *
+ * @param[out] dst Output cursor; the run source is `dst - off`.
+ * @param[in]  off Back-reference distance, in [2, 15].
+ * @param[in]  ml  Run length in bytes (>= 1).
+ */
+// codeql[cpp/unused-static-function] : False positive
+static ZXC_ALWAYS_INLINE void zxc_decode_copy_overlap_run(uint8_t* dst, const uint32_t off,
+                                                          const uint64_t ml) {
+    const size_t stride = zxc_overlap_strides[off];  // multiple of off, at most 16
+    size_t copied = 0;
+#if defined(ZXC_USE_NEON64)
+    const uint8x16_t mask = vld1q_u8(zxc_overlap_masks[off]);
+    // The load spans 16 bytes from dst - off; the mask only selects its first `off` lanes.
+    const uint8x16_t pat = vqtbl1q_u8(vld1q_u8(dst - off), mask);
+    do {
+        vst1q_u8(dst + copied, pat);
+        copied += stride;
+    } while (copied < ml);
+#elif defined(ZXC_USE_NEON32)
+    uint8x8x2_t src_tbl;
+    src_tbl.val[0] = vld1_u8(dst - off);
+    src_tbl.val[1] = vld1_u8(dst - off + 8);
+    const uint8x8_t pat_lo = vtbl2_u8(src_tbl, vld1_u8(zxc_overlap_masks[off]));
+    const uint8x8_t pat_hi = vtbl2_u8(src_tbl, vld1_u8(zxc_overlap_masks[off] + 8));
+    do {
+        vst1_u8(dst + copied, pat_lo);
+        vst1_u8(dst + copied + 8, pat_hi);
+        copied += stride;
+    } while (copied < ml);
+
+#elif defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
+    const __m128i mask = _mm_load_si128((const __m128i*)zxc_overlap_masks[off]);
+    const __m128i src_data = _mm_loadu_si128((const __m128i*)(dst - off));
+    const __m128i pat = _mm_shuffle_epi8(src_data, mask);
+    do {
+        _mm_storeu_si128((__m128i*)(dst + copied), pat);
+        copied += stride;
+    } while (copied < ml);
+
+#else
+    // SSE2-only tier and non-SIMD builds: no PSHUFB, build the pattern with a
+    // wrap counter (no per-byte modulo), then store it via zxc_copy16.
+    const uint8_t* src = dst - off;
+    uint8_t pat[16];
+    uint32_t k = 0;
+    for (size_t i = 0; i < 16; i++) {
+        pat[i] = src[k];
+        if (++k == off) k = 0;
+    }
+    do {
+        zxc_copy16(dst + copied, pat);
+        copied += stride;
+    } while (copied < ml);
+#endif
+}
+
+/**
+ * @brief Fills an @p ml-byte single-byte run (LZ offset == 1) with wild stores.
+ *
+ * Splats @p byte into a vector register and emits @ref ZXC_PAD_SIZE-byte chunks
+ * (one 32-byte or two 16-byte stores each), avoiding a libc memset call on the
+ * typically short runs of the hot path. The first chunk is stored unconditionally,
+ * so the SIMD paths may **overshoot** up to @ref ZXC_PAD_SIZE - 1 bytes past @p ml;
+ * the caller must guarantee @ref ZXC_PAD_SIZE bytes of headroom. Non-SIMD builds
+ * call @ref ZXC_MEMSET, which writes exactly @p ml bytes.
+ *
+ * @param[out] dst  Output cursor.
+ * @param[in]  byte Byte value to replicate.
+ * @param[in]  ml   Run length in bytes (>= 1).
+ */
+// codeql[cpp/unused-static-function] : False positive, used in DECODE_SEQ_SAFE/FAST macros
+static ZXC_ALWAYS_INLINE void zxc_decode_fill_run(uint8_t* dst, const uint8_t byte,
+                                                  const uint64_t ml) {
+#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
+    const __m256i v = _mm256_set1_epi8((char)byte);
+    _mm256_storeu_si256((__m256i*)dst, v);  // head chunk, stored regardless of ml
+    if (UNLIKELY(ml > ZXC_PAD_SIZE)) {
+        uint8_t* out = dst + ZXC_PAD_SIZE;
+        size_t rem = ml - ZXC_PAD_SIZE;
+        while (rem > ZXC_PAD_SIZE) {
+            _mm256_storeu_si256((__m256i*)out, v);
+            out += ZXC_PAD_SIZE;
+            rem -= ZXC_PAD_SIZE;
+        }
+        _mm256_storeu_si256((__m256i*)out, v);  // tail chunk: 1..ZXC_PAD_SIZE bytes still needed
+    }
+#elif defined(ZXC_USE_SSE2)
+    const __m128i v = _mm_set1_epi8((char)byte);
+    _mm_storeu_si128((__m128i*)dst, v);
+    _mm_storeu_si128((__m128i*)(dst + 16), v);
+    if (UNLIKELY(ml > ZXC_PAD_SIZE)) {
+        uint8_t* out = dst + ZXC_PAD_SIZE;
+        size_t rem = ml - ZXC_PAD_SIZE;
+        while (rem > ZXC_PAD_SIZE) {
+            _mm_storeu_si128((__m128i*)out, v);
+            _mm_storeu_si128((__m128i*)(out + 16), v);
+            out += ZXC_PAD_SIZE;
+            rem -= ZXC_PAD_SIZE;
+        }
+        _mm_storeu_si128((__m128i*)out, v);
+        _mm_storeu_si128((__m128i*)(out + 16), v);
+    }
+#elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
+    const uint8x16_t v = vdupq_n_u8(byte);
+    vst1q_u8(dst, v);
+    vst1q_u8(dst + 16, v);
+    if (UNLIKELY(ml > ZXC_PAD_SIZE)) {
+        uint8_t* out = dst + ZXC_PAD_SIZE;
+        size_t rem = ml - ZXC_PAD_SIZE;
+        while (rem > ZXC_PAD_SIZE) {
+            vst1q_u8(out, v);
+            vst1q_u8(out + 16, v);
+            out += ZXC_PAD_SIZE;
+            rem -= ZXC_PAD_SIZE;
+        }
+        vst1q_u8(out, v);
+        vst1q_u8(out + 16, v);
+    }
+#else
+    ZXC_MEMSET(dst, byte, ml);  // exact-length fill, no overshoot on this path
+#endif
+}
+
+/* ==========================================================================
+ * Shared decode macros for the GLO and GHI decoders (fast + safe variants).
+ * Defined at file scope to avoid four identical copies inside each function.
+ * They reference the local names l_ptr, d_ptr, written that every call site
+ * has in scope. #undef-ed at the end of the last consumer.
+ * ========================================================================== */
+
+/**
+ * @brief Wild-copies @p ll literal bytes from @p src to @p dst in 32-byte chunks.
+ *
+ * Always moves whole @ref ZXC_PAD_SIZE-byte chunks, so up to @ref ZXC_PAD_SIZE - 1 bytes
+ * beyond @p ll may be touched (the first chunk is copied even for short runs). The
+ * caller must therefore provide @ref ZXC_PAD_SIZE bytes of writable headroom at @p dst
+ * (the unrolled loops and the trailing-literal margins ensure this). Both pointers
+ * are passed by value: the caller advances its own cursors by @p ll, which keeps
+ * them in registers on the hot path.
+ *
+ * @param[out] dst Output cursor. Must not overlap @p src and must have
+ *                 @ref ZXC_PAD_SIZE bytes of overshoot headroom.
+ * @param[in]  src Literal-stream source. Must not overlap @p dst (RESTRICT).
+ * @param[in]  ll  Number of literal bytes to copy.
+ */
+static ZXC_ALWAYS_INLINE void zxc_decode_copy_literals(uint8_t* RESTRICT dst,
+                                                       const uint8_t* RESTRICT src,
+                                                       const uint64_t ll) {
+    // Head chunk: copied unconditionally, which covers every ll <= ZXC_PAD_SIZE.
+    zxc_copy32(dst, src);
+    if (LIKELY(ll <= ZXC_PAD_SIZE)) return;
+
+    // Long run: keep issuing full chunks; the final one may spill past ll.
+    size_t left = ll - ZXC_PAD_SIZE;
+    for (;;) {
+        dst += ZXC_PAD_SIZE;
+        src += ZXC_PAD_SIZE;
+        zxc_copy32(dst, src);
+        if (left <= ZXC_PAD_SIZE) break;
+        left -= ZXC_PAD_SIZE;
+    }
+}
+
+/**
+ * @brief Copies an @p ml-byte LZ match from @c d_ptr-off to @p d_ptr, handling overlap.
+ *
+ * The source @c d_ptr-off may overlap the destination (the LZ repeat case), so the
+ * copy strategy is chosen by back-reference distance (tested in this order):
+ *  - @p off >= @ref ZXC_PAD_SIZE      : 32-byte wild copies (no overlap within a chunk);
+ *  - @p off >= @ref ZXC_PAD_SIZE / 2  : 16-byte wild copies (no overlap within a chunk);
+ *  - @p off == 1                      : single-byte run via @ref zxc_decode_fill_run;
+ *  - otherwise (2..15)                : @ref zxc_decode_copy_overlap_run (periodic pattern).
+ *
+ * Like @ref zxc_decode_copy_literals it may **overshoot** up to @ref ZXC_PAD_SIZE - 1
+ * bytes past @p ml, so @p d_ptr must have @ref ZXC_PAD_SIZE bytes of headroom. @p d_ptr
+ * is taken by value; the caller advances its cursor by @p ml.
+ *
+ * @param[in,out] d_ptr Output cursor; the match source is @c d_ptr-off. Must have
+ *                      @ref ZXC_PAD_SIZE bytes of overshoot headroom.
+ * @param[in]     off   Resolved (bias-removed) back-reference distance, @c >= 1.
+ * @param[in]     ml    Match length in bytes (@c >= ZXC_LZ_MIN_MATCH_LEN).
+ */
+static ZXC_ALWAYS_INLINE void zxc_decode_copy_match(uint8_t* RESTRICT d_ptr, const uint32_t off,
+                                                    const uint64_t ml) {
+    const uint8_t* match_src = d_ptr - off;
+    if (LIKELY(off >= ZXC_PAD_SIZE)) {
+        zxc_copy32(d_ptr, match_src);  // head chunk, issued regardless of ml
+        if (UNLIKELY(ml > ZXC_PAD_SIZE)) {
+            uint8_t* out = d_ptr + ZXC_PAD_SIZE;
+            const uint8_t* ref = match_src + ZXC_PAD_SIZE;
+            size_t rem = ml - ZXC_PAD_SIZE;
+            while (rem > ZXC_PAD_SIZE) {
+                zxc_copy32(out, ref);
+                out += ZXC_PAD_SIZE;
+                ref += ZXC_PAD_SIZE;
+                rem -= ZXC_PAD_SIZE;
+            }
+            zxc_copy32(out, ref);  // tail chunk: 1..ZXC_PAD_SIZE bytes still needed
+        }
+    } else if (off >= (ZXC_PAD_SIZE / 2)) {
+        zxc_copy16(d_ptr, match_src);  // same scheme as above with half-size chunks
+        if (UNLIKELY(ml > (ZXC_PAD_SIZE / 2))) {
+            uint8_t* out = d_ptr + (ZXC_PAD_SIZE / 2);
+            const uint8_t* ref = match_src + (ZXC_PAD_SIZE / 2);
+            size_t rem = ml - (ZXC_PAD_SIZE / 2);
+            while (rem > (ZXC_PAD_SIZE / 2)) {
+                zxc_copy16(out, ref);
+                out += (ZXC_PAD_SIZE / 2);
+                ref += (ZXC_PAD_SIZE / 2);
+                rem -= (ZXC_PAD_SIZE / 2);
+            }
+            zxc_copy16(out, ref);
+        }
+    } else if (off == 1) {
+        zxc_decode_fill_run(d_ptr, match_src[0], ml);  // match_src[0] is the byte just before d_ptr
+    } else {
+        zxc_decode_copy_overlap_run(d_ptr, off, ml);
+    }
+}
+
+// SAFE version: copies literals, then returns ZXC_ERROR_BAD_OFFSET if off > written before the match
+#define DECODE_SEQ_SAFE(ll, ml, off)                              \
+    do {                                                          \
+        zxc_decode_copy_literals(d_ptr, l_ptr, ll);               \
+        l_ptr += ll;                                              \
+        d_ptr += ll;                                              \
+        written += ll;                                            \
+        if (UNLIKELY(off > written)) return ZXC_ERROR_BAD_OFFSET; \
+        zxc_decode_copy_match(d_ptr, off, ml);                    \
+        d_ptr += ml;                                              \
+        written += ml;                                            \
+    } while (0)
+
+// FAST version: no offset validation (for use after written >= 256 or 65536); leaves `written` as is
+#define DECODE_SEQ_FAST(ll, ml, off)                \
+    do {                                            \
+        zxc_decode_copy_literals(d_ptr, l_ptr, ll); \
+        l_ptr += ll;                                \
+        d_ptr += ll;                                \
+        zxc_decode_copy_match(d_ptr, off, ml);      \
+        d_ptr += ml;                                \
+    } while (0)
+
+/**
+ * @brief Unified GLO (General Low) block decoder body, shared by the fast, safe
+ *        and dictionary variants.
+ *
+ * Decodes a block in the internal GLO format; the decompressed size is derived
+ * from the Section Descriptors in the payload. @p safe and @p has_dict must be
+ * compile-time constants (0 or 1): the 4x-unrolled loops are duplicated inside
+ * @c if(safe)/else branches so each variant keeps single-assignment @c const
+ * save pointers, and after constant propagation only one branch survives per
+ * wrapper (codegen equivalent to a hand-written pair).
+ *
+ * @param[in,out] ctx          Decompression context (dict buffer, tables).
+ * @param[in]     src          Compressed block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer for decoded bytes.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @param[in]     safe         Compile-time flag: 1 = strict bounds-checked loop.
+ * @param[in]     has_dict     Compile-time flag: 1 = resolve matches against a dict prefix.
+ * @return Bytes written to @p dst on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_ALWAYS_INLINE int zxc_decode_block_glo_impl(const zxc_cctx_t* RESTRICT ctx,
+                                                       const uint8_t* RESTRICT src,
+                                                       const size_t src_size, uint8_t* RESTRICT dst,
+                                                       const size_t dst_capacity, const int safe,
+                                                       const int has_dict) {
+    zxc_gnr_header_t gh;
+
+    /* Constant 0 when !has_dict, so `written` starts at 0 and `dst - dict_size`
+     * folds to `dst` -- pre-dict codegen on the hot path. */
+    const size_t dict_size = has_dict ? ctx->dict_size : 0;
+    zxc_section_desc_t desc[ZXC_GLO_SECTIONS];
+
+    if (UNLIKELY(zxc_read_glo_header_and_desc(src, src_size, &gh, desc) != ZXC_OK))
+        return ZXC_ERROR_BAD_HEADER;
+
+    const uint8_t* p_data =
+        src + ZXC_GLO_HEADER_BINARY_SIZE + ZXC_GLO_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+    const uint8_t* p_curr = p_data;
+
+    // --- Literal Stream Setup ---
+    const uint8_t* l_ptr;
+    const uint8_t* l_end;
+    uint8_t* rle_buf = NULL;
+
+    size_t lit_stream_size = (size_t)(desc[0].sizes & ZXC_SECTION_SIZE_MASK);
+
+    if (gh.enc_lit == ZXC_SECTION_ENCODING_HUFFMAN) {
+        const size_t required_size = (size_t)(desc[0].sizes >> 32);
+        if (UNLIKELY(lit_stream_size > (size_t)(src + src_size - p_curr)))
+            return ZXC_ERROR_CORRUPT_DATA;
+        if (required_size == 0) {
+            l_ptr = p_curr;
+            l_end = p_curr;
+        } else {
+            if (UNLIKELY(required_size > dst_capacity || required_size > SIZE_MAX - ZXC_PAD_SIZE))
+                return ZXC_ERROR_DST_TOO_SMALL;
+            const size_t alloc_size = required_size + ZXC_PAD_SIZE;
+            /* lit_buffer is pre-allocated to chunk_size + ZXC_PAD_SIZE by
+             * zxc_cctx_init (mode == 0). */
+            if (UNLIKELY(ctx->lit_buffer_cap < alloc_size)) return ZXC_ERROR_CORRUPT_DATA;
+            const int rc =
+                zxc_huf_decode_section(p_curr, lit_stream_size, ctx->lit_buffer, required_size);
+            if (UNLIKELY(rc != ZXC_OK)) return rc;
+            l_ptr = ctx->lit_buffer;
+            l_end = ctx->lit_buffer + required_size;
+        }
+    } else if (gh.enc_lit == ZXC_SECTION_ENCODING_HUFFMAN_DICT) {
+        /* Shared dictionary table: no inline lengths header; the prebuilt
+         * decode table was attached to the context with the dictionary. */
+        const size_t required_size = (size_t)(desc[0].sizes >> 32);
+        if (UNLIKELY(lit_stream_size > (size_t)(src + src_size - p_curr)))
+            return ZXC_ERROR_CORRUPT_DATA;
+        if (required_size == 0) {
+            l_ptr = p_curr;
+            l_end = p_curr;
+        } else {
+            if (UNLIKELY(ctx->dict_huf_table == NULL)) return ZXC_ERROR_DICT_REQUIRED;
+            if (UNLIKELY(required_size > dst_capacity || required_size > SIZE_MAX - ZXC_PAD_SIZE))
+                return ZXC_ERROR_DST_TOO_SMALL;
+            const size_t alloc_size = required_size + ZXC_PAD_SIZE;
+            /* lit_buffer is pre-allocated to chunk_size + ZXC_PAD_SIZE by
+             * zxc_cctx_init (mode == 0). */
+            if (UNLIKELY(ctx->lit_buffer_cap < alloc_size)) return ZXC_ERROR_CORRUPT_DATA;
+            const int rc = zxc_huf_decode_section_dict(p_curr, lit_stream_size, ctx->lit_buffer,
+                                                       required_size, ctx->dict_huf_table);
+            if (UNLIKELY(rc != ZXC_OK)) return rc;
+            l_ptr = ctx->lit_buffer;
+            l_end = ctx->lit_buffer + required_size;
+        }
+    } else if (gh.enc_lit == ZXC_SECTION_ENCODING_RLE) {
+        const size_t required_size = (size_t)(desc[0].sizes >> 32);
+
+        if (required_size > 0) {
+            if (UNLIKELY(required_size > dst_capacity || required_size > SIZE_MAX - ZXC_PAD_SIZE))
+                return ZXC_ERROR_DST_TOO_SMALL;
+            const size_t alloc_size = required_size + ZXC_PAD_SIZE;
+
+            /* lit_buffer is pre-allocated to chunk_size + ZXC_PAD_SIZE by
+             * zxc_cctx_init (mode == 0).*/
+            if (UNLIKELY(ctx->lit_buffer_cap < alloc_size)) return ZXC_ERROR_CORRUPT_DATA;
+
+            rle_buf = ctx->lit_buffer;
+            if (UNLIKELY(!rle_buf || lit_stream_size > (size_t)(src + src_size - p_curr)))
+                return ZXC_ERROR_CORRUPT_DATA;
+
+            const uint8_t* r_ptr = p_curr;
+            const uint8_t* r_end = r_ptr + lit_stream_size;
+            uint8_t* w_ptr = rle_buf;
+            const uint8_t* const w_end = rle_buf + required_size;
+
+            while (r_ptr < r_end && w_ptr < w_end) {
+                uint8_t token = *r_ptr++;
+                if (LIKELY(!(token & ZXC_LIT_RLE_FLAG))) {
+                    // Raw copy (most common path): use ZXC_PAD_SIZE-byte wild copies
+                    // token is 7-bit (0-127), so len is 1-128 bytes
+                    const uint32_t len = (uint32_t)token + 1;
+                    if (UNLIKELY(w_ptr + len > w_end || r_ptr + len > r_end))
+                        return ZXC_ERROR_CORRUPT_DATA;
+
+                    // Destination has ZXC_PAD_SIZE bytes of safe overrun space.
+                    // Source may not - check before wild copy.
+                    // Fast path: source has ZXC_PAD_SIZE-byte read headroom (most common)
+                    if (LIKELY(r_ptr + ZXC_PAD_SIZE <= r_end)) {
+                        // Single 32-byte copy covers len <= ZXC_PAD_SIZE (most tokens)
+                        zxc_copy32(w_ptr, r_ptr);
+
+                        if (UNLIKELY(len > ZXC_PAD_SIZE)) {
+                            // Unroll: max len=128, so max 4 copies total
+                            // Use unconditional stores with overlap - faster than branches
+                            if (len <= 2 * ZXC_PAD_SIZE) {
+                                zxc_copy32(w_ptr + len - ZXC_PAD_SIZE, r_ptr + len - ZXC_PAD_SIZE);
+                            } else if (len <= 3 * ZXC_PAD_SIZE) {
+                                zxc_copy32(w_ptr + ZXC_PAD_SIZE, r_ptr + ZXC_PAD_SIZE);
+                                zxc_copy32(w_ptr + len - ZXC_PAD_SIZE, r_ptr + len - ZXC_PAD_SIZE);
+                            } else {
+                                zxc_copy32(w_ptr + ZXC_PAD_SIZE, r_ptr + ZXC_PAD_SIZE);
+                                zxc_copy32(w_ptr + 2 * ZXC_PAD_SIZE, r_ptr + 2 * ZXC_PAD_SIZE);
+                                zxc_copy32(w_ptr + len - ZXC_PAD_SIZE, r_ptr + len - ZXC_PAD_SIZE);
+                            }
+                        }
+                    } else {
+                        // Near end of source: safe copy (rare cold path)
+                        ZXC_MEMCPY(w_ptr, r_ptr, len);
+                    }
+
+                    w_ptr += len;
+                    r_ptr += len;
+                } else {
+                    // RLE run: fill with single byte
+                    const uint32_t len = (token & ZXC_LIT_LEN_MASK) + 4;
+                    if (UNLIKELY(w_ptr + len > w_end || r_ptr >= r_end))
+                        return ZXC_ERROR_CORRUPT_DATA;
+                    ZXC_MEMSET(w_ptr, *r_ptr++, len);
+                    w_ptr += len;
+                }
+            }
+            if (UNLIKELY(w_ptr != w_end)) return ZXC_ERROR_CORRUPT_DATA;
+            l_ptr = rle_buf;
+            l_end = rle_buf + required_size;
+        } else {
+            l_ptr = p_curr;
+            l_end = p_curr;
+        }
+    } else if (gh.enc_lit == ZXC_SECTION_ENCODING_RAW) {
+        l_ptr = p_curr;
+        l_end = p_curr + lit_stream_size;
+    } else {
+        return ZXC_ERROR_CORRUPT_DATA;
+    }
+
+    p_curr += lit_stream_size;
+
+    // --- Stream Pointers & Validation ---
+    const size_t sz_tokens = (size_t)(desc[1].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_offsets = (size_t)(desc[2].sizes & ZXC_SECTION_SIZE_MASK);
+    const size_t sz_extras = (size_t)(desc[3].sizes & ZXC_SECTION_SIZE_MASK);
+
+    // Validate stream sizes match sequence count (early rejection of malformed data)
+    const uint64_t expected_off_size =
+        (gh.enc_off == 1) ? (uint64_t)gh.n_sequences : (uint64_t)gh.n_sequences * 2;
+
+    const uint8_t* t_ptr = p_curr;
+    const uint8_t* o_ptr = t_ptr + sz_tokens;
+    const uint8_t* e_ptr = o_ptr + sz_offsets;
+    const uint8_t* const e_end = e_ptr + sz_extras;  // For vbyte overflow detection
+
+    // Validate streams don't overflow source buffer +
+    // Validate stream sizes match sequence count (early rejection of malformed data)
+    if (UNLIKELY((e_end != src + src_size) || sz_tokens < gh.n_sequences ||
+                 (uint64_t)sz_offsets < expected_off_size))
+        return ZXC_ERROR_CORRUPT_DATA;
+
+    uint8_t* d_ptr = dst;
+    const uint8_t* const d_end = dst + dst_capacity;
+    // Destination safe margin for 4x loop: max output without varint extension.
+    // ll_max = 14, ml_max = 14 + 5 = 19, per-seq = 33, 4x = 132.
+    // Add ZXC_PAD_SIZE (32) for the wild zxc_copy32 overshoot + 4 safety = 168.
+    const uint8_t* const d_end_safe = d_end - (132 + ZXC_PAD_SIZE + 4);
+
+    // Literal stream safe threshold for 4x-unrolled loops.
+    // Without varint extension, max ll per sequence = ZXC_TOKEN_LL_MASK - 1 = 14.
+    // For 4 sequences: 4 * 14 = 56. With this margin, l_ptr checks are only needed
+    // on the cold varint path, keeping the hot path free of l_ptr overhead.
+    const size_t glo_sz_lit = (size_t)(l_end - l_ptr);
+    const size_t glo_margin_4x = 4 * (ZXC_TOKEN_LL_MASK - 1);  // 56
+    const size_t glo_margin_1x = ZXC_TOKEN_LL_MASK - 1;        // 14
+    const uint8_t* const l_end_safe_4x =
+        (glo_sz_lit > glo_margin_4x) ? l_end - glo_margin_4x : l_ptr;
+    const uint8_t* const l_end_safe_1x =
+        (glo_sz_lit > glo_margin_1x) ? l_end - glo_margin_1x : l_ptr;
+
+    uint32_t n_seq = gh.n_sequences;
+
+    // Track bytes written for offset validation
+    // For 1-byte offsets (enc_off==1): validate until 256 bytes written (max 8-bit offset)
+    // For 2-byte offsets (enc_off==0): validate until 65536 bytes written (max 16-bit offset)
+    // After threshold, all offsets are guaranteed valid (can't exceed written bytes)
+    // When a dictionary is active, dict_size bytes are logically "already written"
+    // (prepended by the caller), so the SAFE loop may be skipped entirely.
+    size_t written = dict_size;
+
+    // --- SAFE Loop: offset validation until threshold (4x unroll) ---
+    // For 1-byte offsets: bounds check until 256 bytes written
+    // For 2-byte offsets: bounds check until 65536 bytes written
+    const size_t bounds_threshold = (gh.enc_off == 1) ? (1U << 8) : (1U << 16);
+
+    if (safe) {
+        /* SAFE variant: save per-batch state so overflow can rollback. */
+        while (n_seq >= 4 && d_ptr < d_end_safe && l_ptr < l_end_safe_4x &&
+               written < bounds_threshold) {
+            const uint8_t* const t_save = t_ptr;
+            const uint8_t* const o_save = o_ptr;
+            const uint8_t* const e_save = e_ptr;
+            uint8_t* const d_save = d_ptr;
+            const uint8_t* const l_save = l_ptr;
+            const size_t w_save = written;
+            uint32_t tokens = zxc_le32(t_ptr);
+            t_ptr += sizeof(uint32_t);
+
+            uint32_t off1 = ZXC_LZ_OFFSET_BIAS, off2 = ZXC_LZ_OFFSET_BIAS,
+                     off3 = ZXC_LZ_OFFSET_BIAS, off4 = ZXC_LZ_OFFSET_BIAS;
+            if (gh.enc_off == 1) {
+                uint32_t offsets = zxc_le32(o_ptr);
+                o_ptr += sizeof(uint32_t);
+                off1 += offsets & 0xFF;
+                off2 += (offsets >> 8) & 0xFF;
+                off3 += (offsets >> 16) & 0xFF;
+                off4 += (offsets >> 24) & 0xFF;
+            } else {
+                uint64_t offsets = zxc_le64(o_ptr);
+                o_ptr += sizeof(uint64_t);
+                off1 += (uint32_t)(offsets & 0xFFFF);
+                off2 += (uint32_t)((offsets >> 16) & 0xFFFF);
+                off3 += (uint32_t)((offsets >> 32) & 0xFFFF);
+                off4 += (uint32_t)((offsets >> 48) & 0xFFFF);
+            }
+
+            uint64_t ll1 = (tokens & 0x0F0) >> 4;
+            uint64_t ml1 = (tokens & 0x00F);
+            if (UNLIKELY(ll1 == ZXC_TOKEN_LL_MASK)) {
+                ll1 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve =
+                    ((tokens >> 12) & 0xF) + ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            if (UNLIKELY(ml1 == ZXC_TOKEN_ML_MASK)) {
+                ml1 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll1 + ml1 + ZXC_LZ_MIN_MATCH_LEN +
+                                 3U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            ml1 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll1, ml1, off1);
+
+            uint64_t ll2 = (tokens & 0x0F000) >> 12;
+            uint64_t ml2 = (tokens & 0x00F00) >> 8;
+            if (UNLIKELY(ll2 == ZXC_TOKEN_LL_MASK)) {
+                ll2 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            if (UNLIKELY(ml2 == ZXC_TOKEN_ML_MASK)) {
+                ml2 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll2 + ml2 + ZXC_LZ_MIN_MATCH_LEN +
+                                 2U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            ml2 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll2, ml2, off2);
+
+            uint64_t ll3 = (tokens & 0x0F00000) >> 20;
+            uint64_t ml3 = (tokens & 0x00F0000) >> 16;
+            if (UNLIKELY(ll3 == ZXC_TOKEN_LL_MASK)) {
+                ll3 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = (tokens >> 28);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            if (UNLIKELY(ml3 == ZXC_TOKEN_ML_MASK)) {
+                ml3 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_LZ_MIN_MATCH_LEN + ZXC_GLO_MAX_INLINE_OUT_PER_SEQ +
+                                 ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            ml3 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll3, ml3, off3);
+
+            uint64_t ll4 = (tokens >> 28);
+            uint64_t ml4 = (tokens >> 24) & 0x0F;
+            if (UNLIKELY(ll4 == ZXC_TOKEN_LL_MASK)) {
+                ll4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            if (UNLIKELY(ml4 == ZXC_TOKEN_ML_MASK)) {
+                ml4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_LZ_MIN_MATCH_LEN + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            ml4 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll4, ml4, off4);
+
+            n_seq -= 4;
+            continue;
+
+        rollback_safe_4x:
+            t_ptr = t_save;
+            o_ptr = o_save;
+            e_ptr = e_save;
+            d_ptr = d_save;
+            l_ptr = l_save;
+            written = w_save;
+            break;
+        }
+    } else {
+        while (n_seq >= 4 && d_ptr < d_end_safe && l_ptr < l_end_safe_4x &&
+               written < bounds_threshold) {
+            uint32_t tokens = zxc_le32(t_ptr);
+            t_ptr += sizeof(uint32_t);
+
+            uint32_t off1 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off2 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off3 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off4 = ZXC_LZ_OFFSET_BIAS;
+
+            if (gh.enc_off == 1) {
+                // Read 4 x 1-byte offsets
+                uint32_t offsets = zxc_le32(o_ptr);
+                o_ptr += sizeof(uint32_t);
+                off1 += offsets & 0xFF;
+                off2 += (offsets >> 8) & 0xFF;
+                off3 += (offsets >> 16) & 0xFF;
+                off4 += (offsets >> 24) & 0xFF;
+            } else {
+                // Read 4 x 2-byte offsets
+                uint64_t offsets = zxc_le64(o_ptr);
+                o_ptr += sizeof(uint64_t);
+                off1 += (uint32_t)(offsets & 0xFFFF);
+                off2 += (uint32_t)((offsets >> 16) & 0xFFFF);
+                off3 += (uint32_t)((offsets >> 32) & 0xFFFF);
+                off4 += (uint32_t)((offsets >> 48) & 0xFFFF);
+            }
+
+            uint64_t ll1 = (tokens & 0x0F0) >> 4;
+            uint64_t ml1 = (tokens & 0x00F);
+            if (UNLIKELY(ll1 == ZXC_TOKEN_LL_MASK)) {
+                ll1 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve =
+                    ((tokens >> 12) & 0xF) + ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml1 == ZXC_TOKEN_ML_MASK)) {
+                ml1 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll1 + ml1 + ZXC_LZ_MIN_MATCH_LEN +
+                                 3U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml1 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll1, ml1, off1);
+
+            uint64_t ll2 = (tokens & 0x0F000) >> 12;
+            uint64_t ml2 = (tokens & 0x00F00) >> 8;
+            if (UNLIKELY(ll2 == ZXC_TOKEN_LL_MASK)) {
+                ll2 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml2 == ZXC_TOKEN_ML_MASK)) {
+                ml2 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll2 + ml2 + ZXC_LZ_MIN_MATCH_LEN +
+                                 2U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml2 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll2, ml2, off2);
+
+            uint64_t ll3 = (tokens & 0x0F00000) >> 20;
+            uint64_t ml3 = (tokens & 0x00F0000) >> 16;
+            if (UNLIKELY(ll3 == ZXC_TOKEN_LL_MASK)) {
+                ll3 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = (tokens >> 28);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml3 == ZXC_TOKEN_ML_MASK)) {
+                ml3 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_LZ_MIN_MATCH_LEN + ZXC_GLO_MAX_INLINE_OUT_PER_SEQ +
+                                 ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml3 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll3, ml3, off3);
+
+            uint64_t ll4 = (tokens >> 28);
+            uint64_t ml4 = (tokens >> 24) & 0x0F;
+            if (UNLIKELY(ll4 == ZXC_TOKEN_LL_MASK)) {
+                ll4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml4 == ZXC_TOKEN_ML_MASK)) {
+                ml4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_LZ_MIN_MATCH_LEN + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml4 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_SAFE(ll4, ml4, off4);
+
+            n_seq -= 4;
+        }
+    }
+
+    // --- FAST Loop: After threshold, no offset validation needed (4x unroll) ---
+    if (safe) {
+        while (n_seq >= 4 && d_ptr < d_end_safe && l_ptr < l_end_safe_4x) {
+            const uint8_t* const t_save = t_ptr;
+            const uint8_t* const o_save = o_ptr;
+            const uint8_t* const e_save = e_ptr;
+            uint8_t* const d_save = d_ptr;
+            const uint8_t* const l_save = l_ptr;
+            uint32_t tokens = zxc_le32(t_ptr);
+            t_ptr += sizeof(uint32_t);
+
+            uint32_t off1 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off2 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off3 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off4 = ZXC_LZ_OFFSET_BIAS;
+
+            if (gh.enc_off == 1) {
+                uint32_t offsets = zxc_le32(o_ptr);
+                o_ptr += sizeof(uint32_t);
+                off1 += offsets & 0xFF;
+                off2 += (offsets >> 8) & 0xFF;
+                off3 += (offsets >> 16) & 0xFF;
+                off4 += (offsets >> 24) & 0xFF;
+            } else {
+                uint64_t offsets = zxc_le64(o_ptr);
+                o_ptr += sizeof(uint64_t);
+                off1 += (uint32_t)(offsets & 0xFFFF);
+                off2 += (uint32_t)((offsets >> 16) & 0xFFFF);
+                off3 += (uint32_t)((offsets >> 32) & 0xFFFF);
+                off4 += (uint32_t)((offsets >> 48) & 0xFFFF);
+            }
+
+            uint64_t ll1 = (tokens & 0x0F0) >> 4;
+            uint64_t ml1 = (tokens & 0x00F);
+            if (UNLIKELY(ll1 == ZXC_TOKEN_LL_MASK)) {
+                ll1 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve =
+                    ((tokens >> 12) & 0xF) + ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            if (UNLIKELY(ml1 == ZXC_TOKEN_ML_MASK)) {
+                ml1 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll1 + ml1 + ZXC_LZ_MIN_MATCH_LEN +
+                                 3U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            ml1 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll1, ml1, off1);
+
+            uint64_t ll2 = (tokens & 0x0F000) >> 12;
+            uint64_t ml2 = (tokens & 0x00F00) >> 8;
+            if (UNLIKELY(ll2 == ZXC_TOKEN_LL_MASK)) {
+                ll2 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            if (UNLIKELY(ml2 == ZXC_TOKEN_ML_MASK)) {
+                ml2 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll2 + ml2 + ZXC_LZ_MIN_MATCH_LEN +
+                                 2U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            ml2 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll2, ml2, off2);
+
+            uint64_t ll3 = (tokens & 0x0F00000) >> 20;
+            uint64_t ml3 = (tokens & 0x00F0000) >> 16;
+            if (UNLIKELY(ll3 == ZXC_TOKEN_LL_MASK)) {
+                ll3 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = (tokens >> 28);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            if (UNLIKELY(ml3 == ZXC_TOKEN_ML_MASK)) {
+                ml3 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_LZ_MIN_MATCH_LEN + ZXC_GLO_MAX_INLINE_OUT_PER_SEQ +
+                                 ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            ml3 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll3, ml3, off3);
+
+            uint64_t ll4 = (tokens >> 28);
+            uint64_t ml4 = (tokens >> 24) & 0x0F;
+            if (UNLIKELY(ll4 == ZXC_TOKEN_LL_MASK)) {
+                ll4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            if (UNLIKELY(ml4 == ZXC_TOKEN_ML_MASK)) {
+                ml4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_LZ_MIN_MATCH_LEN + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            ml4 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll4, ml4, off4);
+
+            n_seq -= 4;
+            continue;
+
+        rollback_fast_4x:
+            t_ptr = t_save;
+            o_ptr = o_save;
+            e_ptr = e_save;
+            d_ptr = d_save;
+            l_ptr = l_save;
+            break;
+        }
+    } else {
+        while (n_seq >= 4 && d_ptr < d_end_safe && l_ptr < l_end_safe_4x) {
+            uint32_t tokens = zxc_le32(t_ptr);
+            t_ptr += sizeof(uint32_t);
+
+            uint32_t off1 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off2 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off3 = ZXC_LZ_OFFSET_BIAS;
+            uint32_t off4 = ZXC_LZ_OFFSET_BIAS;
+            if (gh.enc_off == 1) {
+                // Read 4 x 1-byte offsets
+                uint32_t offsets = zxc_le32(o_ptr);
+                o_ptr += sizeof(uint32_t);
+                off1 += offsets & 0xFF;
+                off2 += (offsets >> 8) & 0xFF;
+                off3 += (offsets >> 16) & 0xFF;
+                off4 += (offsets >> 24) & 0xFF;
+            } else {
+                // Read 4 x 2-byte offsets
+                uint64_t offsets = zxc_le64(o_ptr);
+                o_ptr += sizeof(uint64_t);
+                off1 += (uint32_t)(offsets & 0xFFFF);
+                off2 += (uint32_t)((offsets >> 16) & 0xFFFF);
+                off3 += (uint32_t)((offsets >> 32) & 0xFFFF);
+                off4 += (uint32_t)((offsets >> 48) & 0xFFFF);
+            }
+
+            uint64_t ll1 = (tokens & 0x0F0) >> 4;
+            uint64_t ml1 = (tokens & 0x00F);
+            if (UNLIKELY(ll1 == ZXC_TOKEN_LL_MASK)) {
+                ll1 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve =
+                    ((tokens >> 12) & 0xF) + ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml1 == ZXC_TOKEN_ML_MASK)) {
+                ml1 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll1 + ml1 + ZXC_LZ_MIN_MATCH_LEN +
+                                 3U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml1 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll1, ml1, off1);
+
+            uint64_t ll2 = (tokens & 0x0F000) >> 12;
+            uint64_t ml2 = (tokens & 0x00F00) >> 8;
+            if (UNLIKELY(ll2 == ZXC_TOKEN_LL_MASK)) {
+                ll2 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = ((tokens >> 20) & 0xF) + (tokens >> 28);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml2 == ZXC_TOKEN_ML_MASK)) {
+                ml2 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll2 + ml2 + ZXC_LZ_MIN_MATCH_LEN +
+                                 2U * ZXC_GLO_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml2 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll2, ml2, off2);
+
+            uint64_t ll3 = (tokens & 0x0F00000) >> 20;
+            uint64_t ml3 = (tokens & 0x00F0000) >> 16;
+            if (UNLIKELY(ll3 == ZXC_TOKEN_LL_MASK)) {
+                ll3 += zxc_read_varint(&e_ptr, e_end);
+                const uint64_t reserve = (tokens >> 28);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml3 == ZXC_TOKEN_ML_MASK)) {
+                ml3 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_LZ_MIN_MATCH_LEN + ZXC_GLO_MAX_INLINE_OUT_PER_SEQ +
+                                 ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml3 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll3, ml3, off3);
+
+            uint64_t ll4 = (tokens >> 28);
+            uint64_t ml4 = (tokens >> 24) & 0x0F;
+            if (UNLIKELY(ll4 == ZXC_TOKEN_LL_MASK)) {
+                ll4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            if (UNLIKELY(ml4 == ZXC_TOKEN_ML_MASK)) {
+                ml4 += zxc_read_varint(&e_ptr, e_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_LZ_MIN_MATCH_LEN + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            ml4 += ZXC_LZ_MIN_MATCH_LEN;
+            DECODE_SEQ_FAST(ll4, ml4, off4);
+
+            n_seq -= 4;
+        }
+    }
+
+    // Validate vbyte reads didn't overflow
+    if (UNLIKELY(e_ptr > e_end)) return ZXC_ERROR_CORRUPT_DATA;
+
+    // --- Remaining 1 sequence (Fast Path) ---
+    while (n_seq > 0 && d_ptr < d_end_safe && l_ptr < l_end_safe_1x) {
+        // Save pointers before reading (in case we need to fall back to Safe Path)
+        const uint8_t* t_save = t_ptr;
+        const uint8_t* o_save = o_ptr;
+        const uint8_t* e_save = e_ptr;
+
+        uint8_t token = *t_ptr++;
+        uint64_t ll = token >> ZXC_TOKEN_LIT_BITS;
+        uint64_t ml = token & ZXC_TOKEN_ML_MASK;
+        uint32_t offset = ZXC_LZ_OFFSET_BIAS;
+        if (gh.enc_off == 1) {
+            offset += *o_ptr++;  // 1-byte offset (biased)
+        } else {
+            offset += zxc_le16(o_ptr);  // 2-byte offset (biased)
+            o_ptr += sizeof(uint16_t);
+        }
+
+        if (UNLIKELY(ll == ZXC_TOKEN_LL_MASK)) {
+            ll += zxc_read_varint(&e_ptr, e_end);
+            if (UNLIKELY(l_ptr + ll > l_end)) {
+                t_ptr = t_save;
+                o_ptr = o_save;
+                e_ptr = e_save;
+                break;
+            }
+        }
+        if (UNLIKELY(ml == ZXC_TOKEN_ML_MASK)) ml += zxc_read_varint(&e_ptr, e_end);
+        ml += ZXC_LZ_MIN_MATCH_LEN;
+
+        // Check bounds before wild copies - if too close to end, fall back to Safe Path
+        if (UNLIKELY(ll + ml + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr))) {
+            // Restore pointers and let Safe Path handle this sequence
+            t_ptr = t_save;
+            o_ptr = o_save;
+            e_ptr = e_save;
+            break;
+        }
+
+        {
+            const uint8_t* src_lit = l_ptr;
+            uint8_t* dst_lit = d_ptr;
+            zxc_copy32(dst_lit, src_lit);
+            if (UNLIKELY(ll > ZXC_PAD_SIZE)) {
+                dst_lit += ZXC_PAD_SIZE;
+                src_lit += ZXC_PAD_SIZE;
+                size_t rem = ll - ZXC_PAD_SIZE;
+                while (rem > ZXC_PAD_SIZE) {
+                    zxc_copy32(dst_lit, src_lit);
+                    dst_lit += ZXC_PAD_SIZE;
+                    src_lit += ZXC_PAD_SIZE;
+                    rem -= ZXC_PAD_SIZE;
+                }
+                zxc_copy32(dst_lit, src_lit);
+            }
+            l_ptr += ll;
+            d_ptr += ll;
+            written += ll;
+        }
+
+        {
+            // Skip check if written >= bounds_threshold (256 for 8-bit, 65536 for 16-bit)
+            if (UNLIKELY(written < bounds_threshold && offset > written))
+                return ZXC_ERROR_BAD_OFFSET;
+
+            /* The loop entry check guarantees ll + ml + ZXC_PAD_SIZE bytes of
+             * headroom, so the wild-copy ladder (incl. overlap/fill runs) is safe. */
+            zxc_decode_copy_match(d_ptr, offset, ml);
+            d_ptr += ml;
+            written += ml;
+        }
+        n_seq--;
+    }
+
+    // --- Safe Path for Remaining Sequences ---
+    while (n_seq > 0) {
+        uint8_t token = *t_ptr++;
+        uint64_t ll = token >> ZXC_TOKEN_LIT_BITS;
+        uint64_t ml = token & ZXC_TOKEN_ML_MASK;
+        uint32_t offset = ZXC_LZ_OFFSET_BIAS;
+        if (gh.enc_off == 1) {
+            offset += *o_ptr++;  // 1-byte offset (biased)
+        } else {
+            offset += zxc_le16(o_ptr);  // 2-byte offset (biased)
+            o_ptr += sizeof(uint16_t);
+        }
+
+        if (UNLIKELY(ll == ZXC_TOKEN_LL_MASK)) ll += zxc_read_varint(&e_ptr, e_end);
+        if (UNLIKELY(ml == ZXC_TOKEN_ML_MASK)) ml += zxc_read_varint(&e_ptr, e_end);
+        ml += ZXC_LZ_MIN_MATCH_LEN;
+
+        if (UNLIKELY(ll + ml > (size_t)(d_end - d_ptr) || l_ptr + ll > l_end))
+            return ZXC_ERROR_OVERFLOW;
+        ZXC_MEMCPY(d_ptr, l_ptr, ll);
+        l_ptr += ll;
+        d_ptr += ll;
+
+        const uint8_t* match_src = d_ptr - offset;
+        if (UNLIKELY(match_src < dst - dict_size)) return ZXC_ERROR_BAD_OFFSET;
+
+        if (offset < ml) {
+            for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i];
+        } else {
+            ZXC_MEMCPY(d_ptr, match_src, ml);
+        }
+        d_ptr += ml;
+        n_seq--;
+    }
+
+    // --- Trailing Literals ---
+    // Copy remaining literals from source stream (literal exhaustion)
+    if (UNLIKELY(l_ptr > l_end)) return ZXC_ERROR_CORRUPT_DATA;
+    const size_t remaining_literals = (size_t)(l_end - l_ptr);
+    if (remaining_literals > 0) {
+        if (UNLIKELY(d_ptr + remaining_literals > d_end)) return ZXC_ERROR_OVERFLOW;
+        ZXC_MEMCPY(d_ptr, l_ptr, remaining_literals);
+        d_ptr += remaining_literals;
+    }
+
+    return (int)(d_ptr - dst);
+}
+
+/**
+ * @brief Shared body of the GHI (General High) block decoder; the fast, safe
+ *        and dictionary entry points are thin wrappers around it.
+ *
+ * Parses the GHI header and its section descriptors, then replays the packed
+ * 32-bit sequences (literal length, match length, 16-bit offset) into @p dst
+ * and finally flushes whatever literals are left. @p safe and @p has_dict are
+ * expected to be compile-time constants (0 or 1) so that only one copy of each
+ * duplicated 4x-unrolled loop survives constant propagation. Offsets are
+ * validated until the output history covers every offset a sequence can encode.
+ *
+ * @param[in,out] ctx          Decompression context; only dict_size is read here.
+ * @param[in]     src          Compressed block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer for decoded bytes.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @param[in]     safe         1 = 4x loops roll back to the 1x/exact paths; 0 = they fail.
+ * @param[in]     has_dict     1 = ctx->dict_size bytes count as history ahead of @p dst.
+ * @return Bytes written to @p dst on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_ALWAYS_INLINE int zxc_decode_block_ghi_impl(const zxc_cctx_t* RESTRICT ctx,
+                                                       const uint8_t* RESTRICT src,
+                                                       const size_t src_size, uint8_t* RESTRICT dst,
+                                                       const size_t dst_capacity, const int safe,
+                                                       const int has_dict) {
+    zxc_gnr_header_t gh;
+
+    /* Forced to 0 without a dictionary so every dict_size term folds away. */
+    const size_t dict_size = has_dict ? ctx->dict_size : 0;
+    zxc_section_desc_t desc[ZXC_GHI_SECTIONS];
+
+    if (UNLIKELY(zxc_read_ghi_header_and_desc(src, src_size, &gh, desc) != ZXC_OK))
+        return ZXC_ERROR_BAD_HEADER;
+
+    const uint8_t* p_curr =
+        src + ZXC_GHI_HEADER_BINARY_SIZE + ZXC_GHI_SECTIONS * ZXC_SECTION_DESC_BINARY_SIZE;
+
+    // --- Carve the payload into literal / sequence / extras streams ---
+    const size_t sz_lit = (uint32_t)desc[0].sizes;
+    const size_t sz_seqs = (uint32_t)desc[1].sizes;
+    const size_t sz_exts = (uint32_t)desc[2].sizes;
+    const uint8_t* l_ptr = p_curr;
+    const uint8_t* l_end = l_ptr + sz_lit;
+    p_curr += sz_lit;
+
+    const uint8_t* seq_ptr = p_curr;
+    const uint8_t* extras_ptr = p_curr + sz_seqs;
+    const uint8_t* const extras_end = extras_ptr + sz_exts;
+
+    // The three sections must tile the payload exactly, and the sequence
+    // stream must hold at least 4 bytes per declared sequence.
+    if (UNLIKELY((extras_end != src + src_size) ||
+                 ((uint64_t)sz_seqs < (uint64_t)gh.n_sequences * 4)))
+        return ZXC_ERROR_CORRUPT_DATA;
+
+    uint8_t* d_ptr = dst;
+    const uint8_t* const d_end = dst + dst_capacity;
+    const uint8_t* const d_end_safe = d_end - (ZXC_PAD_SIZE * 4);  // 128
+    // Headroom for one 4x batch without per-sequence checks: 4 sequences of at most
+    // (255 LL + 255 + 5 ML) bytes plus one 32-byte pad = 4 x (255 + 255 + 5) + 32 = 2092
+    const uint8_t* const d_end_fast = d_end - ZXC_DECOMPRESS_TAIL_PAD;  // 2112
+
+    // Literal-stream thresholds for the GHI loops.
+    // A length that is not varint-extended is at most ZXC_SEQ_LL_MASK - 1 = 254, so four
+    // sequences consume at most 4 * 254 = 1016 literal bytes. Staying that far from l_end
+    // means l_ptr only needs checking on the cold varint path.
+    const size_t ghi_margin_4x = 4 * (ZXC_SEQ_LL_MASK - 1);  // 1016
+    const size_t ghi_margin_1x = ZXC_SEQ_LL_MASK - 1;        // 254
+    const uint8_t* const l_end_safe_4x = (sz_lit > ghi_margin_4x) ? l_end - ghi_margin_4x : l_ptr;
+    const uint8_t* const l_end_safe_1x = (sz_lit > ghi_margin_1x) ? l_end - ghi_margin_1x : l_ptr;
+
+    uint32_t n_seq = gh.n_sequences;
+
+    // `written` counts the history bytes available behind d_ptr and drives offset validation.
+    // GHI sequences always carry a 16-bit offset field, independent of gh.enc_off, so an
+    // offset can be as large as 0xFFFF + ZXC_LZ_OFFSET_BIAS.
+    // Once that many bytes exist, no decodable offset can reach before the history start.
+    // When a dictionary is active, dict_size bytes are logically "already written"
+    // (prepended by the caller), so the validating loops may be skipped entirely.
+    size_t written = dict_size;
+
+    // --- SAFE Loop: offset validation until threshold (4x unroll) ---
+    // The threshold is the largest offset the 16-bit field can express. It must not be
+    // lowered based on the header's enc_off flag: that flag comes from the (untrusted)
+    // stream, while the offsets below are always read at full 16-bit width.
+    const size_t bounds_threshold = (size_t)0xFFFFu + ZXC_LZ_OFFSET_BIAS;
+
+    if (safe) {
+        /* SAFE variant: snapshot the per-batch state so an overflow can roll back and
+         * hand the batch to the 1x loop / Safe Path. Bytes already written past the
+         * snapshot are simply overwritten when those loops replay the sequences. */
+        while (n_seq >= 4 && d_ptr < d_end_fast && l_ptr < l_end_safe_4x &&
+               written < bounds_threshold) {
+            const uint8_t* const t_save = seq_ptr;
+            const uint8_t* const e_save = extras_ptr;
+            uint8_t* const d_save = d_ptr;
+            const uint8_t* const l_save = l_ptr;
+            const size_t w_save = written;
+            uint32_t s1 = zxc_le32(seq_ptr);
+            uint32_t s2 = zxc_le32(seq_ptr + sizeof(uint32_t));
+            uint32_t s3 = zxc_le32(seq_ptr + 2 * sizeof(uint32_t));
+            uint32_t s4 = zxc_le32(seq_ptr + 3 * sizeof(uint32_t));
+            seq_ptr += 4 * sizeof(uint32_t);
+
+            uint64_t ll1 = s1 >> 24;
+            if (UNLIKELY(ll1 == ZXC_SEQ_LL_MASK)) {
+                ll1 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s2 >> 24) + (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t m1b = (s1 >> 16) & 0xFF;
+            uint64_t ml1 = m1b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m1b == ZXC_SEQ_ML_MASK)) {
+                ml1 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll1 + ml1 + 3U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t off1 = (s1 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll1, ml1, off1);
+
+            uint64_t ll2 = s2 >> 24;
+            if (UNLIKELY(ll2 == ZXC_SEQ_LL_MASK)) {
+                ll2 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t m2b = (s2 >> 16) & 0xFF;
+            uint64_t ml2 = m2b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m2b == ZXC_SEQ_ML_MASK)) {
+                ml2 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll2 + ml2 + 2U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t off2 = (s2 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll2, ml2, off2);
+
+            uint64_t ll3 = s3 >> 24;
+            if (UNLIKELY(ll3 == ZXC_SEQ_LL_MASK)) {
+                ll3 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s4 >> 24);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t m3b = (s3 >> 16) & 0xFF;
+            uint64_t ml3 = m3b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m3b == ZXC_SEQ_ML_MASK)) {
+                ml3 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t off3 = (s3 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll3, ml3, off3);
+
+            uint64_t ll4 = s4 >> 24;
+            if (UNLIKELY(ll4 == ZXC_SEQ_LL_MASK)) {
+                ll4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t m4b = (s4 >> 16) & 0xFF;
+            uint64_t ml4 = m4b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m4b == ZXC_SEQ_ML_MASK)) {
+                ml4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_safe_4x;
+            }
+            uint32_t off4 = (s4 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll4, ml4, off4);
+
+            n_seq -= 4;
+            continue;
+
+        rollback_safe_4x:
+            seq_ptr = t_save;
+            extras_ptr = e_save;
+            d_ptr = d_save;
+            l_ptr = l_save;
+            written = w_save;
+            break;
+        }
+    } else {
+        while (n_seq >= 4 && d_ptr < d_end_fast && l_ptr < l_end_safe_4x &&
+               written < bounds_threshold) {
+            uint32_t s1 = zxc_le32(seq_ptr);
+            uint32_t s2 = zxc_le32(seq_ptr + sizeof(uint32_t));
+            uint32_t s3 = zxc_le32(seq_ptr + 2 * sizeof(uint32_t));
+            uint32_t s4 = zxc_le32(seq_ptr + 3 * sizeof(uint32_t));
+            seq_ptr += 4 * sizeof(uint32_t);
+
+            uint64_t ll1 = s1 >> 24;
+            if (UNLIKELY(ll1 == ZXC_SEQ_LL_MASK)) {
+                ll1 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s2 >> 24) + (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m1b = (s1 >> 16) & 0xFF;
+            uint64_t ml1 = m1b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m1b == ZXC_SEQ_ML_MASK)) {
+                ml1 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll1 + ml1 + 3U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off1 = (s1 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll1, ml1, off1);
+
+            uint64_t ll2 = s2 >> 24;
+            if (UNLIKELY(ll2 == ZXC_SEQ_LL_MASK)) {
+                ll2 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m2b = (s2 >> 16) & 0xFF;
+            uint64_t ml2 = m2b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m2b == ZXC_SEQ_ML_MASK)) {
+                ml2 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll2 + ml2 + 2U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off2 = (s2 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll2, ml2, off2);
+
+            uint64_t ll3 = s3 >> 24;
+            if (UNLIKELY(ll3 == ZXC_SEQ_LL_MASK)) {
+                ll3 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s4 >> 24);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m3b = (s3 >> 16) & 0xFF;
+            uint64_t ml3 = m3b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m3b == ZXC_SEQ_ML_MASK)) {
+                ml3 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off3 = (s3 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll3, ml3, off3);
+
+            uint64_t ll4 = s4 >> 24;
+            if (UNLIKELY(ll4 == ZXC_SEQ_LL_MASK)) {
+                ll4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m4b = (s4 >> 16) & 0xFF;
+            uint64_t ml4 = m4b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m4b == ZXC_SEQ_ML_MASK)) {
+                ml4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off4 = (s4 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_SAFE(ll4, ml4, off4);
+
+            n_seq -= 4;
+        }
+    }
+
+    // --- SAFE Loop tail (1x): one sequence at a time, still validating offsets ---
+    while (n_seq > 0 && d_ptr < d_end_safe && written < bounds_threshold) {
+        uint32_t seq = zxc_le32(seq_ptr);
+        seq_ptr += sizeof(uint32_t);
+
+        uint64_t ll = seq >> 24;
+        if (UNLIKELY(ll == ZXC_SEQ_LL_MASK)) ll += zxc_read_varint(&extras_ptr, extras_end);
+
+        uint32_t m_bits = (seq >> 16) & 0xFF;
+        uint64_t ml = m_bits + ZXC_LZ_MIN_MATCH_LEN;
+        if (UNLIKELY(m_bits == ZXC_SEQ_ML_MASK)) ml += zxc_read_varint(&extras_ptr, extras_end);
+
+        uint32_t offset = (seq & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+
+        // Wild copies need ZXC_PAD_SIZE bytes of slack past the sequence in dst and past
+        // the literal run in the literal stream; otherwise take the exact-copy branch.
+        if (UNLIKELY(ll + ml + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr) ||
+                     ll + ZXC_PAD_SIZE > (size_t)(l_end - l_ptr))) {
+            // Exact-length copies (slow, but never touch a byte beyond the sequence)
+            if (UNLIKELY(d_ptr + ll > d_end || l_ptr + ll > l_end)) return ZXC_ERROR_OVERFLOW;
+            ZXC_MEMCPY(d_ptr, l_ptr, ll);
+            l_ptr += ll;
+            d_ptr += ll;
+            written += ll;
+
+            if (UNLIKELY(offset > written || d_ptr + ml > d_end)) return ZXC_ERROR_BAD_OFFSET;
+            const uint8_t* match_src = d_ptr - offset;
+
+            if (offset < ml) {
+                for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i];
+            } else {
+                ZXC_MEMCPY(d_ptr, match_src, ml);
+            }
+            d_ptr += ml;
+            written += ml;
+        } else {
+            // Enough slack on both streams: wild copies are fine
+            DECODE_SEQ_SAFE(ll, ml, offset);
+        }
+        n_seq--;
+    }
+
+    // --- FAST Loop: past the threshold; the large margins replace per-sequence checks ---
+    if (safe) {
+        while (n_seq >= 4 && d_ptr < d_end_fast && l_ptr < l_end_safe_4x) {
+            const uint8_t* const t_save = seq_ptr;
+            const uint8_t* const e_save = extras_ptr;
+            uint8_t* const d_save = d_ptr;
+            const uint8_t* const l_save = l_ptr;
+            uint32_t s1 = zxc_le32(seq_ptr);
+            uint32_t s2 = zxc_le32(seq_ptr + sizeof(uint32_t));
+            uint32_t s3 = zxc_le32(seq_ptr + 2 * sizeof(uint32_t));
+            uint32_t s4 = zxc_le32(seq_ptr + 3 * sizeof(uint32_t));
+            seq_ptr += 4 * sizeof(uint32_t);
+
+            // Prefetch one cache line ahead in the literal stream to hide memory latency
+            ZXC_PREFETCH_READ(l_ptr + ZXC_CACHE_LINE_SIZE);
+
+            uint64_t ll1 = s1 >> 24;
+            if (UNLIKELY(ll1 == ZXC_SEQ_LL_MASK)) {
+                ll1 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s2 >> 24) + (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t m1b = (s1 >> 16) & 0xFF;
+            uint64_t ml1 = m1b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m1b == ZXC_SEQ_ML_MASK)) {
+                ml1 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll1 + ml1 + 3U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t off1 = (s1 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll1, ml1, off1);
+
+            uint64_t ll2 = s2 >> 24;
+            if (UNLIKELY(ll2 == ZXC_SEQ_LL_MASK)) {
+                ll2 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t m2b = (s2 >> 16) & 0xFF;
+            uint64_t ml2 = m2b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m2b == ZXC_SEQ_ML_MASK)) {
+                ml2 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll2 + ml2 + 2U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t off2 = (s2 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll2, ml2, off2);
+
+            uint64_t ll3 = s3 >> 24;
+            if (UNLIKELY(ll3 == ZXC_SEQ_LL_MASK)) {
+                ll3 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s4 >> 24);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t m3b = (s3 >> 16) & 0xFF;
+            uint64_t ml3 = m3b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m3b == ZXC_SEQ_ML_MASK)) {
+                ml3 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t off3 = (s3 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll3, ml3, off3);
+
+            uint64_t ll4 = s4 >> 24;
+            if (UNLIKELY(ll4 == ZXC_SEQ_LL_MASK)) {
+                ll4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t m4b = (s4 >> 16) & 0xFF;
+            uint64_t ml4 = m4b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m4b == ZXC_SEQ_ML_MASK)) {
+                ml4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    goto rollback_fast_4x;
+            }
+            uint32_t off4 = (s4 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll4, ml4, off4);
+
+            n_seq -= 4;
+            continue;
+
+        rollback_fast_4x:
+            seq_ptr = t_save;
+            extras_ptr = e_save;
+            d_ptr = d_save;
+            l_ptr = l_save;
+            break;
+        }
+    } else {
+        while (n_seq >= 4 && d_ptr < d_end_fast && l_ptr < l_end_safe_4x) {
+            uint32_t s1 = zxc_le32(seq_ptr);
+            uint32_t s2 = zxc_le32(seq_ptr + sizeof(uint32_t));
+            uint32_t s3 = zxc_le32(seq_ptr + 2 * sizeof(uint32_t));
+            uint32_t s4 = zxc_le32(seq_ptr + 3 * sizeof(uint32_t));
+            seq_ptr += 4 * sizeof(uint32_t);
+
+            // Prefetch one cache line ahead in the literal stream to hide memory latency
+            ZXC_PREFETCH_READ(l_ptr + ZXC_CACHE_LINE_SIZE);
+
+            uint64_t ll1 = s1 >> 24;
+            if (UNLIKELY(ll1 == ZXC_SEQ_LL_MASK)) {
+                ll1 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s2 >> 24) + (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll1 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll1 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m1b = (s1 >> 16) & 0xFF;
+            uint64_t ml1 = m1b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m1b == ZXC_SEQ_ML_MASK)) {
+                ml1 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll1 + ml1 + 3U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off1 = (s1 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll1, ml1, off1);
+
+            uint64_t ll2 = s2 >> 24;
+            if (UNLIKELY(ll2 == ZXC_SEQ_LL_MASK)) {
+                ll2 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s3 >> 24) + (s4 >> 24);
+                if (UNLIKELY(ll2 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll2 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m2b = (s2 >> 16) & 0xFF;
+            uint64_t ml2 = m2b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m2b == ZXC_SEQ_ML_MASK)) {
+                ml2 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll2 + ml2 + 2U * ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off2 = (s2 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll2, ml2, off2);
+
+            uint64_t ll3 = s3 >> 24;
+            if (UNLIKELY(ll3 == ZXC_SEQ_LL_MASK)) {
+                ll3 += zxc_read_varint(&extras_ptr, extras_end);
+                const uint64_t reserve = (s4 >> 24);
+                if (UNLIKELY(ll3 + reserve > (size_t)(l_end - l_ptr) ||
+                             ll3 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m3b = (s3 >> 16) & 0xFF;
+            uint64_t ml3 = m3b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m3b == ZXC_SEQ_ML_MASK)) {
+                ml3 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll3 + ml3 + ZXC_GHI_MAX_INLINE_OUT_PER_SEQ + ZXC_PAD_SIZE >
+                             (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off3 = (s3 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll3, ml3, off3);
+
+            uint64_t ll4 = s4 >> 24;
+            if (UNLIKELY(ll4 == ZXC_SEQ_LL_MASK)) {
+                ll4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 > (size_t)(l_end - l_ptr) ||
+                             ll4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t m4b = (s4 >> 16) & 0xFF;
+            uint64_t ml4 = m4b + ZXC_LZ_MIN_MATCH_LEN;
+            if (UNLIKELY(m4b == ZXC_SEQ_ML_MASK)) {
+                ml4 += zxc_read_varint(&extras_ptr, extras_end);
+                if (UNLIKELY(ll4 + ml4 + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr)))
+                    return ZXC_ERROR_OVERFLOW;
+            }
+            uint32_t off4 = (s4 & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+            DECODE_SEQ_FAST(ll4, ml4, off4);
+
+            n_seq -= 4;
+        }
+    }
+
+    // --- Remaining sequences, one at a time (Fast Path) ---
+    while (n_seq > 0 && d_ptr < d_end_safe && l_ptr < l_end_safe_1x) {
+        // Snapshot the stream cursors so the Safe Path can replay this sequence
+        const uint8_t* seq_save = seq_ptr;
+        const uint8_t* ext_save = extras_ptr;
+
+        const uint32_t seq = zxc_le32(seq_ptr);
+        seq_ptr += sizeof(uint32_t);
+
+        uint64_t ll = seq >> 24;
+        if (UNLIKELY(ll == ZXC_SEQ_LL_MASK)) {
+            ll += zxc_read_varint(&extras_ptr, extras_end);
+            if (UNLIKELY(l_ptr + ll > l_end)) {
+                seq_ptr = seq_save;
+                extras_ptr = ext_save;
+                break;
+            }
+        }
+
+        uint32_t m_bits = (seq >> 16) & 0xFF;
+        uint64_t ml = m_bits + ZXC_LZ_MIN_MATCH_LEN;
+        if (UNLIKELY(m_bits == ZXC_SEQ_ML_MASK)) ml += zxc_read_varint(&extras_ptr, extras_end);
+
+        // Both streams need room for the sequence plus the wild-copy overrun
+        if (UNLIKELY(ll + ml + ZXC_PAD_SIZE > (size_t)(d_end - d_ptr) ||
+                     ll + ZXC_PAD_SIZE > (size_t)(l_end - l_ptr))) {
+            // Rewind and let the Safe Path decode this sequence
+            seq_ptr = seq_save;
+            extras_ptr = ext_save;
+            break;
+        }
+        uint32_t offset = (seq & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+
+        {
+            const uint8_t* src_lit = l_ptr;
+            uint8_t* dst_lit = d_ptr;
+            zxc_copy32(dst_lit, src_lit);
+            if (UNLIKELY(ll > ZXC_PAD_SIZE)) {
+                dst_lit += ZXC_PAD_SIZE;
+                src_lit += ZXC_PAD_SIZE;
+                size_t rem = ll - ZXC_PAD_SIZE;
+                while (rem > ZXC_PAD_SIZE) {
+                    zxc_copy32(dst_lit, src_lit);
+                    dst_lit += ZXC_PAD_SIZE;
+                    src_lit += ZXC_PAD_SIZE;
+                    rem -= ZXC_PAD_SIZE;
+                }
+                zxc_copy32(dst_lit, src_lit);
+            }
+            l_ptr += ll;
+            d_ptr += ll;
+            written += ll;
+        }
+
+        {
+            // No-op once written >= bounds_threshold (every decodable offset is in range)
+            if (UNLIKELY(written < bounds_threshold && offset > written))
+                return ZXC_ERROR_BAD_OFFSET;
+
+            /* The bounds check above guarantees ll + ml + ZXC_PAD_SIZE bytes of
+             * headroom, so the wild-copy ladder (incl. overlap/fill runs) is safe. */
+            zxc_decode_copy_match(d_ptr, offset, ml);
+            d_ptr += ml;
+            written += ml;
+        }
+        n_seq--;
+    }
+
+    // --- Safe Path: exact-length copies for whatever sequences remain ---
+    while (n_seq > 0) {
+        uint32_t seq = zxc_le32(seq_ptr);
+        seq_ptr += sizeof(uint32_t);
+
+        uint64_t ll = seq >> 24;
+        if (UNLIKELY(ll == ZXC_SEQ_LL_MASK)) ll += zxc_read_varint(&extras_ptr, extras_end);
+
+        uint32_t m_bits = (seq >> 16) & 0xFF;
+        uint64_t ml = m_bits + ZXC_LZ_MIN_MATCH_LEN;
+        if (UNLIKELY(m_bits == ZXC_SEQ_ML_MASK)) ml += zxc_read_varint(&extras_ptr, extras_end);
+        uint32_t offset = (seq & 0xFFFF) + ZXC_LZ_OFFSET_BIAS;
+
+        if (UNLIKELY(ll + ml > (size_t)(d_end - d_ptr) || l_ptr + ll > l_end))
+            return ZXC_ERROR_OVERFLOW;
+        ZXC_MEMCPY(d_ptr, l_ptr, ll);
+        l_ptr += ll;
+        d_ptr += ll;
+
+        const uint8_t* match_src = d_ptr - offset;
+        if (UNLIKELY(match_src < dst - dict_size)) return ZXC_ERROR_BAD_OFFSET;
+
+        if (offset < ml) {
+            for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i];
+        } else {
+            ZXC_MEMCPY(d_ptr, match_src, ml);
+        }
+        d_ptr += ml;
+        n_seq--;
+    }
+
+    // --- Trailing Literals ---
+    // Whatever is left in the literal stream is appended verbatim after the last match
+    if (UNLIKELY(l_ptr > l_end)) return ZXC_ERROR_CORRUPT_DATA;
+    const size_t remaining_literals = (size_t)(l_end - l_ptr);
+    if (remaining_literals > 0) {
+        if (UNLIKELY(d_ptr + remaining_literals > d_end)) return ZXC_ERROR_OVERFLOW;
+        ZXC_MEMCPY(d_ptr, l_ptr, remaining_literals);
+        d_ptr += remaining_literals;
+    }
+
+    return (int)(d_ptr - dst);
+}
+
+/**
+ * @brief Decode a no-dict GLO block (plain, inlinable path).
+ *
+ * Wrapper over @ref zxc_decode_block_glo_impl with @c safe=0, @c has_dict=0, so
+ * the no-dict chunk wrapper inlines it exactly like the dict-free build.
+ *
+ * @param[in,out] ctx          Decompression context.
+ * @param[in]     src          Compressed GLO block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static int zxc_decode_block_glo(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                const size_t src_size, uint8_t* RESTRICT dst,
+                                const size_t dst_capacity) {
+    return zxc_decode_block_glo_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/0, /*dict*/0);
+}
+
+/**
+ * @brief Decode a no-dict GHI block (plain, inlinable path).
+ *
+ * Wrapper over @ref zxc_decode_block_ghi_impl with @c safe=0, @c has_dict=0.
+ *
+ * @param[in,out] ctx          Decompression context.
+ * @param[in]     src          Compressed GHI block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static int zxc_decode_block_ghi(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                const size_t src_size, uint8_t* RESTRICT dst,
+                                const size_t dst_capacity) {
+    return zxc_decode_block_ghi_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/0, /*dict*/0);
+}
+
+/**
+ * @brief Decode a GLO block against a dictionary prefix (cold path).
+ *
+ * Wrapper over @ref zxc_decode_block_glo_impl with @c safe=0, @c has_dict=1.
+ * NOINLINE: only reached on the cold dict path (@ref zxc_decompress_chunk_wrapper_dict),
+ * so it never loads into I-cache on a no-dict stream.
+ *
+ * @param[in,out] ctx          Decompression context (dict prefix in its buffer).
+ * @param[in]     src          Compressed GLO block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_NOINLINE int zxc_decode_block_glo_dict(const zxc_cctx_t* RESTRICT ctx,
+                                                  const uint8_t* RESTRICT src,
+                                                  const size_t src_size, uint8_t* RESTRICT dst,
+                                                  const size_t dst_capacity) {
+    return zxc_decode_block_glo_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/0, /*dict*/1);
+}
+
+/**
+ * @brief Decode a GHI block against a dictionary prefix (cold path).
+ *
+ * Wrapper over @ref zxc_decode_block_ghi_impl with @c safe=0, @c has_dict=1
+ * (NOINLINE; see @ref zxc_decode_block_glo_dict).
+ *
+ * @param[in,out] ctx          Decompression context (dict prefix in its buffer).
+ * @param[in]     src          Compressed GHI block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer.
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_NOINLINE int zxc_decode_block_ghi_dict(const zxc_cctx_t* RESTRICT ctx,
+                                                  const uint8_t* RESTRICT src,
+                                                  const size_t src_size, uint8_t* RESTRICT dst,
+                                                  const size_t dst_capacity) {
+    return zxc_decode_block_ghi_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/0, /*dict*/1);
+}
+
+/**
+ * @brief Decode a GLO block with the strict-tail safe loop (no wild copies).
+ *
+ * Wrapper over @ref zxc_decode_block_glo_impl with @c safe=1, @c has_dict=0.
+ * The safe path never carries a dict (block_safe routes dict inputs to the
+ * bounce path), so @c has_dict=0 folds the dead dict handling.
+ *
+ * @param[in,out] ctx          Decompression context.
+ * @param[in]     src          Compressed GLO block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer (capacity == exact decoded size).
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_NOINLINE int zxc_decode_block_glo_safe(const zxc_cctx_t* RESTRICT ctx,
+                                                  const uint8_t* RESTRICT src,
+                                                  const size_t src_size, uint8_t* RESTRICT dst,
+                                                  const size_t dst_capacity) {
+    return zxc_decode_block_glo_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/1, /*dict*/0);
+}
+
+/**
+ * @brief Decode a GHI block with the strict-tail safe loop (no wild copies).
+ *
+ * Wrapper over @ref zxc_decode_block_ghi_impl with @c safe=1, @c has_dict=0
+ * (the strict-tail safe path never carries a dict; see
+ * @ref zxc_decode_block_glo_safe).
+ *
+ * @param[in,out] ctx          Decompression context.
+ * @param[in]     src          Compressed GHI block payload.
+ * @param[in]     src_size     Size of @p src in bytes.
+ * @param[out]    dst          Destination buffer (capacity == exact decoded size).
+ * @param[in]     dst_capacity Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_NOINLINE int zxc_decode_block_ghi_safe(const zxc_cctx_t* RESTRICT ctx,
+                                                  const uint8_t* RESTRICT src,
+                                                  const size_t src_size, uint8_t* RESTRICT dst,
+                                                  const size_t dst_capacity) {
+    return zxc_decode_block_ghi_impl(ctx, src, src_size, dst, dst_capacity, /*safe*/1, /*dict*/0);
+}
+
+#undef DECODE_SEQ_FAST
+#undef DECODE_SEQ_SAFE
+
+/**
+ * @brief Shared chunk-decode body: validates the block header, verifies the
+ *        optional checksum, then dispatches on block type.
+ *
+ * @p has_dict is a compile-time constant: the no-dict instantiation folds the
+ * GLO/GHI selection to the plain (inlinable) decoders, so
+ * @ref zxc_decompress_chunk_wrapper carries no dict code and matches the
+ * dict-free build; the dict instantiation calls the NOINLINE @c _dict decoders.
+ *
+ * @param[in,out] ctx       Decompression context.
+ * @param[in]     src       Compressed block (header + payload + optional checksum).
+ * @param[in]     src_sz    Size of @p src in bytes.
+ * @param[out]    dst       Destination buffer for the decoded block.
+ * @param[in]     dst_cap   Capacity of @p dst in bytes.
+ * @param[in]     has_dict  Compile-time flag: 1 = dictionary-aware decoders.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+static ZXC_ALWAYS_INLINE int zxc_decompress_chunk_wrapper_body(
+    const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz,
+    uint8_t* RESTRICT dst, const size_t dst_cap, const int has_dict) {
+    if (UNLIKELY(src_sz < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    const uint8_t type = src[0];
+    const uint32_t comp_sz = zxc_le32(src + 3);
+    const int has_crc = ctx->checksum_enabled;
+
+    // Check bounds: Header + Body + Checksum(if any). Done by subtraction: summing the
+    // untrusted comp_sz could wrap a 32-bit size_t and let a truncated block through.
+    const size_t crc_sz = has_crc ? (size_t)ZXC_BLOCK_CHECKSUM_SIZE : 0;
+    const size_t avail = src_sz - (size_t)ZXC_BLOCK_HEADER_SIZE;  // cannot underflow, see above
+    if (UNLIKELY(avail < crc_sz || comp_sz > avail - crc_sz)) return ZXC_ERROR_SRC_TOO_SMALL;
+    const uint8_t* data = src + ZXC_BLOCK_HEADER_SIZE;
+
+    if (has_crc) {
+        const uint32_t stored = zxc_le32(data + comp_sz);
+        const uint32_t calc = zxc_checksum(data, comp_sz, ZXC_CHECKSUM_RAPIDHASH);
+        if (UNLIKELY(stored != calc)) return ZXC_ERROR_BAD_CHECKSUM;
+    }
+
+    int decoded_sz = ZXC_ERROR_BAD_BLOCK_TYPE;
+
+    switch (type) {
+        case ZXC_BLOCK_GLO:
+            decoded_sz = has_dict ? zxc_decode_block_glo_dict(ctx, data, comp_sz, dst, dst_cap)
+                                  : zxc_decode_block_glo(ctx, data, comp_sz, dst, dst_cap);
+            break;
+        case ZXC_BLOCK_GHI:
+            decoded_sz = has_dict ? zxc_decode_block_ghi_dict(ctx, data, comp_sz, dst, dst_cap)
+                                  : zxc_decode_block_ghi(ctx, data, comp_sz, dst, dst_cap);
+            break;
+        case ZXC_BLOCK_RAW:
+            // For RAW blocks, comp_sz == raw_sz (uncompressed data stored as-is)
+            if (UNLIKELY(comp_sz > dst_cap)) return ZXC_ERROR_DST_TOO_SMALL;
+            ZXC_MEMCPY(dst, data, comp_sz);
+            decoded_sz = (int)comp_sz;
+            break;
+        case ZXC_BLOCK_EOF:
+            // EOF should be handled by the dispatcher, not here
+            return ZXC_ERROR_CORRUPT_DATA;
+        default:
+            return ZXC_ERROR_BAD_BLOCK_TYPE;
+    }
+
+    return decoded_sz;
+}
+
+/**
+ * @brief Public no-dict chunk decoder (decompression hot path).
+ *
+ * Inlines the plain GLO/GHI decoders via @ref zxc_decompress_chunk_wrapper_body
+ * with @c has_dict=0, so it carries no dict code and matches the dict-free build.
+ *
+ * @param[in,out] ctx     Decompression context.
+ * @param[in]     src     Compressed block bytes.
+ * @param[in]     src_sz  Size of @p src in bytes.
+ * @param[out]    dst     Destination buffer for the decoded block.
+ * @param[in]     dst_cap Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+// cppcheck-suppress unusedFunction
+int zxc_decompress_chunk_wrapper(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                 const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) {
+    return zxc_decompress_chunk_wrapper_body(ctx, src, src_sz, dst, dst_cap, /*has_dict=*/0);
+}
+
+/**
+ * @brief Public dictionary chunk decoder.
+ *
+ * Routes through @ref zxc_decompress_chunk_wrapper_body with @c has_dict=1,
+ * which calls the NOINLINE @c _dict decoders (slower: dict back-refs read the
+ * prepended dictionary). Used only when @c ctx->dict_size != 0.
+ *
+ * @param[in,out] ctx     Decompression context (dict prefix in its buffer).
+ * @param[in]     src     Compressed block bytes.
+ * @param[in]     src_sz  Size of @p src in bytes.
+ * @param[out]    dst     Destination buffer for the decoded block.
+ * @param[in]     dst_cap Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+// cppcheck-suppress unusedFunction
+int zxc_decompress_chunk_wrapper_dict(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap) {
+    return zxc_decompress_chunk_wrapper_body(ctx, src, src_sz, dst, dst_cap, /*has_dict=*/1);
+}
+
+/**
+ * @brief Public strict-tail safe chunk decoder (dst_cap == exact decoded size).
+ *
+ * Validates the block header and optional checksum, then decodes via the
+ * @c _safe decoders (no bounce buffer, no tail padding); RAW blocks are copied
+ * directly. Dict inputs are not handled here (the caller routes them to the
+ * bounce-capable path).
+ *
+ * @param[in,out] ctx     Decompression context.
+ * @param[in]     src     Compressed block bytes.
+ * @param[in]     src_sz  Size of @p src in bytes.
+ * @param[out]    dst     Destination buffer (capacity == exact decoded size).
+ * @param[in]     dst_cap Capacity of @p dst in bytes.
+ * @return Bytes written on success, or a negative @ref zxc_error_t.
+ */
+// cppcheck-suppress unusedFunction
+int zxc_decompress_chunk_wrapper_safe(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap) {
+    if (UNLIKELY(src_sz < ZXC_BLOCK_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    const uint8_t type = src[0];
+    const uint32_t comp_sz = zxc_le32(src + 3);
+    const int has_crc = ctx->checksum_enabled;
+
+    // Subtraction-based bounds check: the untrusted comp_sz cannot wrap a 32-bit size_t.
+    const size_t crc_sz = has_crc ? (size_t)ZXC_BLOCK_CHECKSUM_SIZE : 0;
+    const size_t avail = src_sz - (size_t)ZXC_BLOCK_HEADER_SIZE;  // cannot underflow, see above
+    if (UNLIKELY(avail < crc_sz || comp_sz > avail - crc_sz)) return ZXC_ERROR_SRC_TOO_SMALL;
+    const uint8_t* data = src + ZXC_BLOCK_HEADER_SIZE;
+
+    if (has_crc) {
+        const uint32_t stored = zxc_le32(data + comp_sz);
+        const uint32_t calc = zxc_checksum(data, comp_sz, ZXC_CHECKSUM_RAPIDHASH);
+        if (UNLIKELY(stored != calc)) return ZXC_ERROR_BAD_CHECKSUM;
+    }
+
+    switch (type) {
+        case ZXC_BLOCK_GLO:
+            return zxc_decode_block_glo_safe(ctx, data, comp_sz, dst, dst_cap);
+        case ZXC_BLOCK_GHI:
+            return zxc_decode_block_ghi_safe(ctx, data, comp_sz, dst, dst_cap);
+        case ZXC_BLOCK_RAW:
+            if (UNLIKELY(comp_sz > dst_cap)) return ZXC_ERROR_DST_TOO_SMALL;
+            ZXC_MEMCPY(dst, data, comp_sz);
+            return (int)comp_sz;
+        case ZXC_BLOCK_EOF:
+            return ZXC_ERROR_CORRUPT_DATA;
+        default:
+            return ZXC_ERROR_BAD_BLOCK_TYPE;
+    }
+}
diff --git a/thirdparty/zxc/src/lib/zxc_deps.h b/thirdparty/zxc/src/lib/zxc_deps.h
new file mode 100644
index 000000000000..47397bcb69a4
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_deps.h
@@ -0,0 +1,123 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_deps.h
+ * @brief Single point of override for the C-library dependencies of libzxc.
+ *
+ * The core compression/decompression code only ever reaches the standard
+ * library through the macros and headers declared here.
+ * Freestanding consumers vendor the source tree and replace this file with
+ * an environment-specific version that maps the same macros onto their own
+ * allocator, sort, and basic header set.
+ *
+ * @par Stock (hosted) build
+ * Pulls in @c <limits.h>, @c <stdint.h>, @c <stdlib.h>, @c <string.h> and
+ * expands the macros to their libc equivalents.
+ *
+ * Per-symbol @c -D overrides are also accepted (each macro is guarded by
+ * an @c ifndef), so vendoring is optional for ad-hoc consumers.
+ */
+
+#ifndef ZXC_DEPS_H
+#define ZXC_DEPS_H
+
+/**
+ * @addtogroup internal
+ * @{
+ */
+
+/**
+ * @name Standard Headers
+ * @brief Pulled in by the stock libzxc build to provide @c size_t,
+ * @c uintN_t / @c intN_t, @c CHAR_BIT, @c malloc / @c calloc /
+ * @c realloc / @c free, and @c memcpy / @c memset / @c memmove /
+ * @c memcmp.
+ *
+ * Vendored overrides of this file typically substitute @c <linux/limits.h>,
+ * @c <linux/types.h>, @c <linux/string.h>, @c <linux/slab.h>,
+ * @c <linux/sort.h>.
+ * @{
+ */
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+/** @} */ /* end of Standard Headers */
+
+/**
+ * @name Heap Allocator Abstraction
+ * @brief Macros around the libc allocators so non-libc targets (Linux kernel,
+ * embedded freestanding builds, custom arenas) can override them via @c -D
+ * flags **before** including any zxc header, or by vendoring this file.
+ *
+ * @note Aligned allocations go through @ref ZXC_ALIGNED_MALLOC /
+ * @ref ZXC_ALIGNED_FREE (see below), not these.
+ * @{
+ */
+
+/** @def ZXC_MALLOC
+ *  @brief Heap allocator. Default: libc @c malloc. */
+#ifndef ZXC_MALLOC
+#define ZXC_MALLOC(size) malloc(size)
+#endif
+
+/** @def ZXC_CALLOC
+ *  @brief Zero-initialised heap allocator. Default: libc @c calloc. */
+#ifndef ZXC_CALLOC
+#define ZXC_CALLOC(nmemb, size) calloc(nmemb, size)
+#endif
+
+/** @def ZXC_REALLOC
+ *  @brief Heap reallocator (may resize in place or move the block). Default: libc @c realloc. */
+#ifndef ZXC_REALLOC
+#define ZXC_REALLOC(ptr, size) realloc(ptr, size)
+#endif
+
+/** @def ZXC_FREE
+ *  @brief Heap deallocator. Default: libc @c free. */
+#ifndef ZXC_FREE
+#define ZXC_FREE(ptr) free(ptr)
+#endif
+
+/** @} */ /* end of Heap Allocator Abstraction */
+
+/**
+ * @name Aligned Allocator Abstraction
+ * @brief Macros around the cache-line-aligned allocator used for compression
+ * workspace and per-context scratch buffers.
+ *
+ * The default expansion calls the internal helpers @ref zxc_aligned_malloc /
+ * @ref zxc_aligned_free (forward-declared in @c zxc_internal.h, defined in
+ * @c zxc_common.c), which wrap @c _aligned_malloc / @c _aligned_free on
+ * Windows and @c posix_memalign / @c free on POSIX.
+ *
+ * Kernel builds typically map this to the slab allocator: @c kmalloc already
+ * returns @c ARCH_KMALLOC_MINALIGN-aligned memory, which is greater than or
+ * equal to the cache line size on every supported architecture.
+ * @{
+ */
+
+/** @def ZXC_ALIGNED_MALLOC
+ *  @brief Cache-line-aligned allocator.
+ *         Default: @c zxc_aligned_malloc (wraps @c posix_memalign /
+ *         @c _aligned_malloc). */
+#ifndef ZXC_ALIGNED_MALLOC
+#define ZXC_ALIGNED_MALLOC(size, alignment) zxc_aligned_malloc(size, alignment)
+#endif
+
+/** @def ZXC_ALIGNED_FREE
+ *  @brief Counterpart deallocator for @ref ZXC_ALIGNED_MALLOC. Default: @c zxc_aligned_free. */
+#ifndef ZXC_ALIGNED_FREE
+#define ZXC_ALIGNED_FREE(ptr) zxc_aligned_free(ptr)
+#endif
+
+/** @} */ /* end of Aligned Allocator Abstraction */
+
+/** @} */ /* end of addtogroup internal */
+
+#endif /* ZXC_DEPS_H */
diff --git a/thirdparty/zxc/src/lib/zxc_dict.c b/thirdparty/zxc/src/lib/zxc_dict.c
new file mode 100644
index 000000000000..56a18e0bdab4
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_dict.c
@@ -0,0 +1,674 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_dict.c
+ * @brief Pre-trained dictionary: ID computation, .zxd serialization, and training.
+ */
+
+#include "../../include/zxc_dict.h"
+
+#include "../../include/zxc_buffer.h"
+#include "zxc_internal.h"
+
+/* -------------------------------------------------------------------------
+ *  Dictionary ID
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief Computes the dictionary identifier for @p dict (and optional table).
+ *
+ * Public API; see @c zxc_dict.h. One logical checksum over the content bytes,
+ * optionally chained with the packed Huffman lengths so a single id covers
+ * both. Stored in the archive header and re-checked on decode.
+ *
+ * @param[in] dict         Dictionary content bytes.
+ * @param[in] dict_size    Content length in bytes.
+ * @param[in] huf_lengths  Optional packed Huffman lengths (@c ZXC_HUF_TABLE_SIZE
+ *                         bytes), or NULL to hash the content alone.
+ * @return The 32-bit dictionary id, or 0 if @p dict is NULL or empty.
+ */
+uint32_t zxc_dict_id(const void* RESTRICT dict, const size_t dict_size,
+                     const void* RESTRICT huf_lengths) {
+    if (UNLIKELY(dict == NULL || dict_size == 0)) return 0;
+    /* Content and table live in separate buffers, so rather than hashing a
+     * concatenated copy, the content checksum is fed in as the seed of the
+     * table checksum: one chained id over exactly the real bytes. */
+    const uint32_t content_sum = zxc_checksum(dict, dict_size, 0);
+    return huf_lengths ? zxc_checksum_seed(huf_lengths, ZXC_HUF_TABLE_SIZE, content_sum, 0)
+                       : content_sum;
+}
+
+/* -------------------------------------------------------------------------
+ *  .zxd format: save / load / bound
+ *
+ *  Layout (ZXC_DICT_HEADER_SIZE = 16 bytes + content + Huffman table):
+ *    0x00  4  Magic   (0x9CB0D1C7 LE)
+ *    0x04  1  Version (1)
+ *    0x05  1  Flags   (bits 0-3: checksum algo id, 0=RapidHash; bits 4-7 reserved)
+ *    0x06  2  Content size (u16 LE)
+ *    0x08  4  dict_id (u32 LE; covers content AND the Huffman table --
+ *                      see zxc_dict_id)
+ *    0x0C  2  Reserved (0)
+ *    0x0E  2  Header CRC16 (zxc_hash16, computed with bytes 0x0C-0x0F zeroed)
+ *    0x10  N  Content bytes
+ *    +N    128 Packed Huffman code lengths (always present)
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief Extracts the stored @c dict_id from a serialized .zxd buffer.
+ *
+ * Public API; see @c zxc_dict.h. Validates the magic, then reads the id field
+ * straight from the header without recomputing it.
+ *
+ * @param[in] buf       Serialized .zxd bytes.
+ * @param[in] buf_size  Size of @p buf in bytes.
+ * @return The stored dictionary id, or 0 if @p buf is too small or not a .zxd.
+ */
+uint32_t zxc_dict_get_id(const void* buf, const size_t buf_size) {
+    const uint8_t* const hdr = (const uint8_t*)buf;
+    /* Need a non-NULL buffer holding a whole header that opens with the magic. */
+    if (UNLIKELY(hdr == NULL || buf_size < ZXC_DICT_HEADER_SIZE)) return 0;
+    return UNLIKELY(zxc_le32(hdr) != ZXC_DICT_MAGIC) ? 0 : zxc_le32(hdr + 8);
+}
+
+/**
+ * @brief Worst-case .zxd byte size for a dictionary of @p content_size bytes.
+ *
+ * Public API; see @c zxc_dict.h. Use it to size the buffer passed to
+ * @ref zxc_dict_save.
+ *
+ * @param[in] content_size  Dictionary content length in bytes.
+ * @return Required buffer size: header + content + Huffman table.
+ */
+size_t zxc_dict_save_bound(const size_t content_size) {
+    return content_size + ZXC_DICT_HEADER_SIZE + ZXC_HUF_TABLE_SIZE; /* payload + header + table */
+}
+
+/**
+ * @brief Serializes dictionary content + Huffman table into the .zxd format.
+ *
+ * Public API; full contract in @c zxc_dict.h. Writes the 16-byte header (magic,
+ * version, sizes, dict_id, header CRC), then the content bytes, then the packed
+ * Huffman lengths. See the on-disk layout note above.
+ *
+ * @param[in]  content       Dictionary content bytes.
+ * @param[in]  content_size  Content length (<= @c ZXC_DICT_SIZE_MAX).
+ * @param[in]  huf_lengths   Packed Huffman lengths (@c ZXC_HUF_TABLE_SIZE bytes).
+ * @param[out] buf           Destination .zxd buffer.
+ * @param[in]  buf_capacity  Capacity of @p buf (>= @ref zxc_dict_save_bound).
+ * @return Bytes written on success, or a negative @ref zxc_error_t
+ *         (@ref ZXC_ERROR_NULL_INPUT, @ref ZXC_ERROR_DICT_TOO_LARGE,
+ *         @ref ZXC_ERROR_DST_TOO_SMALL).
+ */
+int64_t zxc_dict_save(const void* RESTRICT content, const size_t content_size,
+                      const void* RESTRICT huf_lengths, void* RESTRICT buf,
+                      const size_t buf_capacity) {
+    if (UNLIKELY(!content || !huf_lengths || content_size == 0)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(content_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+
+    const size_t needed = ZXC_DICT_HEADER_SIZE + content_size + ZXC_HUF_TABLE_SIZE;
+    if (UNLIKELY(needed > buf_capacity)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    uint8_t* const out = (uint8_t*)buf;
+    uint8_t* const body = out + ZXC_DICT_HEADER_SIZE;
+    /* Header fields in on-disk order; bytes 0x0C-0x0F are zero while the CRC is taken. */
+    zxc_store_le32(out, ZXC_DICT_MAGIC);
+    out[4] = ZXC_DICT_VERSION;
+    out[5] = 0; /* flags: checksum algo 0 (RapidHash), upper bits reserved */
+    zxc_store_le16(out + 6, (uint16_t)content_size);
+    zxc_store_le32(out + 8, zxc_dict_id(content, content_size, huf_lengths));
+    zxc_store_le32(out + 12, 0);
+    const uint16_t header_crc = zxc_hash16(out);
+    zxc_store_le16(out + 14, header_crc);
+
+    ZXC_MEMCPY(body, content, content_size);
+    ZXC_MEMCPY(body + content_size, huf_lengths, ZXC_HUF_TABLE_SIZE);
+    return (int64_t)needed;
+}
+
+/**
+ * @brief Parses a .zxd buffer, returning in-buffer views of content and table.
+ *
+ * Public API; full contract in @c zxc_dict.h. Validates magic, version, header
+ * CRC and dict_id, then points the out-params at the content and Huffman table
+ * inside @p buf (no copy). The returned pointers alias @p buf and stay valid
+ * only while it does.
+ *
+ * @param[in]  buf               Serialized .zxd bytes.
+ * @param[in]  buf_size          Size of @p buf in bytes.
+ * @param[out] content_out       Receives a pointer to the content bytes.
+ * @param[out] content_size_out  Receives the content length.
+ * @param[out] huf_out           Receives a pointer to the Huffman table (optional).
+ * @param[out] dict_id_out       Receives the validated dict_id (optional).
+ * @return @ref ZXC_OK, or a negative @ref zxc_error_t (NULL input, bad magic / version /
+ *         header CRC / dict_id checksum, zero or oversized content, or buffer too small).
+ */
+int zxc_dict_load(const void* RESTRICT buf, const size_t buf_size,
+                  const void** RESTRICT content_out, size_t* RESTRICT content_size_out,
+                  const void** RESTRICT huf_out, uint32_t* RESTRICT dict_id_out) {
+    if (UNLIKELY(buf == NULL || content_out == NULL || content_size_out == NULL))
+        return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(buf_size < ZXC_DICT_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    const uint8_t* const hdr = (const uint8_t*)buf;
+    if (UNLIKELY(zxc_le32(hdr) != ZXC_DICT_MAGIC)) return ZXC_ERROR_BAD_MAGIC;
+    if (UNLIKELY(hdr[4] != ZXC_DICT_VERSION)) return ZXC_ERROR_BAD_VERSION;
+
+    const size_t n_content = zxc_le16(hdr + 6);
+    if (UNLIKELY(n_content == 0)) return ZXC_ERROR_CORRUPT_DATA;
+    if (UNLIKELY(n_content > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+    if (UNLIKELY(ZXC_DICT_HEADER_SIZE + n_content + ZXC_HUF_TABLE_SIZE > buf_size))
+        return ZXC_ERROR_SRC_TOO_SMALL;  // LCOV_EXCL_LINE
+
+    /* Header CRC is taken over a copy with reserved (0x0C) + CRC16 (0x0E) zeroed. */
+    uint8_t scratch[ZXC_DICT_HEADER_SIZE];
+    ZXC_MEMCPY(scratch, hdr, sizeof(scratch));
+    zxc_store_le32(scratch + 12, 0);
+    const uint16_t want_crc = zxc_hash16(scratch);
+    if (UNLIKELY(want_crc != zxc_le16(hdr + 14))) return ZXC_ERROR_BAD_HEADER;
+
+    const uint8_t* const payload = hdr + ZXC_DICT_HEADER_SIZE;
+    const uint8_t* const table = payload + n_content;
+
+    /* The stored dict_id must match the id of the (content, table) pair. */
+    const uint32_t dict_id = zxc_dict_id(payload, n_content, table);
+    if (UNLIKELY(dict_id != zxc_le32(hdr + 8))) return ZXC_ERROR_BAD_CHECKSUM;
+
+    *content_out = payload;
+    *content_size_out = n_content;
+    if (huf_out != NULL) *huf_out = table;
+    if (dict_id_out != NULL) *dict_id_out = dict_id;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Returns a pointer to the packed Huffman table inside a .zxd buffer.
+ *
+ * Public API; see @c zxc_dict.h. Lightweight accessor (checks magic, version
+ * and sizes) that locates the table without the full validation of
+ * @ref zxc_dict_load. The returned pointer aliases @p buf.
+ *
+ * @param[in] buf       Serialized .zxd bytes.
+ * @param[in] buf_size  Size of @p buf in bytes.
+ * @return Pointer to the @c ZXC_HUF_TABLE_SIZE-byte table, or NULL if @p buf is
+ *         too small or not a valid .zxd.
+ */
+const void* zxc_dict_huf(const void* buf, const size_t buf_size) {
+    const uint8_t* const hdr = (const uint8_t*)buf;
+    if (UNLIKELY(hdr == NULL || buf_size < ZXC_DICT_HEADER_SIZE)) return NULL;
+    /* Cheap header screening only: magic and version, no CRC / dict_id check. */
+    if (UNLIKELY(zxc_le32(hdr) != ZXC_DICT_MAGIC || hdr[4] != ZXC_DICT_VERSION)) return NULL;
+    const size_t table_off = ZXC_DICT_HEADER_SIZE + (size_t)zxc_le16(hdr + 6);
+    /* Empty content, or a buffer too short to hold the whole table, is rejected. */
+    if (UNLIKELY(table_off == ZXC_DICT_HEADER_SIZE || buf_size < table_off + ZXC_HUF_TABLE_SIZE))
+        return NULL;  // LCOV_EXCL_LINE
+    return hdr + table_off;
+}
+
+/* -------------------------------------------------------------------------
+ *  Dictionary training: k-gram frequency selection
+ *
+ *  Algorithm:
+ *  1. Concatenate all samples into a corpus.
+ *  2. For each position in the corpus, hash the k-gram (k = MIN_MATCH_LEN)
+ *     and count occurrences in a fixed-size hash map.
+ *  3. Walk the corpus, building candidate segments: each starts at a frequent
+ *     k-gram and extends while neighbours stay frequent. A segment's score is
+ *     the summed frequency of its k-grams (its coverage of the corpus).
+ *  4. Greedily fill the dictionary in descending coverage order, BUT account
+ *     for overlap: once a pattern is placed, a single copy serves all future
+ *     LZ matches, so its k-grams are zeroed in the frequency table. Segments
+ *     whose coverage has since collapsed (mostly already in the dict) are
+ *     skipped, so capacity goes to NEW patterns instead of redundant copies.
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief Hashes the k-gram at @p p into a frequency-table bucket index.
+ *
+ * Training-internal: a multiplicative hash of the @c ZXC_DICT_KGRAM_LEN-byte
+ * k-gram, folded down to @c ZXC_DICT_HASH_BITS.
+ *
+ * @param[in] p  Pointer to at least @c ZXC_DICT_KGRAM_LEN readable bytes.
+ * @return Bucket index in [0, @c ZXC_DICT_HASH_SIZE).
+ */
+static uint32_t zxc_dict_hash(const uint8_t* p) {
+    /* Fold the 5th byte into the low byte of the first four, then multiply-shift. */
+    const uint32_t kgram = zxc_le32(p) ^ (uint32_t)p[4];
+    return (kgram * ZXC_LZ_HASH_PRIME1) >> (32 - ZXC_DICT_HASH_BITS);
+}
+
+/**
+ * @brief Segment descriptor for dictionary training, scored by coverage.
+ */
+typedef struct {
+    uint32_t offset; /**< Offset of the segment in the corpus. */
+    uint16_t length; /**< Length of the segment (16-bit field: at most 65535). */
+    uint32_t score;  /**< Summed k-gram frequency (coverage) of the segment. */
+} zxc_dict_seg_t;
+
+/**
+ * @brief Restore the min-heap property at @p root over the range @p a[0..n).
+ *
+ * Sinks @p a[root] down the binary heap (children at @c 2i+1 / @c 2i+2) until
+ * both children are @c >= it, comparing on @ref zxc_dict_seg_t::score. The loop
+ * is iterative (no recursion), so the call stack stays O(1) regardless of @p n.
+ *
+ * @param[in,out] a    Heap-ordered array; @p a[0..n) is treated as the heap.
+ * @param[in]     root Index of the element to sift down. Must be @c < n.
+ * @param[in]     n    Number of valid elements in the heap.
+ *
+ * @note Complexity O(log n).
+ */
+static void zxc_dict_sift_down(zxc_dict_seg_t* RESTRICT a, size_t root, const size_t n) {
+    /* Descend while a left child exists; 2*root+1 is recomputed after every swap. */
+    for (size_t kid = 2 * root + 1; kid < n; kid = 2 * root + 1) {
+        /* Compare against the smaller of the two children (right one may not exist). */
+        if (kid + 1 < n && a[kid + 1].score < a[kid].score) kid++;
+        if (a[root].score <= a[kid].score) return; /* heap order already holds */
+        const zxc_dict_seg_t parent = a[root];
+        a[root] = a[kid];
+        a[kid] = parent;
+        root = kid;
+    }
+}
+
+/**
+ * @brief Sort @p a[0..n) by @ref zxc_dict_seg_t::score in descending order.
+ *
+ * In-place heapsort: a min-heap is built over the whole array, then each
+ * extracted minimum is swapped to the shrinking tail. Because the smallest
+ * scores accumulate at the end, the array is left in descending order
+ * (largest score at index 0), as required by the dictionary fill step.
+ *
+ * Replaces a libc @c qsort call for two reasons:
+ *  - **Freestanding/kernel-safe**: no dependency on @c qsort and no indirect
+ *    comparator call (the @c score comparison is inlined in @ref
+ *    zxc_dict_sift_down).
+ *  - **Deterministic**: ordering is fixed by this code rather than by the
+ *    platform's @c qsort, which matters for reproducible dictionary output
+ *    across libc implementations.
+ *
+ * Equal scores keep an unspecified-but-deterministic relative order, matching
+ * the previous comparator that returned 0 on ties (heapsort is not stable).
+ *
+ * @param[in,out] a Array of @p n segments, sorted in place.
+ * @param[in]     n Number of segments. @c n < 2 is a no-op.
+ *
+ * @note Complexity O(n log n) worst case with no extra allocation. In practice
+ *       this matches or beats @c qsort on the sizes seen here (up to ~65536
+ *       segments): eliminating the per-comparison indirect call outweighs
+ *       heapsort's weaker cache locality. This is a cold path (dictionary
+ *       training), so absolute speed is not critical.
+ */
+static void zxc_dict_sort_segs_desc(zxc_dict_seg_t* RESTRICT a, const size_t n) {
+    if (UNLIKELY(n < 2)) return;
+    for (size_t node = n / 2; node > 0; node--) zxc_dict_sift_down(a, node - 1, n);
+    /* Park the current minimum at `last`, then re-heapify the prefix [0, last). */
+    for (size_t last = n - 1; last > 0; last--) {
+        const zxc_dict_seg_t smallest = a[0];
+        a[0] = a[last];
+        a[last] = smallest;
+        zxc_dict_sift_down(a, 0, last);
+    }
+}
+
+/**
+ * @brief Trains a raw dictionary from sample buffers by k-gram coverage.
+ *
+ * Public API; full contract in @c zxc_dict.h, algorithm in the note above.
+ * Concatenates the samples, counts sampled k-gram frequencies, builds
+ * coverage-scored candidate segments, then greedily fills @p dict_buf in
+ * descending coverage while zeroing each pick's k-grams (so overlapping
+ * patterns aren't copied twice). Picks are emitted in reverse so the
+ * highest-coverage bytes land at the dict's end (smallest match offsets).
+ *
+ * @param[in]  samples        Array of @p n_samples sample buffers (NULL = empty).
+ * @param[in]  sample_sizes   Array of @p n_samples sample lengths.
+ * @param[in]  n_samples      Number of samples.
+ * @param[out] dict_buf       Destination for the trained content.
+ * @param[in]  dict_capacity  Capacity of @p dict_buf (<= @c ZXC_DICT_SIZE_MAX).
+ * @return Trained content length in bytes, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_train_dict(const void* const* RESTRICT samples, const size_t* RESTRICT sample_sizes,
+                       const size_t n_samples, void* RESTRICT dict_buf,
+                       const size_t dict_capacity) {
+    if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !dict_buf || dict_capacity == 0))
+        return ZXC_ERROR_NULL_INPUT;  // LCOV_EXCL_LINE
+    if (UNLIKELY(dict_capacity > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+
+    /* Step 1: concatenate samples (a NULL sample counts as empty) */
+    size_t corpus_size = 0;
+    for (size_t i = 0; i < n_samples; i++) {
+        const size_t sz = samples[i] ? sample_sizes[i] : 0;
+        if (UNLIKELY(sz > SIZE_MAX - corpus_size)) return ZXC_ERROR_MEMORY;  // sum would wrap
+        corpus_size += sz;
+    }
+    if (UNLIKELY(corpus_size < ZXC_DICT_KGRAM_LEN)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    uint8_t* corpus = (uint8_t*)ZXC_MALLOC(corpus_size);
+    if (UNLIKELY(!corpus)) return ZXC_ERROR_MEMORY;
+    size_t pos = 0;
+    for (size_t i = 0; i < n_samples; i++) {
+        if (!samples[i] || sample_sizes[i] == 0) continue;
+        ZXC_MEMCPY(corpus + pos, samples[i], sample_sizes[i]);
+        pos += sample_sizes[i];
+    }
+
+    /* Step 2: count k-gram frequencies */
+    uint16_t* freq = (uint16_t*)ZXC_MALLOC(ZXC_DICT_HASH_SIZE * sizeof(uint16_t));
+    if (UNLIKELY(!freq)) {
+        // LCOV_EXCL_START
+        ZXC_FREE(corpus);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+    ZXC_MEMSET(freq, 0, ZXC_DICT_HASH_SIZE * sizeof(uint16_t));
+
+    /* Count k-gram frequencies on a representative sample of positions, not all
+     * of them: counting a large corpus in full saturates the 16-bit counters,
+     * so the segment-extension test never stops and segments balloon into
+     * filler. Sampling keeps counts unsaturated and spread across the corpus. */
+    const size_t kgram_limit = corpus_size - ZXC_DICT_KGRAM_LEN + 1;
+    size_t freq_stride = kgram_limit / ZXC_DICT_SAMPLE_TARGET;
+    if (freq_stride < 1) freq_stride = 1;
+    for (size_t i = 0; i < kgram_limit; i += freq_stride) {
+        const uint32_t h = zxc_dict_hash(corpus + i);
+        if (freq[h] < UINT16_MAX) freq[h]++;
+    }
+
+    /* Step 3: build candidate segments, each scored by its coverage. Candidate
+     * starts are spread across the whole corpus: a fixed k-gram stride exhausts
+     * the segment budget within the prefix, leaving a large input's later content
+     * unseen. Segments still extend k-gram by k-gram, so they stay contiguous. */
+    const size_t max_segs = corpus_size / ZXC_DICT_KGRAM_LEN;
+    const size_t seg_alloc = (max_segs < ZXC_DICT_MAX_SEGMENTS) ? max_segs : ZXC_DICT_MAX_SEGMENTS;
+    size_t stride = ZXC_DICT_KGRAM_LEN;
+    if (seg_alloc > 0 && corpus_size / seg_alloc > stride) stride = corpus_size / seg_alloc;
+
+    zxc_dict_seg_t* segs = (zxc_dict_seg_t*)ZXC_MALLOC(seg_alloc * sizeof(zxc_dict_seg_t));
+    if (UNLIKELY(!segs)) {
+        // LCOV_EXCL_START
+        ZXC_FREE(freq);
+        ZXC_FREE(corpus);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+
+    size_t n_segs = 0;
+    for (size_t i = 0; i + ZXC_DICT_KGRAM_LEN <= corpus_size && n_segs < seg_alloc; i += stride) {
+        const uint32_t h = zxc_dict_hash(corpus + i);
+        const uint16_t f = freq[h];
+        if (f < 2) continue;
+
+        /* Extend the segment as long as the next k-gram is also frequent, and
+         * accumulate coverage (summed k-gram frequency) as the score. */
+        uint32_t coverage = f;
+        size_t end = i + ZXC_DICT_KGRAM_LEN;
+        while (end + ZXC_DICT_KGRAM_LEN <= corpus_size && end - i < 4096) {
+            const uint16_t nf = freq[zxc_dict_hash(corpus + end)];
+            if (nf < 2) break;
+            coverage += nf;
+            end += ZXC_DICT_KGRAM_LEN;
+        }
+
+        segs[n_segs].offset = (uint32_t)i;  // NOTE(review): truncates if corpus > 4 GiB
+        segs[n_segs].length = (uint16_t)(end - i);
+        segs[n_segs].score = coverage;
+        n_segs++;
+    }
+
+    if (UNLIKELY(n_segs == 0)) {
+        /* No frequent patterns. Use tail of corpus as dict. */
+        const size_t copy = (corpus_size < dict_capacity) ? corpus_size : dict_capacity;
+        ZXC_MEMCPY(dict_buf, corpus + corpus_size - copy, copy);
+        ZXC_FREE(freq);
+        ZXC_FREE(segs);
+        ZXC_FREE(corpus);
+        return (int64_t)copy;
+    }
+
+    /* Step 4: pick segments greedily in descending-coverage order, zeroing each
+     * pick's k-grams so overlapping patterns aren't copied twice. Picks are
+     * compacted in place into segs[0..n_sel); placement is step 5. */
+    zxc_dict_sort_segs_desc(segs, n_segs);
+
+    uint8_t* out = (uint8_t*)dict_buf;
+    size_t n_sel = 0;
+    size_t total = 0;
+
+    for (size_t i = 0; i < n_segs && total < dict_capacity; i++) {
+        const size_t seg_off = segs[i].offset;
+        const size_t seg_end = seg_off + segs[i].length;
+
+        /* Recompute coverage from the current table: skip the segment if earlier
+         * picks have already zeroed more than half of its original score. */
+        uint32_t cur = 0;
+        for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN)
+            cur += freq[zxc_dict_hash(corpus + p)];
+        if (cur * 2 < segs[i].score) continue;
+
+        size_t copy = segs[i].length;
+        if (copy > dict_capacity - total) copy = dict_capacity - total;
+
+        /* One copy in the dictionary serves all future matches: mark this
+         * segment's k-grams as covered so later segments cover new ground. */
+        for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN)
+            freq[zxc_dict_hash(corpus + p)] = 0;
+
+        /* Record the pick (n_sel <= i, so this never clobbers an unread entry). */
+        segs[n_sel].offset = (uint32_t)seg_off;
+        segs[n_sel].length = (uint16_t)copy;
+        n_sel++;
+        total += copy;
+    }
+
+    ZXC_FREE(freq);
+
+    /* Step 5: emit picks in reverse order so the highest-coverage segment ends
+     * up at the END of the dict. The dict sits just before the data, so bytes
+     * nearer its end have the smallest match offset: cheapest to encode and the
+     * last to leave the 16-bit (65535) offset window. No padding: if the picks
+     * don't fill the capacity, the dict is just shorter (tail-padding only added
+     * low-value bytes that raised offsets for everything after them). */
+    size_t filled = 0;
+    for (size_t i = n_sel; i-- > 0;) {
+        ZXC_MEMCPY(out + filled, corpus + segs[i].offset, segs[i].length);
+        filled += segs[i].length;
+    }
+
+    /* Nothing selected (every segment subsumed by earlier picks): fall back to
+     * the corpus tail so the dict is never empty, like the n_segs == 0 path. */
+    if (UNLIKELY(filled == 0)) {
+        const size_t tail = (corpus_size < dict_capacity) ? corpus_size : dict_capacity;
+        ZXC_MEMCPY(out, corpus + corpus_size - tail, tail);
+        filled = tail;
+    }
+
+    ZXC_FREE(segs);
+    ZXC_FREE(corpus);
+    return (int64_t)filled;
+}
+
+/* -------------------------------------------------------------------------
+ *  Shared literal Huffman table training (Tier-2)
+ *
+ *  Compresses the training samples with the freshly trained dictionary at
+ *  level ZXC_LEVEL_DENSITY and accumulates the frequencies of the REAL
+ *  post-LZ literals (via the cctx lit_freq_acc hook in the GLO encoder).
+ *  Raw sample bytes are a poor proxy: LZ matches against the dictionary
+ *  remove most repeated content, so the literal distribution differs
+ *  substantially from the raw byte histogram.
+ *
+ *  Samples are sliced into ZXC_DICT_HUF_TRAIN_BLOCK-byte blocks: the small-
+ *  block regime is where the shared table pays (per-block table headers are
+ *  unaffordable there) and literal density is highest.
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief Trains the shared literal Huffman table for a dictionary (Tier-2).
+ *
+ * Public API; see @c zxc_dict.h and the algorithm note above. Compresses the
+ * samples with @p dict at @c ZXC_LEVEL_DENSITY, accumulating the frequencies of
+ * the REAL post-LZ literals (raw bytes are a poor proxy), then builds and packs
+ * length-limited code lengths. Samples are sliced into
+ * @c ZXC_DICT_HUF_TRAIN_BLOCK blocks, sub-sampled to a fixed budget.
+ *
+ * @param[in]  samples          Array of @p n_samples sample buffers.
+ * @param[in]  sample_sizes     Array of @p n_samples sample lengths.
+ * @param[in]  n_samples        Number of samples.
+ * @param[in]  dict             Trained dictionary content.
+ * @param[in]  dict_size        Dictionary length (<= @c ZXC_DICT_SIZE_MAX).
+ * @param[out] huf_lengths_out  Receives the @c ZXC_HUF_TABLE_SIZE packed lengths.
+ * @return @ref ZXC_OK, or a negative @ref zxc_error_t.
+ */
+int zxc_train_dict_huf(const void* const* RESTRICT samples, const size_t* RESTRICT sample_sizes,
+                       const size_t n_samples, const void* RESTRICT dict, const size_t dict_size,
+                       uint8_t* RESTRICT huf_lengths_out) {
+    if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !dict || dict_size == 0 ||
+                 !huf_lengths_out))
+        return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+
+    const size_t unit_cap = zxc_block_size_ceil(dict_size + ZXC_DICT_HUF_TRAIN_BLOCK);
+    zxc_cctx_t cctx;
+    if (UNLIKELY(zxc_cctx_init(&cctx, unit_cap, 1, ZXC_LEVEL_DENSITY, 0, dict_size) != ZXC_OK))
+        return ZXC_ERROR_MEMORY;  // LCOV_EXCL_LINE
+
+    /* Literal histogram, filled by the encoder through the lit_freq_acc hook. */
+    uint32_t lit_hist[ZXC_HUF_NUM_SYMBOLS] = {0};
+    cctx.lit_freq_acc = lit_hist;
+
+    /* Throwaway destination: the compressed bytes are never read back. */
+    const size_t sink_cap = (size_t)zxc_compress_bound(unit_cap);
+    uint8_t* const sink = (uint8_t*)ZXC_MALLOC(sink_cap);
+    if (UNLIKELY(!sink)) {
+        // LCOV_EXCL_START
+        zxc_cctx_free(&cctx);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+
+    /* [dict | slice] staging area carved by zxc_cctx_init (mode 1). */
+    uint8_t* const staging = cctx.dict_buffer;
+    uint8_t* const slice_dst = staging + dict_size;
+    ZXC_MEMCPY(staging, dict, dict_size);
+
+    /* Sub-sampling: every slice while the corpus fits the budget, otherwise
+     * one slice out of `keep_every`, spread evenly across all samples. */
+    size_t corpus_total = 0;
+    for (size_t s = 0; s < n_samples; s++) corpus_total += sample_sizes[s];
+    size_t keep_every = 1;
+    if (corpus_total > ZXC_DICT_HUF_SAMPLE_BUDGET)
+        keep_every = (corpus_total + ZXC_DICT_HUF_SAMPLE_BUDGET - 1) / ZXC_DICT_HUF_SAMPLE_BUDGET;
+
+    int rc = ZXC_OK;
+    size_t slice_no = 0;
+    for (size_t s = 0; s < n_samples && rc == ZXC_OK; s++) {
+        const uint8_t* const bytes = (const uint8_t*)samples[s];
+        const size_t len = sample_sizes[s];
+        if (!bytes || len == 0) continue;
+
+        for (size_t off = 0; off < len; off += ZXC_DICT_HUF_TRAIN_BLOCK, slice_no++) {
+            if (slice_no % keep_every != 0) continue;
+            size_t take = len - off;
+            if (take > ZXC_DICT_HUF_TRAIN_BLOCK) take = ZXC_DICT_HUF_TRAIN_BLOCK;
+            ZXC_MEMCPY(slice_dst, bytes + off, take);
+            const int res =
+                zxc_compress_chunk_wrapper(&cctx, staging, dict_size + take, sink, sink_cap);
+            if (UNLIKELY(res < 0)) {
+                rc = res;
+                break;
+            }
+        }
+    }
+
+    if (rc == ZXC_OK) {
+        /* A low-entropy corpus can leave no post-LZ literals at all. That is an
+         * empty histogram, not corrupt input, so it is detected directly on the
+         * histogram and answered with an all-zero table: every block then falls
+         * back to its per-block table. Inferring "empty" from a build error code
+         * would risk masking a genuine failure. */
+        int has_literals = 0;
+        for (int sym = 0; sym < ZXC_HUF_NUM_SYMBOLS; sym++) has_literals |= (lit_hist[sym] != 0);
+        if (has_literals) {
+            /* No coverage smoothing: with ZXC_HUF_MAX_CODE_LEN == 8, a code that
+             * covers all 256 symbols can only be the degenerate all-8-bit code
+             * (Kraft equality), which compresses nothing. Symbols unseen during
+             * training stay code-less; a block containing one falls back to its
+             * per-block table at compression time (the encoder's validity check). */
+            uint8_t code_len[ZXC_HUF_NUM_SYMBOLS];
+            rc = zxc_huf_build_code_lengths(lit_hist, code_len, NULL);
+            if (rc == ZXC_OK) zxc_huf_pack_lengths(code_len, huf_lengths_out);
+        } else {
+            ZXC_MEMSET(huf_lengths_out, 0, ZXC_HUF_TABLE_SIZE);
+        }
+    }
+
+    ZXC_FREE(sink);
+    zxc_cctx_free(&cctx);
+    return rc;
+}
+
+/* -------------------------------------------------------------------------
+ *  All-in-one convenience: samples -> ready-to-write .zxd bytes
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief One-shot training: sample buffers in, ready-to-write .zxd bytes out.
+ *
+ * Public API; see @c zxc_dict.h. Convenience wrapper that trains the content
+ * (@ref zxc_train_dict), then the shared Huffman table from that content
+ * (@ref zxc_train_dict_huf), then serializes both via @ref zxc_dict_save. The
+ * two phases are a real data dependency, hidden behind one call.
+ *
+ * @param[in]  samples       Array of @p n_samples sample buffers.
+ * @param[in]  sample_sizes  Array of @p n_samples sample lengths.
+ * @param[in]  n_samples     Number of samples.
+ * @param[out] zxd_buf       Destination .zxd buffer.
+ * @param[in]  zxd_capacity  Capacity of @p zxd_buf in bytes.
+ * @return Bytes written to @p zxd_buf, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_dict_train(const void* const* RESTRICT samples, const size_t* RESTRICT sample_sizes,
+                       const size_t n_samples, void* RESTRICT zxd_buf, const size_t zxd_capacity) {
+    if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !zxd_buf || zxd_capacity == 0))
+        return ZXC_ERROR_NULL_INPUT;
+
+    /* Scratch for the trained content, sized for the largest dictionary allowed.
+     * It is released on every path below, so the function has a single exit. */
+    uint8_t* const content_buf = (uint8_t*)ZXC_MALLOC(ZXC_DICT_SIZE_MAX);
+    if (UNLIKELY(!content_buf)) return ZXC_ERROR_MEMORY;
+
+    int64_t result;
+
+    /* Phase 1: content. A zero-length result is reported as "source too small". */
+    const int64_t content_len =
+        zxc_train_dict(samples, sample_sizes, n_samples, content_buf, ZXC_DICT_SIZE_MAX);
+    if (UNLIKELY(content_len <= 0)) {
+        // LCOV_EXCL_START
+        result = (content_len < 0) ? content_len : ZXC_ERROR_SRC_TOO_SMALL;
+        // LCOV_EXCL_STOP
+    } else {
+        /* Phase 2: shared Huffman table; it needs the trained content because it
+         * histograms the literals left after matching against that content. */
+        uint8_t huf_table[ZXC_HUF_TABLE_SIZE];
+        const int huf_rc = zxc_train_dict_huf(samples, sample_sizes, n_samples, content_buf,
+                                              (size_t)content_len, huf_table);
+        if (UNLIKELY(huf_rc != ZXC_OK)) {
+            // LCOV_EXCL_START
+            result = huf_rc;
+            // LCOV_EXCL_STOP
+        } else {
+            /* Phase 3: serialize content + table into the caller's buffer. */
+            result = zxc_dict_save(content_buf, (size_t)content_len, huf_table, zxd_buf,
+                                   zxd_capacity);
+        }
+    }
+
+    ZXC_FREE(content_buf);
+    return result;
+}
diff --git a/thirdparty/zxc/src/lib/zxc_dispatch.c b/thirdparty/zxc/src/lib/zxc_dispatch.c
new file mode 100644
index 000000000000..fea30df58271
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_dispatch.c
@@ -0,0 +1,1821 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_dispatch.c
+ * @brief Runtime CPU feature detection and SIMD dispatch layer.
+ *
+ * Detects SSE2/AVX2/AVX512/NEON at runtime and routes compress/decompress calls
+ * to the best available implementation via lazy-initialised function pointers.
+ * Also contains the public one-shot buffer API (@ref zxc_compress,
+ * @ref zxc_decompress, @ref zxc_get_decompressed_size).
+ */
+
+#include "../../include/zxc_dict.h"
+#include "../../include/zxc_error.h"
+#include "../../include/zxc_seekable.h"
+#include "zxc_internal.h"
+
+/*
+ * ZXC_DISABLE_SIMD => force ZXC_ONLY_DEFAULT so the dispatcher never selects
+ * an SSE2/AVX2/AVX512/NEON variant.
+ */
+#if defined(ZXC_DISABLE_SIMD) && !defined(ZXC_ONLY_DEFAULT)
+#define ZXC_ONLY_DEFAULT
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#if defined(_M_X64)
+#include <immintrin.h>  // _xgetbv (x86-specific header; x64 AVX state check)
+#endif
+#endif
+
+#if defined(__linux__) && (defined(__arm__) || defined(_M_ARM))
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif
+
+/*
+ * ============================================================================
+ * PROTOTYPES FOR MULTI-VERSIONED VARIANTS
+ * ============================================================================
+ * These are compiled in separate translation units with different flags.
+ */
+
+// Decompression Prototypes
+int zxc_decompress_chunk_wrapper_default(const zxc_cctx_t* RESTRICT ctx,
+                                         const uint8_t* RESTRICT src, const size_t src_sz,
+                                         uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_dict_default(const zxc_cctx_t* RESTRICT ctx,
+                                              const uint8_t* RESTRICT src, const size_t src_sz,
+                                              uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_default(const zxc_cctx_t* RESTRICT ctx,
+                                              const uint8_t* RESTRICT src, const size_t src_sz,
+                                              uint8_t* RESTRICT dst, const size_t dst_cap);
+
+#ifndef ZXC_ONLY_DEFAULT
+#if defined(__x86_64__) || defined(_M_X64)
+int zxc_decompress_chunk_wrapper_avx2(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_dict_avx2(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_avx512(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                        const size_t src_sz, uint8_t* RESTRICT dst,
+                                        const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_dict_avx512(const zxc_cctx_t* RESTRICT ctx,
+                                             const uint8_t* RESTRICT src, const size_t src_sz,
+                                             uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_avx2(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_avx512(const zxc_cctx_t* RESTRICT ctx,
+                                             const uint8_t* RESTRICT src, const size_t src_sz,
+                                             uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_sse2(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_dict_sse2(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_sse2(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+int zxc_decompress_chunk_wrapper_neon(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_dict_neon(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+int zxc_decompress_chunk_wrapper_safe_neon(const zxc_cctx_t* RESTRICT ctx,
+                                           const uint8_t* RESTRICT src, const size_t src_sz,
+                                           uint8_t* RESTRICT dst, const size_t dst_cap);
+#endif
+#endif
+
+// Compression Prototypes
+int zxc_compress_chunk_wrapper_default(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                       const size_t src_sz, uint8_t* RESTRICT dst,
+                                       const size_t dst_cap);
+
+// Huffman Prototypes (variant TUs of zxc_huffman.c). The compressor and
+// decompressor variants resolve their Huffman calls to the matching suffixed
+// symbol at compile time (zero dispatch overhead in the hot path); the thin
+// wrappers below expose the un-suffixed names for tests and external callers.
+int zxc_huf_build_code_lengths_default(const uint32_t* RESTRICT freq, uint8_t* RESTRICT code_len,
+                                       void* RESTRICT scratch);
+int zxc_huf_encode_section_default(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                   const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                   const size_t dst_cap);
+int zxc_huf_decode_section_default(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                   uint8_t* RESTRICT dst, const size_t n_literals);
+int zxc_huf_encode_section_dict_default(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                        const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                        const size_t dst_cap);
+int zxc_huf_decode_section_dict_default(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                        uint8_t* RESTRICT dst, const size_t n_literals,
+                                        const zxc_huf_dec_entry_t* RESTRICT table);
+int zxc_huf_build_dec_table_default(const uint8_t* RESTRICT code_len,
+                                    zxc_huf_dec_entry_t* RESTRICT table);
+void zxc_huf_pack_lengths_default(const uint8_t* RESTRICT code_len, uint8_t* RESTRICT out);
+int zxc_huf_unpack_lengths_default(const uint8_t* RESTRICT in, uint8_t* RESTRICT code_len);
+
+#if defined(__x86_64__) || defined(_M_X64)
+int zxc_compress_chunk_wrapper_avx2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                    const size_t src_sz, uint8_t* RESTRICT dst,
+                                    const size_t dst_cap);
+int zxc_compress_chunk_wrapper_avx512(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+int zxc_compress_chunk_wrapper_sse2(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                    const size_t src_sz, uint8_t* RESTRICT dst,
+                                    const size_t dst_cap);
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+int zxc_compress_chunk_wrapper_neon(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                    const size_t src_sz, uint8_t* RESTRICT dst,
+                                    const size_t dst_cap);
+#endif
+
+/*
+ * ============================================================================
+ * CPU DETECTION LOGIC
+ * ============================================================================
+ */
+
+/**
+ * @enum zxc_cpu_feature_t
+ * @brief Detected CPU SIMD capability level (values are tags, not a ranking).
+ */
+typedef enum {
+    ZXC_CPU_GENERIC = 0, /**< @brief Scalar-only fallback.   */
+    ZXC_CPU_AVX2 = 1,    /**< @brief x86-64 AVX2 available.  */
+    ZXC_CPU_AVX512 = 2,  /**< @brief x86-64 AVX-512F+BW available. */
+    ZXC_CPU_NEON = 3,    /**< @brief ARM NEON available.      */
+    ZXC_CPU_SSE2 = 4     /**< @brief x86 SSE2 available (no AVX2); x86-64 baseline. */
+} zxc_cpu_feature_t;
+
+/**
+ * @brief Probes the running CPU for SIMD support.
+ *
+ * Uses CPUID on x86-64 (MSVC and GCC/Clang paths), `getauxval` on
+ * 32-bit ARM Linux, and compile-time constants on AArch64.
+ *
+ * @return The highest @ref zxc_cpu_feature_t level supported.
+ */
+// LCOV_EXCL_START
+static zxc_cpu_feature_t zxc_detect_cpu_features(void) {
+#ifdef ZXC_ONLY_DEFAULT
+    return ZXC_CPU_GENERIC;
+#else
+    zxc_cpu_feature_t features = ZXC_CPU_GENERIC;
+
+#if defined(__x86_64__) || defined(_M_X64)
+#if defined(_MSC_VER)
+    // AVX2/AVX512 need OS-enabled YMM/ZMM state: gate on OSXSAVE + XGETBV/XCR0,
+    // not CPUID alone (else a VEX/EVEX op faults #UD when the OS hasn't enabled it).
+    int regs[4];  // CPUID output in EAX, EBX, ECX, EDX order
+    int sse2 = 0;
+    int avx2 = 0;
+    int avx512 = 0;
+
+    __cpuid(regs, 1);
+    if (regs[3] & (1 << 26)) sse2 = 1;  // leaf 1 EDX[26]: SSE2
+    if (regs[2] & (1 << 27)) {          // leaf 1 ECX[27]: OSXSAVE (XGETBV usable)
+        const unsigned long long xcr0 = _xgetbv(0);
+        if ((xcr0 & 0x6) == 0x6) {  // XCR0[1..2]: OS saves SSE (XMM) and AVX (YMM) state
+            __cpuidex(regs, 7, 0);
+            if (regs[1] & (1 << 5)) avx2 = 1;  // leaf 7.0 EBX[5]: AVX2
+            // AVX512F (EBX[16]) + AVX512BW (EBX[30]); also needs XCR0[5..7] (opmask/ZMM)
+            if ((regs[1] & (1 << 16)) && (regs[1] & (1 << 30)) && (xcr0 & 0xE0) == 0xE0) avx512 = 1;
+        }
+    }
+
+    if (avx512) {
+        features = ZXC_CPU_AVX512;
+    } else if (avx2) {
+        features = ZXC_CPU_AVX2;
+    } else if (sse2) {
+        features = ZXC_CPU_SSE2;
+    }
+#else
+    // GCC/Clang built-in detection (same AVX512F+BW > AVX2 > SSE2 precedence as above)
+    __builtin_cpu_init();
+
+    if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
+        features = ZXC_CPU_AVX512;
+    } else if (__builtin_cpu_supports("avx2")) {
+        features = ZXC_CPU_AVX2;
+    } else if (__builtin_cpu_supports("sse2")) {
+        features = ZXC_CPU_SSE2;
+    }
+#endif
+
+#elif defined(__aarch64__) || defined(_M_ARM64)
+    // AArch64: NEON is assumed present, so no runtime probe is performed
+    features = ZXC_CPU_NEON;
+
+#elif defined(__arm__) || defined(_M_ARM)
+    // ARM32: runtime detection on Linux via the AT_HWCAP auxiliary vector
+#if defined(__linux__)
+    const unsigned long hwcaps = getauxval(AT_HWCAP);
+    if (hwcaps & HWCAP_NEON) {
+        features = ZXC_CPU_NEON;
+    }
+#else
+// Fallback for non-Linux: rely on compiler flags.
+// If compiled with -mfpu=neon, we assume target supports it.
+// Otherwise, safe default is GENERIC.
+#if defined(__ARM_NEON)
+    features = ZXC_CPU_NEON;
+#endif
+#endif
+#endif
+
+    return features;
+#endif
+}
+// LCOV_EXCL_STOP
+
+/*
+ * ============================================================================
+ * DISPATCHERS
+ * ============================================================================
+ * We use a function pointer initialized on first use (lazy initialization).
+ */
+
+/** @brief Function pointer type for the chunk decompressor (read-only context). */
+typedef int (*zxc_decompress_func_t)(const zxc_cctx_t* RESTRICT, const uint8_t* RESTRICT,
+                                     const size_t, uint8_t* RESTRICT, const size_t);
+/** @brief Function pointer type for the chunk compressor (mutable context). */
+typedef int (*zxc_compress_func_t)(zxc_cctx_t* RESTRICT, const uint8_t* RESTRICT, const size_t,
+                                   uint8_t* RESTRICT, const size_t);
+
+/** @brief Best decompression variant; starts null, stored by zxc_decompress_dispatch_init. */
+static ZXC_ATOMIC zxc_decompress_func_t zxc_decompress_ptr = (zxc_decompress_func_t)0;
+/** @brief Best dict-decompression variant; starts null, stored alongside zxc_decompress_ptr. */
+static ZXC_ATOMIC zxc_decompress_func_t zxc_decompress_dict_ptr = (zxc_decompress_func_t)0;
+/** @brief Lazily-resolved pointer to the best safe-decompression variant. */
+static ZXC_ATOMIC zxc_decompress_func_t zxc_decompress_safe_ptr = (zxc_decompress_func_t)0;
+/** @brief Lazily-resolved pointer to the best compression variant. */
+static ZXC_ATOMIC zxc_compress_func_t zxc_compress_ptr = (zxc_compress_func_t)0;
+
+/**
+ * @brief First-call initialiser for the decompression dispatcher.
+ *
+ * Detects CPU features, selects the best implementation, stores the
+ * pointer atomically, then tail-calls into it.
+ *
+ * @param[in]  ctx      Decompression context (its @c dict_size picks the dict variant).
+ * @param[in]  src      Compressed input chunk.
+ * @param[in]  src_sz   Size of @p src in bytes.
+ * @param[out] dst      Destination buffer for decompressed data.
+ * @param[in]  dst_cap  Capacity of @p dst in bytes.
+ * @return Result of the selected variant: decompressed size, or negative
+ *         @ref zxc_error_t.
+ */
+// LCOV_EXCL_START
+static int zxc_decompress_dispatch_init(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                        const size_t src_sz, uint8_t* RESTRICT dst,
+                                        const size_t dst_cap) {
+    const zxc_cpu_feature_t cpu = zxc_detect_cpu_features();
+
+    /* Start from the portable pair; an ISA-specific pair replaces it below only
+     * when that variant is compiled in and the CPU reports the feature. */
+    zxc_decompress_func_t plain_fn = zxc_decompress_chunk_wrapper_default;
+    zxc_decompress_func_t dict_fn = zxc_decompress_chunk_wrapper_dict_default;
+
+    /* ZXC_ONLY_DEFAULT removes the whole selection below, so the portable pair
+     * is always kept in that configuration. */
+#ifndef ZXC_ONLY_DEFAULT
+#if defined(__x86_64__) || defined(_M_X64)
+    /* x86-64: prefer AVX-512, then AVX2, then SSE2. */
+    if (cpu == ZXC_CPU_AVX512) {
+        plain_fn = zxc_decompress_chunk_wrapper_avx512;
+        dict_fn = zxc_decompress_chunk_wrapper_dict_avx512;
+    } else if (cpu == ZXC_CPU_AVX2) {
+        plain_fn = zxc_decompress_chunk_wrapper_avx2;
+        dict_fn = zxc_decompress_chunk_wrapper_dict_avx2;
+    } else if (cpu == ZXC_CPU_SSE2) {
+        plain_fn = zxc_decompress_chunk_wrapper_sse2;
+        dict_fn = zxc_decompress_chunk_wrapper_dict_sse2;
+    }
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+    /* ARM (32- and 64-bit): the NEON pair, or the portable one. */
+    // cppcheck-suppress knownConditionTrueFalse
+    if (cpu == ZXC_CPU_NEON) {
+        plain_fn = zxc_decompress_chunk_wrapper_neon;
+        dict_fn = zxc_decompress_chunk_wrapper_dict_neon;
+    }
+#endif
+#endif
+
+    /* Not every build configuration reads `cpu` above. */
+    (void)cpu;
+
+    /* Publish both pointers for subsequent calls: release stores when C11
+     * atomics are in use, plain assignments otherwise. */
+#if ZXC_USE_C11_ATOMICS
+    atomic_store_explicit(&zxc_decompress_ptr, plain_fn, memory_order_release);
+    atomic_store_explicit(&zxc_decompress_dict_ptr, dict_fn, memory_order_release);
+#else
+    zxc_decompress_ptr = plain_fn;
+    zxc_decompress_dict_ptr = dict_fn;
+#endif
+
+    /* Serve this first call from the locals just resolved; a non-zero
+     * dict_size selects the dictionary-aware decoder. */
+    if (ctx->dict_size) return dict_fn(ctx, src, src_sz, dst, dst_cap);
+    return plain_fn(ctx, src, src_sz, dst, dst_cap);
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief First-call initialiser for the safe-decompression dispatcher.
+ *
+ * Mirrors @ref zxc_decompress_dispatch_init but selects the `_safe_*`
+ * decoder variants used by @ref zxc_decompress_block_safe.
+ *
+ * @param[in]  ctx      Decompression context.
+ * @param[in]  src      Compressed input chunk.
+ * @param[in]  src_sz   Size of @p src in bytes.
+ * @param[out] dst      Destination buffer (strict: exact uncompressed size).
+ * @param[in]  dst_cap  Capacity of @p dst in bytes.
+ * @return Result of the selected variant: decompressed size, or negative
+ *         @ref zxc_error_t.
+ */
+// LCOV_EXCL_START
+static int zxc_decompress_safe_dispatch_init(const zxc_cctx_t* RESTRICT ctx,
+                                             const uint8_t* RESTRICT src, const size_t src_sz,
+                                             uint8_t* RESTRICT dst, const size_t dst_cap) {
+    const zxc_cpu_feature_t cpu = zxc_detect_cpu_features();
+    /* The scalar `_safe_` decoder is the fallback; it is replaced below only
+     * when a compiled-in accelerated variant matches the detected level. */
+    zxc_decompress_func_t safe_impl = zxc_decompress_chunk_wrapper_safe_default;
+
+#ifndef ZXC_ONLY_DEFAULT
+#if defined(__x86_64__) || defined(_M_X64)
+    /* x86-64 variants. */
+    if (cpu == ZXC_CPU_AVX512) {
+        safe_impl = zxc_decompress_chunk_wrapper_safe_avx512;
+    } else if (cpu == ZXC_CPU_AVX2) {
+        safe_impl = zxc_decompress_chunk_wrapper_safe_avx2;
+    } else if (cpu == ZXC_CPU_SSE2) {
+        safe_impl = zxc_decompress_chunk_wrapper_safe_sse2;
+    }
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+    /* ARM variant. */
+    // cppcheck-suppress knownConditionTrueFalse
+    if (cpu == ZXC_CPU_NEON) safe_impl = zxc_decompress_chunk_wrapper_safe_neon;
+#else
+    (void)cpu; /* other architectures: scalar decoder only */
+#endif
+#else
+    (void)cpu; /* ZXC_ONLY_DEFAULT build: scalar decoder only */
+#endif
+
+    /* Publish the choice so zxc_decompress_chunk_wrapper_safe_public() finds a
+     * non-NULL pointer and no longer routes through this initialiser. */
+#if ZXC_USE_C11_ATOMICS
+    atomic_store_explicit(&zxc_decompress_safe_ptr, safe_impl, memory_order_release);
+#else
+    zxc_decompress_safe_ptr = safe_impl;
+#endif
+
+    return safe_impl(ctx, src, src_sz, dst, dst_cap);
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief First-call initialiser for the compression dispatcher.
+ *
+ * Detects CPU features, selects the best implementation, stores the
+ * pointer atomically, then tail-calls into it.
+ *
+ * @param[in,out] ctx      Compression context.
+ * @param[in]     src      Uncompressed input chunk.
+ * @param[in]     src_sz   Size of @p src in bytes.
+ * @param[out]    dst      Destination buffer for the compressed chunk.
+ * @param[in]     dst_cap  Capacity of @p dst in bytes.
+ * @return Result of the selected variant: compressed size, or negative
+ *         @ref zxc_error_t.
+ */
+// LCOV_EXCL_START
+static int zxc_compress_dispatch_init(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap) {
+    const zxc_cpu_feature_t cpu = zxc_detect_cpu_features();
+    /* The scalar encoder is the fallback; it is replaced below only when a
+     * compiled-in accelerated variant matches the detected feature level. */
+    zxc_compress_func_t enc_impl = zxc_compress_chunk_wrapper_default;
+
+#ifndef ZXC_ONLY_DEFAULT
+#if defined(__x86_64__) || defined(_M_X64)
+    /* x86-64 variants. */
+    if (cpu == ZXC_CPU_AVX512) {
+        enc_impl = zxc_compress_chunk_wrapper_avx512;
+    } else if (cpu == ZXC_CPU_AVX2) {
+        enc_impl = zxc_compress_chunk_wrapper_avx2;
+    } else if (cpu == ZXC_CPU_SSE2) {
+        enc_impl = zxc_compress_chunk_wrapper_sse2;
+    }
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+    /* ARM variant. */
+    // cppcheck-suppress knownConditionTrueFalse
+    if (cpu == ZXC_CPU_NEON) enc_impl = zxc_compress_chunk_wrapper_neon;
+#else
+    (void)cpu; /* other architectures: scalar encoder only */
+#endif
+#else
+    (void)cpu; /* ZXC_ONLY_DEFAULT build: scalar encoder only */
+#endif
+
+    /* Publish the choice so zxc_compress_chunk_wrapper() skips this initialiser. */
+#if ZXC_USE_C11_ATOMICS
+    atomic_store_explicit(&zxc_compress_ptr, enc_impl, memory_order_release);
+#else
+    zxc_compress_ptr = enc_impl;
+#endif
+
+    return enc_impl(ctx, src, src_sz, dst, dst_cap);
+}
+// LCOV_EXCL_STOP
+
+/**
+ * @brief Decompression dispatcher: forwards one chunk to the lazily-resolved decoder.
+ *
+ * While the cached pointer is still NULL the first-call initialiser runs instead.
+ * @param[in]  ctx     Decompression context; a non-zero @c dict_size selects the dict variant.
+ * @param[in]  src     Compressed input chunk (header + payload + optional checksum).
+ * @param[in]  src_sz  Size of @p src in bytes.
+ * @param[out] dst     Destination buffer for decompressed data.
+ * @param[in]  dst_cap Capacity of @p dst.
+ * @return Decompressed size in bytes, or a negative @ref zxc_error_t code.
+ */
+int zxc_decompress_chunk_wrapper(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                 const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) {
+    /* dict_size is constant for a stream: one per-block branch, outside the decode loop. */
+    const int use_dict = (ctx->dict_size != 0);
+#if ZXC_USE_C11_ATOMICS
+    const zxc_decompress_func_t impl =
+        use_dict ? atomic_load_explicit(&zxc_decompress_dict_ptr, memory_order_acquire)
+                 : atomic_load_explicit(&zxc_decompress_ptr, memory_order_acquire);
+#else
+    const zxc_decompress_func_t impl = use_dict ? zxc_decompress_dict_ptr : zxc_decompress_ptr;
+#endif
+    if (UNLIKELY(!impl)) return zxc_decompress_dispatch_init(ctx, src, src_sz, dst, dst_cap);
+    return impl(ctx, src, src_sz, dst, dst_cap);
+}
+
+/**
+ * @brief Internal safe-decompression dispatcher (strict dst_capacity == uncompressed_size).
+ *
+ * Calls the lazily-resolved `_safe_*` variant, running first-call init if needed.
+ *
+ * @param[in]  ctx      Decompression context.
+ * @param[in]  src      Compressed input chunk.
+ * @param[in]  src_sz   Size of @p src in bytes.
+ * @param[out] dst      Destination buffer (capacity == exact uncompressed size).
+ * @param[in]  dst_cap  Capacity of @p dst in bytes.
+ * @return Decompressed size in bytes, or a negative @ref zxc_error_t.
+ */
+static int zxc_decompress_chunk_wrapper_safe_public(const zxc_cctx_t* RESTRICT ctx,
+                                                    const uint8_t* RESTRICT src,
+                                                    const size_t src_sz, uint8_t* RESTRICT dst,
+                                                    const size_t dst_cap) {
+    zxc_decompress_func_t impl; /* cached `_safe_*` decoder; NULL routes to the initialiser */
+#if ZXC_USE_C11_ATOMICS
+    impl = atomic_load_explicit(&zxc_decompress_safe_ptr, memory_order_acquire);
+#else
+    impl = zxc_decompress_safe_ptr;
+#endif
+    return UNLIKELY(!impl) ? zxc_decompress_safe_dispatch_init(ctx, src, src_sz, dst, dst_cap)
+                           : impl(ctx, src, src_sz, dst, dst_cap);
+}
+
+/**
+ * @brief Compression dispatcher: forwards one chunk to the lazily-resolved encoder.
+ * @param[in,out] ctx     Compression context.
+ * @param[in]     src     Uncompressed input chunk.
+ * @param[in]     src_sz  Size of @p src in bytes.
+ * @param[out]    dst     Destination buffer for compressed data.
+ * @param[in]     dst_cap Capacity of @p dst.
+ * @return Compressed size in bytes, or a negative @ref zxc_error_t code.
+ */
+int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                               const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) {
+    zxc_compress_func_t impl; /* cached encoder; NULL routes to the first-call initialiser */
+#if ZXC_USE_C11_ATOMICS
+    impl = atomic_load_explicit(&zxc_compress_ptr, memory_order_acquire);
+#else
+    impl = zxc_compress_ptr;
+#endif
+    return UNLIKELY(!impl) ? zxc_compress_dispatch_init(ctx, src, src_sz, dst, dst_cap)
+                           : impl(ctx, src, src_sz, dst, dst_cap);
+}
+
+/*
+ * ============================================================================
+ * HUFFMAN TRAMPOLINES
+ * ============================================================================
+ * The Huffman codec is built per-variant (default / avx2 / avx512 / neon)
+ * alongside zxc_compress.c and zxc_decompress.c, so the LZ77 stages and the
+ * Huffman stage in a given variant share the same ISA flags (e.g. -mbmi2 on
+ * the AVX2/AVX512 variants). The compress/decompress variant TUs resolve
+ * their Huffman calls to the matching suffixed symbol at compile time, so
+ * the production hot path has zero dispatch overhead.
+ *
+ * These thin wrappers exist only for tests and external callers that link
+ * against the un-suffixed names. They forward to the default (scalar) variant.
+ */
+/**
+ * @brief Derive length-limited per-symbol Huffman code lengths from frequencies.
+ *
+ * Trampoline to @ref zxc_huf_build_code_lengths_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  freq      Per-symbol frequency counts.
+ * @param[out] code_len  Per-symbol code lengths.
+ * @param[in]  scratch   Caller-provided build scratch buffer.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_build_code_lengths(const uint32_t* RESTRICT freq, uint8_t* RESTRICT code_len,
+                               void* RESTRICT scratch) {
+    const int rc = zxc_huf_build_code_lengths_default(freq, code_len, scratch);
+    return rc;
+}
+
+/**
+ * @brief Encode a complete Huffman literal section (lengths header + streams).
+ *
+ * Trampoline to @ref zxc_huf_encode_section_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  literals    Source literal bytes.
+ * @param[in]  n_literals  Number of source bytes.
+ * @param[in]  code_len    Per-symbol code lengths.
+ * @param[out] dst         Destination section buffer.
+ * @param[in]  dst_cap     Capacity of @p dst in bytes.
+ * @return Bytes written on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_encode_section(const uint8_t* RESTRICT literals, const size_t n_literals,
+                           const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                           const size_t dst_cap) {
+    const int rc = zxc_huf_encode_section_default(literals, n_literals, code_len, dst, dst_cap);
+    return rc;
+}
+
+/**
+ * @brief Decode a complete Huffman literal section.
+ *
+ * Trampoline to @ref zxc_huf_decode_section_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  payload       Section payload.
+ * @param[in]  payload_size  Payload length in bytes.
+ * @param[out] dst           Destination buffer.
+ * @param[in]  n_literals    Expected number of decoded bytes.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_decode_section(const uint8_t* RESTRICT payload, const size_t payload_size,
+                           uint8_t* RESTRICT dst, const size_t n_literals) {
+    const int rc = zxc_huf_decode_section_default(payload, payload_size, dst, n_literals);
+    return rc;
+}
+
+/**
+ * @brief Encode a Huffman literal section with no lengths header (shared dict table).
+ *
+ * Trampoline to @ref zxc_huf_encode_section_dict_default; contract in @c zxc_internal.h.
+ * @param[in]  literals    Source literal bytes.
+ * @param[in]  n_literals  Number of source bytes.
+ * @param[in]  code_len    Per-symbol code lengths (from the shared dict table).
+ * @param[out] dst         Destination section buffer.
+ * @param[in]  dst_cap     Capacity of @p dst in bytes.
+ * @return Bytes written on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_encode_section_dict(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                const size_t dst_cap) {
+    const int rc =
+        zxc_huf_encode_section_dict_default(literals, n_literals, code_len, dst, dst_cap);
+    return rc;
+}
+
+/**
+ * @brief Decode a Huffman literal section with a prebuilt shared-dict table.
+ *
+ * Trampoline to @ref zxc_huf_decode_section_dict_default; contract in @c zxc_internal.h.
+ * @param[in]  payload       Section payload.
+ * @param[in]  payload_size  Payload length in bytes.
+ * @param[out] dst           Destination buffer.
+ * @param[in]  n_literals    Expected number of decoded bytes.
+ * @param[in]  table         Prebuilt shared-dict decode table.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_decode_section_dict(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                uint8_t* RESTRICT dst, const size_t n_literals,
+                                const zxc_huf_dec_entry_t* RESTRICT table) {
+    const int rc =
+        zxc_huf_decode_section_dict_default(payload, payload_size, dst, n_literals, table);
+    return rc;
+}
+
+/**
+ * @brief Build the multi-symbol Huffman decode table from code lengths.
+ *
+ * Trampoline to @ref zxc_huf_build_dec_table_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  code_len  Per-symbol code lengths.
+ * @param[out] table     Destination decode table.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on invalid lengths.
+ */
+int zxc_huf_build_dec_table(const uint8_t* RESTRICT code_len, zxc_huf_dec_entry_t* RESTRICT table) {
+    const int rc = zxc_huf_build_dec_table_default(code_len, table);
+    return rc;
+}
+
+/**
+ * @brief Pack per-symbol code lengths into the 128-byte nibble header.
+ *
+ * Trampoline to @ref zxc_huf_pack_lengths_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  code_len  Per-symbol code lengths (one byte each).
+ * @param[out] out       Destination 128-byte packed header.
+ */
+void zxc_huf_pack_lengths(const uint8_t* RESTRICT code_len, uint8_t* RESTRICT out) {
+    /* Pure forwarder: the default variant reports no status to propagate. */
+    zxc_huf_pack_lengths_default(code_len, out);
+}
+
+/**
+ * @brief Unpack and validate a 128-byte packed lengths header.
+ *
+ * Trampoline to @ref zxc_huf_unpack_lengths_default; contract in @c zxc_internal.h.
+ *
+ * @param[in]  in        128-byte packed lengths header.
+ * @param[out] code_len  Destination per-symbol code lengths.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on invalid lengths.
+ */
+int zxc_huf_unpack_lengths(const uint8_t* RESTRICT in, uint8_t* RESTRICT code_len) {
+    const int rc = zxc_huf_unpack_lengths_default(in, code_len);
+    return rc;
+}
+
+/*
+ * ============================================================================
+ * PUBLIC UTILITY API
+ * ============================================================================
+ * These wrapper functions provide a simplified interface by managing context
+ * allocation and looping over blocks. They call the dispatched wrappers above.
+ */
+
+/**
+ * @brief Compresses an entire buffer in one call.
+ *
+ * Manages context allocation internally, loops over blocks, writes the
+ * file header / EOF block / footer, and accumulates the global checksum.
+ *
+ * @param[in]  src              Uncompressed input data.
+ * @param[in]  src_size         Size of @p src in bytes.
+ * @param[out] dst              Destination buffer (use zxc_compress_bound() to size).
+ * @param[in]  dst_capacity     Capacity of @p dst.
+ * @param[in]  opts             Compression options (level, block size, checksum,
+ *                              dictionary, seekable, threads), or NULL for defaults.
+ * @return Total compressed size in bytes, or a negative @ref zxc_error_t code.
+ */
+// cppcheck-suppress unusedFunction
+int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* RESTRICT dst,
+                     const size_t dst_capacity, const zxc_compress_opts_t* opts) {
+    if (UNLIKELY(!dst || dst_capacity == 0 || (src_size > 0 && !src))) return ZXC_ERROR_NULL_INPUT;
+
+    const int checksum_enabled = opts ? opts->checksum_enabled : 0;
+    const int seekable = opts ? opts->seekable : 0;
+    const int level = (opts && opts->level > 0) ? opts->level : ZXC_LEVEL_DEFAULT;
+    const size_t block_size =
+        (opts && opts->block_size > 0) ? opts->block_size : ZXC_BLOCK_SIZE_DEFAULT;
+    const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL;
+    const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0;
+    const uint8_t* dict_huf = (opts && opts->dict) ? (const uint8_t*)opts->dict_huf : NULL;
+
+    if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    const uint32_t did = (dict && dict_size > 0) ? zxc_dict_id(dict, dict_size, dict_huf) : 0;
+
+    const uint8_t* ip = (const uint8_t*)src;
+    uint8_t* op = (uint8_t*)dst;
+    const uint8_t* op_start = op;
+    const uint8_t* op_end = op + dst_capacity;
+    uint32_t global_hash = 0;  /* rolling combination of the per-block checksums */
+    zxc_cctx_t ctx;
+
+    const size_t eff_chunk =
+        dict_size > 0 ? zxc_block_size_ceil(dict_size + block_size) : block_size;
+    // LCOV_EXCL_START
+    if (UNLIKELY(zxc_cctx_init(&ctx, eff_chunk, 1, level, checksum_enabled, dict_size) != ZXC_OK))
+        return ZXC_ERROR_MEMORY;
+    // LCOV_EXCL_STOP
+    if (UNLIKELY(zxc_cctx_attach_dict_huf(&ctx, dict_huf) != ZXC_OK)) {
+        // LCOV_EXCL_START
+        zxc_cctx_free(&ctx);
+        return ZXC_ERROR_CORRUPT_DATA;
+        // LCOV_EXCL_STOP
+    }
+
+    /* Dict input buffer: [dict_content | block_data] for the encoder, carved
+     * into the cctx workspace (NULL when no dictionary is active). */
+    uint8_t* const dict_input = ctx.dict_buffer;
+    if (dict_input) ZXC_MEMCPY(dict_input, dict, dict_size);
+
+    const int h_val =
+        zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, did);
+    // LCOV_EXCL_START
+    if (UNLIKELY(h_val < 0)) {
+        zxc_cctx_free(&ctx);
+        return h_val;
+    }
+    // LCOV_EXCL_STOP
+    op += h_val;
+
+    /* Seekable: per-block compressed sizes, pre-sized to src_size / block_size + 2 entries */
+    uint32_t* seek_comp = NULL;
+    uint32_t seek_count = 0;
+    uint32_t seek_cap = 0;
+    if (seekable) {
+        const size_t block_count = src_size / block_size;
+        if (UNLIKELY(block_count > (size_t)UINT32_MAX - 2)) {
+            // LCOV_EXCL_START
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_BAD_BLOCK_SIZE;
+            // LCOV_EXCL_STOP
+        }
+        seek_cap = (uint32_t)(block_count + 2);
+        seek_comp = (uint32_t*)ZXC_MALLOC(seek_cap * sizeof(uint32_t));
+        // LCOV_EXCL_START
+        if (UNLIKELY(!seek_comp)) {
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_MEMORY;
+        }
+        // LCOV_EXCL_STOP
+    }
+
+    size_t pos = 0;  /* bytes of src consumed so far */
+    while (pos < src_size) {
+        const size_t chunk_len = (src_size - pos > block_size) ? block_size : (src_size - pos);
+        const size_t rem_cap = (size_t)(op_end - op);
+
+        int res;  /* compressed size of this block, or a negative error code */
+        if (dict_input) {
+            ZXC_MEMCPY(dict_input + dict_size, ip + pos, chunk_len);
+            res = zxc_compress_chunk_wrapper(&ctx, dict_input, dict_size + chunk_len, op, rem_cap);
+        } else {
+            res = zxc_compress_chunk_wrapper(&ctx, ip + pos, chunk_len, op, rem_cap);
+        }
+        if (UNLIKELY(res < 0)) {
+            ZXC_FREE(seek_comp);
+            zxc_cctx_free(&ctx);
+            return res;
+        }
+
+        if (checksum_enabled) {
+            // Fold this block's checksum into the running global hash.
+            // It is read from the last ZXC_GLOBAL_CHECKSUM_SIZE bytes written for the block.
+            if (LIKELY(res >= ZXC_GLOBAL_CHECKSUM_SIZE)) {
+                const uint32_t block_hash = zxc_le32(op + res - ZXC_GLOBAL_CHECKSUM_SIZE);
+                global_hash = zxc_hash_combine_rotate(global_hash, block_hash);
+            }
+        }
+
+        /* Seekable: record compressed block size */
+        if (seekable) {
+            // LCOV_EXCL_START
+            if (UNLIKELY(seek_count >= seek_cap)) {
+                seek_cap = seek_cap * 2;
+                uint32_t* nc = (uint32_t*)ZXC_REALLOC(seek_comp, seek_cap * sizeof(uint32_t));
+                if (UNLIKELY(!nc)) {
+                    ZXC_FREE(seek_comp);
+                    zxc_cctx_free(&ctx);
+                    return ZXC_ERROR_MEMORY;
+                }
+                seek_comp = nc;
+            }
+            // LCOV_EXCL_STOP
+            seek_comp[seek_count] = (uint32_t)res;
+            seek_count++;
+        }
+
+        op += res;
+        pos += chunk_len;
+    }
+
+    zxc_cctx_free(&ctx);  /* block loop done: only framing is written below */
+
+    // Write the EOF block: a bare block header of type ZXC_BLOCK_EOF with comp_size 0
+    const size_t rem_cap = (size_t)(op_end - op);
+    const zxc_block_header_t eof_bh = {
+        .block_type = ZXC_BLOCK_EOF, .block_flags = 0, .reserved = 0, .comp_size = 0};
+    const int eof_val = zxc_write_block_header(op, rem_cap, &eof_bh);
+    // LCOV_EXCL_START
+    if (UNLIKELY(eof_val < 0)) {
+        ZXC_FREE(seek_comp);
+        return eof_val;
+    }
+    // LCOV_EXCL_STOP
+    op += eof_val;
+
+    /* Seekable: write seek table between EOF block and footer */
+    if (seekable && seek_count > 0) {
+        const size_t st_cap = (size_t)(op_end - op);
+        const int64_t st_val = zxc_write_seek_table(op, st_cap, seek_comp, seek_count);
+        ZXC_FREE(seek_comp);
+        if (UNLIKELY(st_val < 0)) return st_val;  // LCOV_EXCL_LINE
+        op += st_val;
+    } else {
+        ZXC_FREE(seek_comp);
+    }
+
+    if (UNLIKELY((size_t)(op_end - op) < ZXC_FILE_FOOTER_SIZE))
+        return ZXC_ERROR_DST_TOO_SMALL;  // LCOV_EXCL_LINE
+
+    // Write 12-byte Footer: [Source Size (8)] + [Global Hash (4)]
+    const int footer_val =
+        zxc_write_file_footer(op, (size_t)(op_end - op), src_size, global_hash, checksum_enabled);
+    if (UNLIKELY(footer_val < 0)) return footer_val;  // LCOV_EXCL_LINE
+    op += footer_val;
+
+    return (int64_t)(op - op_start);
+}
+
+/**
+ * @brief Decompresses an entire buffer in one call.
+ *
+ * Validates the file header and footer, loops over compressed blocks,
+ * and verifies the global checksum when enabled.
+ *
+ * @param[in]  src              Compressed input data.
+ * @param[in]  src_size         Size of @p src in bytes.
+ * @param[out] dst              Destination buffer for decompressed data.
+ * @param[in]  dst_capacity     Capacity of @p dst.
+ * @param[in]  opts             Decompression options (checksum verification,
+ *                              dictionary, threads), or NULL for defaults.
+ * @return Total decompressed size in bytes, or a negative @ref zxc_error_t code.
+ */
+// cppcheck-suppress unusedFunction
+int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RESTRICT dst,
+                       const size_t dst_capacity, const zxc_decompress_opts_t* opts) {
+    if (UNLIKELY(!src || src_size < ZXC_FILE_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE ||
+                 (!dst && dst_capacity != 0)))
+        return ZXC_ERROR_NULL_INPUT;
+
+    if (UNLIKELY(!dst || dst_capacity == 0)) {
+        /* Empty-frame case (stored size == 0). */
+        if (UNLIKELY(zxc_le32(src) != ZXC_MAGIC_WORD)) return ZXC_ERROR_NULL_INPUT;
+        const uint8_t* footer = (const uint8_t*)src + src_size - ZXC_FILE_FOOTER_SIZE;
+        return (zxc_le64(footer) == 0) ? 0 : (int64_t)ZXC_ERROR_DST_TOO_SMALL;
+    }
+
+    const int checksum_enabled = opts ? opts->checksum_enabled : 0;
+    const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL;
+    const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0;
+    const uint8_t* dict_huf = (opts && opts->dict) ? (const uint8_t*)opts->dict_huf : NULL;
+
+    const uint8_t* ip = (const uint8_t*)src;
+    const uint8_t* ip_end = ip + src_size;
+    uint8_t* op = (uint8_t*)dst;
+    const uint8_t* op_start = op;
+    const uint8_t* op_end = op + dst_capacity;
+    size_t runtime_chunk_size = 0;
+    zxc_cctx_t ctx;
+
+    int file_has_checksums = 0;
+    uint32_t header_dict_id = 0;
+    if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums,
+                                      &header_dict_id) != ZXC_OK ||
+                 zxc_cctx_init(&ctx, runtime_chunk_size, 0, 0,
+                               file_has_checksums && checksum_enabled, dict_size) != ZXC_OK)) {
+        return ZXC_ERROR_BAD_HEADER;
+    }
+
+    /* Dictionary validation */
+    if (header_dict_id != 0) {
+        if (UNLIKELY(!dict || dict_size == 0)) {
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_DICT_REQUIRED;
+        }
+        if (UNLIKELY(zxc_dict_id(dict, dict_size, dict_huf) != header_dict_id)) {
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_DICT_MISMATCH;
+        }
+    }
+    if (UNLIKELY(zxc_cctx_attach_dict_huf(&ctx, dict_huf) != ZXC_OK)) {
+        // LCOV_EXCL_START
+        zxc_cctx_free(&ctx);
+        return ZXC_ERROR_CORRUPT_DATA;
+        // LCOV_EXCL_STOP
+    }
+
+    ip += ZXC_FILE_HEADER_SIZE;
+
+    const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD;
+
+    /* Dict decode buffer: [dict_content | decode_space + PAD], carved into the
+     * cctx workspace (NULL when no dictionary is active). */
+    uint8_t* const dict_dec = ctx.dict_buffer;
+    if (dict_dec) ZXC_MEMCPY(dict_dec, dict, dict_size);
+
+    // Block decompression loop
+    uint32_t global_hash = 0;
+    while (ip < ip_end) {
+        const size_t rem_src = (size_t)(ip_end - ip);
+        zxc_block_header_t bh;
+        // Read the block header to determine the compressed size
+        if (UNLIKELY(zxc_read_block_header(ip, rem_src, &bh) != ZXC_OK)) {
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_BAD_HEADER;
+        }
+
+        // Handle EOF block separately (not a real chunk to decompress)
+        if (UNLIKELY(bh.block_type == ZXC_BLOCK_EOF)) {
+            // EOF carries no payload; a non-zero comp_size is a malformed header.
+            if (UNLIKELY(bh.comp_size != 0)) {
+                zxc_cctx_free(&ctx);
+                return ZXC_ERROR_BAD_HEADER;
+            }
+            // Footer is always the last ZXC_FILE_FOOTER_SIZE bytes of the source, even with a
+            // seek table after the EOF block; the entry check guarantees src_size covers it.
+            const uint8_t* const footer = (const uint8_t*)src + src_size - ZXC_FILE_FOOTER_SIZE;
+
+            // Validate source size matches what we decompressed
+            const uint64_t stored_size = zxc_le64(footer);
+            if (UNLIKELY(stored_size != (uint64_t)(op - op_start))) {
+                zxc_cctx_free(&ctx);
+                return ZXC_ERROR_CORRUPT_DATA;
+            }
+
+            // Validate global checksum if enabled and file has checksums
+            if (checksum_enabled && file_has_checksums) {
+                const uint32_t stored_hash = zxc_le32(footer + sizeof(uint64_t));
+                if (UNLIKELY(stored_hash != global_hash)) {
+                    zxc_cctx_free(&ctx);
+                    return ZXC_ERROR_BAD_CHECKSUM;
+                }
+            }
+            zxc_cctx_free(&ctx);
+            return (int64_t)(op - op_start);  // verified EOF: the only successful exit
+        }
+
+        int res;
+        const size_t rem_cap = (size_t)(op_end - op);
+        if (dict_dec) {
+            /* Dict path: decode into bounce buffer with dict prefix so match
+             * copies that reference dict content resolve naturally. */
+            res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, dict_dec + dict_size, work_sz);
+            if (LIKELY(res > 0)) {
+                if (UNLIKELY((size_t)res > rem_cap)) {
+                    // LCOV_EXCL_START
+                    zxc_cctx_free(&ctx);
+                    return ZXC_ERROR_DST_TOO_SMALL;
+                    // LCOV_EXCL_STOP
+                }
+                ZXC_MEMCPY(op, dict_dec + dict_size, (size_t)res);
+            }
+        } else if (LIKELY(rem_cap >= work_sz)) {
+            // Fast path: decode directly into dst. Cap dst_cap to chunk_size + PAD
+            res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, op, work_sz);
+        } else {
+            // Safe path: decode into bounce buffer, then copy exact result.
+            res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, ctx.work_buf, ctx.work_buf_cap);
+            if (LIKELY(res > 0)) {
+                // LCOV_EXCL_START
+                if (UNLIKELY((size_t)res > rem_cap)) {
+                    zxc_cctx_free(&ctx);
+                    return ZXC_ERROR_DST_TOO_SMALL;
+                }
+                // LCOV_EXCL_STOP
+                ZXC_MEMCPY(op, ctx.work_buf, (size_t)res);
+            }
+        }
+        if (UNLIKELY(res < 0)) {
+            zxc_cctx_free(&ctx);
+            return res;
+        }
+
+        /* A block's header + payload + optional checksum must lie inside the remaining input. */
+        const size_t blk_tail = file_has_checksums ? ZXC_BLOCK_CHECKSUM_SIZE : 0;
+        if (UNLIKELY(rem_src < ZXC_BLOCK_HEADER_SIZE + blk_tail ||
+                     bh.comp_size > rem_src - ZXC_BLOCK_HEADER_SIZE - blk_tail)) {
+            zxc_cctx_free(&ctx);
+            return ZXC_ERROR_SRC_TOO_SMALL;
+        }
+        // Update global hash from block checksum
+        if (checksum_enabled && file_has_checksums) {
+            const uint32_t block_hash = zxc_le32(ip + ZXC_BLOCK_HEADER_SIZE + bh.comp_size);
+            global_hash = zxc_hash_combine_rotate(global_hash, block_hash);
+        }
+
+        ip += ZXC_BLOCK_HEADER_SIZE + bh.comp_size + blk_tail;
+        op += res;
+    }
+    // Input ran out without an EOF block: truncated stream, footer never verified.
+    zxc_cctx_free(&ctx);
+    return ZXC_ERROR_CORRUPT_DATA;
+}
+
+/**
+ * @brief Reads the decompressed size from a ZXC-compressed buffer.
+ *
+ * The size is stored in the file footer (last @ref ZXC_FILE_FOOTER_SIZE bytes).
+ *
+ * @param[in] src      Compressed data (a NULL pointer is reported as an error).
+ * @param[in] src_size Size of @p src in bytes.
+ * @return Original uncompressed size, or 0 on error (NULL, too short, or bad magic).
+ */
+uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size) {
+    /* NULL guard mirrors zxc_get_dict_id(): never dereference a missing buffer. */
+    if (UNLIKELY(!src || src_size < ZXC_FILE_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE)) return 0;
+    const uint8_t* const p = (const uint8_t*)src;
+    if (UNLIKELY(zxc_le32(p) != ZXC_MAGIC_WORD)) return 0;
+
+    const uint8_t* const footer = p + src_size - ZXC_FILE_FOOTER_SIZE;
+    return zxc_le64(footer);
+}
+
+/**
+ * @brief Extracts the dictionary id recorded in an archive's file header.
+ *
+ * Public API; see @c zxc_buffer.h. Only the header is inspected: the magic
+ * word is checked, then the flags byte decides whether a @c dict_id is
+ * reported. No block is decoded.
+ *
+ * @param[in] src       Start of the compressed archive (>= @c ZXC_FILE_HEADER_SIZE).
+ * @param[in] src_size  Size of @p src in bytes.
+ * @return The dictionary id, or 0 when @p src is NULL, too short, carries a
+ *         bad magic, or does not have the dictionary flag set.
+ */
+// cppcheck-suppress unusedFunction
+uint32_t zxc_get_dict_id(const void* src, const size_t src_size) {
+    if (UNLIKELY(src == NULL)) return 0;
+    if (UNLIKELY(src_size < ZXC_FILE_HEADER_SIZE)) return 0;
+
+    const uint8_t* const hdr = (const uint8_t*)src;
+    const int magic_ok = (zxc_le32(hdr) == ZXC_MAGIC_WORD);
+    if (UNLIKELY(!magic_ok)) return 0;
+
+    /* Byte 6 holds the file flags; the id is read from byte 7 onwards. */
+    const int has_dict = (hdr[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) != 0;
+    if (!has_dict) return 0;
+    return zxc_le32(hdr + 7);
+}
+
+/*
+ * ============================================================================
+ * REUSABLE CONTEXT API (Opaque)
+ * ============================================================================
+ *
+ * Provides heap-allocated, opaque contexts that integrators can reuse across
+ * multiple compress / decompress calls, eliminating per-call malloc/free
+ * overhead.
+ */
+
+/* --- Compression --------------------------------------------------------- */
+
+/**
+ * @brief Opaque reusable compression context (public handle @ref zxc_cctx).
+ *
+ * Wraps one internal @ref zxc_cctx_t plus the sticky options and bookkeeping
+ * needed to reuse buffers across calls and re-init only when the block size
+ * changes. Heap instances are zero-filled by @ref zxc_create_cctx; static
+ * instances are placed at the start of the caller's workspace by
+ * @ref zxc_init_static_cctx.
+ */
+struct zxc_cctx_s {
+    zxc_cctx_t inner;       /* existing internal context */
+    int initialized;        /* 1 if inner has live allocations */
+    int owns_workspace;     /* NB: despite the name, non-zero means the CALLER owns
+                               the memory.
+                               0 = library-allocated (free in zxc_free_cctx),
+                               1 = caller-supplied static workspace (no-op free,
+                               block_size pinned at init) */
+    size_t last_block_size; /* block size used for last init */
+    /* Sticky options (remembered from create or last compress call). They are
+     * the fallbacks when a call passes NULL opts or a zero level / block_size. */
+    int stored_level;
+    int stored_checksum;
+    size_t stored_block_size;
+};
+
+/**
+ * @brief Allocates a reusable compression context on the heap.
+ *
+ * Public API; full contract in @c zxc_buffer.h. The handle is zero-filled and
+ * its sticky settings are resolved from @p opts (falling back to
+ * @c ZXC_LEVEL_DEFAULT / @c ZXC_BLOCK_SIZE_DEFAULT / checksum off). When
+ * @p opts is non-NULL the inner buffers are initialised immediately; when it is
+ * NULL that work is left to the first @ref zxc_compress_cctx call.
+ *
+ * @param[in] opts  Initial compression options, or NULL to defer allocation.
+ * @return A context to release with @ref zxc_free_cctx, or NULL on allocation
+ *         failure or invalid @p opts.
+ */
+zxc_cctx* zxc_create_cctx(const zxc_compress_opts_t* opts) {
+    zxc_cctx* const handle = (zxc_cctx*)ZXC_CALLOC(1, sizeof(zxc_cctx));
+    if (UNLIKELY(!handle)) return NULL;  // LCOV_EXCL_LINE
+
+    /* Start from the library defaults; they are the whole story for NULL opts. */
+    handle->stored_level = ZXC_LEVEL_DEFAULT;
+    handle->stored_block_size = ZXC_BLOCK_SIZE_DEFAULT;
+    handle->stored_checksum = 0;
+    if (!opts) return handle;
+
+    /* Non-positive level / zero block_size mean "keep the default". */
+    if (opts->level > 0) handle->stored_level = opts->level;
+    if (opts->block_size > 0) handle->stored_block_size = opts->block_size;
+    handle->stored_checksum = opts->checksum_enabled;
+
+    // LCOV_EXCL_START
+    if (UNLIKELY(!zxc_validate_block_size(handle->stored_block_size))) {
+        ZXC_FREE(handle);
+        return NULL;
+    }
+    if (UNLIKELY(zxc_cctx_init(&handle->inner, handle->stored_block_size, 1, handle->stored_level,
+                               handle->stored_checksum, 0) != ZXC_OK)) {
+        ZXC_FREE(handle);
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+
+    handle->last_block_size = handle->stored_block_size;
+    handle->initialized = 1;
+    return handle;
+}
+
+/**
+ * @brief Destroys a reusable compression context.
+ *
+ * Public API; see @c zxc_buffer.h. For a heap context the inner buffers (if
+ * any were initialised) and then the handle itself are released. NULL is
+ * accepted. A static context is left untouched because both the handle and its
+ * buffers sit in the caller's workspace.
+ *
+ * @param[in] cctx  Context from @ref zxc_create_cctx (may be NULL).
+ */
+void zxc_free_cctx(zxc_cctx* cctx) {
+    if (UNLIKELY(cctx == NULL)) return;
+
+    /* Only library-allocated contexts are ours to release. */
+    const int heap_allocated = !cctx->owns_workspace;
+    if (heap_allocated) {
+        if (cctx->initialized) zxc_cctx_free(&cctx->inner);
+        ZXC_FREE(cctx);
+    }
+}
+
+/**
+ * @brief Compresses a whole buffer into a framed archive, reusing @p cctx.
+ *
+ * Public API; full contract in @c zxc_buffer.h. Resolves per-call options over
+ * the context's sticky defaults, re-initialises the inner buffers only when the
+ * block size changes (level / checksum update in place), then writes the file
+ * header, the compressed blocks, the EOF block and the footer.
+ *
+ * The resolved level / block size / checksum flag are stored back into the
+ * context and become the defaults for the next call. Any dictionary prefix
+ * length left in the inner context by an earlier @ref zxc_compress_block call
+ * is cleared, because this path encodes plain source chunks.
+ *
+ * @param[in,out] cctx          Reusable compression context.
+ * @param[in]     src           Source bytes.
+ * @param[in]     src_size      Number of source bytes (must be > 0).
+ * @param[out]    dst           Destination buffer for the archive.
+ * @param[in]     dst_capacity  Capacity of @p dst in bytes.
+ * @param[in]     opts          Per-call option overrides, or NULL for the
+ *                              context defaults.
+ * @return Archive size in bytes on success, or a negative @ref zxc_error_t
+ *         (@c ZXC_ERROR_NULL_INPUT for NULL / empty arguments,
+ *         @c ZXC_ERROR_BAD_BLOCK_SIZE for an invalid block size or a block
+ *         size change on a static context, @c ZXC_ERROR_MEMORY when the inner
+ *         re-init fails, or whatever the block / header / footer writers
+ *         report).
+ */
+int64_t zxc_compress_cctx(zxc_cctx* cctx, const void* RESTRICT src, const size_t src_size,
+                          void* RESTRICT dst, const size_t dst_capacity,
+                          const zxc_compress_opts_t* opts) {
+    if (UNLIKELY(!cctx)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(!src || !dst || src_size == 0 || dst_capacity == 0)) return ZXC_ERROR_NULL_INPUT;
+
+    const int checksum_enabled = opts ? opts->checksum_enabled : cctx->stored_checksum;
+    const int level = (opts && opts->level > 0) ? opts->level : cctx->stored_level;
+    const size_t block_size =
+        (opts && opts->block_size > 0) ? opts->block_size : cctx->stored_block_size;
+
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    /* Static cctx: block_size is locked at workspace init.  Reject any opts
+     * that would force a re-partition, since the workspace cannot grow.
+     * level / checksum_enabled may still vary per call. */
+    if (UNLIKELY(cctx->owns_workspace && block_size != cctx->last_block_size))
+        return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    cctx->stored_level = level;
+    cctx->stored_block_size = block_size;
+    cctx->stored_checksum = checksum_enabled;
+
+    /* Re-init only when block_size changed (it drives buffer sizes). */
+    if (UNLIKELY(!cctx->initialized || cctx->last_block_size != block_size)) {
+        if (cctx->initialized) {
+            // LCOV_EXCL_START
+            zxc_cctx_free(&cctx->inner);
+            cctx->initialized = 0;
+            // LCOV_EXCL_STOP
+        }
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&cctx->inner, block_size, 1, level, checksum_enabled, 0) !=
+                     ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        cctx->last_block_size = block_size;
+        cctx->initialized = 1;
+    } else {
+        /* Same block_size: update level + checksum without realloc. */
+        cctx->inner.compression_level = level;
+        cctx->inner.checksum_enabled = checksum_enabled;
+    }
+
+    /* zxc_compress_block stores the caller's dictionary length in
+     * inner.dict_size and nothing resets it when the block size is unchanged.
+     * The chunks fed below carry no [dict | block] prefix, so clear it here
+     * (zxc_decompress_block_safe does the same for the decode context). */
+    cctx->inner.dict_size = 0;
+
+    zxc_cctx_t* const ctx = &cctx->inner;
+
+    uint8_t* op = (uint8_t*)dst;
+    const uint8_t* const op_start = op;
+    const uint8_t* const op_end = op + dst_capacity;
+    const uint8_t* const ip = (const uint8_t*)src;
+    uint32_t global_hash = 0;
+
+    const int h_val =
+        zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, 0);
+    if (UNLIKELY(h_val < 0)) return h_val;  // LCOV_EXCL_LINE
+    op += h_val;
+
+    /* Encode the source one block_size chunk at a time; the last chunk may be
+     * shorter. */
+    size_t pos = 0;
+    while (pos < src_size) {
+        const size_t chunk_len = (src_size - pos > block_size) ? block_size : (src_size - pos);
+        const size_t rem_cap = (size_t)(op_end - op);
+
+        const int res = zxc_compress_chunk_wrapper(ctx, ip + pos, chunk_len, op, rem_cap);
+        if (UNLIKELY(res < 0)) return res;
+
+        if (checksum_enabled) {
+            /* Fold the 32-bit value found in the last bytes of the block just
+             * written into the running archive hash. */
+            if (LIKELY(res >= ZXC_GLOBAL_CHECKSUM_SIZE)) {
+                const uint32_t block_hash = zxc_le32(op + res - ZXC_GLOBAL_CHECKSUM_SIZE);
+                global_hash = zxc_hash_combine_rotate(global_hash, block_hash);
+            }
+        }
+
+        op += res;
+        pos += chunk_len;
+    }
+
+    /* EOF block */
+    const size_t rem_cap = (size_t)(op_end - op);
+    const zxc_block_header_t eof_bh = {
+        .block_type = ZXC_BLOCK_EOF, .block_flags = 0, .reserved = 0, .comp_size = 0};
+    const int eof_val = zxc_write_block_header(op, rem_cap, &eof_bh);
+    if (UNLIKELY(eof_val < 0)) return eof_val;  // LCOV_EXCL_LINE
+    op += eof_val;
+
+    /* rem_cap was sampled before the EOF header, so it must cover both the
+     * header just written and the footer that follows. */
+    if (UNLIKELY(rem_cap < (size_t)eof_val + ZXC_FILE_FOOTER_SIZE))
+        return ZXC_ERROR_DST_TOO_SMALL;  // LCOV_EXCL_LINE
+
+    const int footer_val =
+        zxc_write_file_footer(op, (size_t)(op_end - op), src_size, global_hash, checksum_enabled);
+    if (UNLIKELY(footer_val < 0)) return footer_val;  // LCOV_EXCL_LINE
+    op += footer_val;
+
+    return (int64_t)(op - op_start);
+}
+
+/* --- Decompression ------------------------------------------------------- */
+
+/**
+ * @brief Opaque reusable decompression context (public handle @ref zxc_dctx).
+ *
+ * Reuses the internal @ref zxc_cctx_t type for decode, tracking the last block
+ * and dict sizes so the inner buffers are re-carved only when they change.
+ * A zero-filled instance (as produced by @ref zxc_create_dctx) is valid:
+ * initialized == 0 makes the first decode call initialise the inner context.
+ */
+struct zxc_dctx_s {
+    zxc_cctx_t inner;       /* reuses the same internal context type */
+    size_t last_block_size; /* block size from last header parse */
+    size_t last_dict_size;  /* dict_size the inner buffer was carved for (drives re-init) */
+    int initialized;        /* 1 if inner has live allocations */
+    int owns_workspace;     /* NB: despite the name, non-zero means the CALLER owns
+                               the memory.
+                               0 = library-allocated (free in zxc_free_dctx),
+                               1 = caller-supplied static workspace (no-op free,
+                               block_size pinned at init) */
+};
+
+/**
+ * @brief Allocates a reusable decompression context on the heap.
+ *
+ * Public API; see @c zxc_buffer.h. Only the zero-filled handle is allocated
+ * here; the inner buffers are set up by the first decode call, once the
+ * required block size is known.
+ *
+ * @return A context to release with @ref zxc_free_dctx, or NULL on allocation
+ *         failure.
+ */
+zxc_dctx* zxc_create_dctx(void) {
+    /* Zero fill leaves initialized == 0, which defers the inner allocation. */
+    return (zxc_dctx*)ZXC_CALLOC(1, sizeof(zxc_dctx));
+}
+
+/**
+ * @brief Destroys a reusable decompression context.
+ *
+ * Public API; see @c zxc_buffer.h. For a heap context the inner buffers (if
+ * initialised) and then the handle are released. NULL is accepted, and a
+ * static (caller-workspace) context is left untouched.
+ *
+ * @param[in] dctx  Context from @ref zxc_create_dctx (may be NULL).
+ */
+void zxc_free_dctx(zxc_dctx* dctx) {
+    if (UNLIKELY(dctx == NULL)) return;
+
+    /* Handle and buffers of a static dctx belong to the caller's workspace,
+     * so only library-allocated contexts are released. */
+    const int heap_allocated = !dctx->owns_workspace;
+    if (heap_allocated) {
+        if (dctx->initialized) zxc_cctx_free(&dctx->inner);
+        ZXC_FREE(dctx);
+    }
+}
+
+/**
+ * @brief Decompresses a framed archive into @p dst, reusing @p dctx.
+ *
+ * Public API; full contract in @c zxc_buffer.h. Parses the file header,
+ * re-initialises the inner buffers only when the block size changes (or a prior
+ * dict call left a prefix), then decodes each block - straight into @p dst when
+ * the tail padding fits, otherwise through a bounce buffer - and verifies the
+ * footer size and optional checksum.
+ *
+ * Checksums are verified only when both the caller asks for it through
+ * @p opts and the archive header says checksums are present.
+ *
+ * @param[in,out] dctx          Reusable decompression context.
+ * @param[in]     src           Compressed archive bytes.
+ * @param[in]     src_size      Archive size (>= @c ZXC_FILE_HEADER_SIZE).
+ * @param[out]    dst           Destination for the decompressed output.
+ * @param[in]     dst_capacity  Capacity of @p dst in bytes.
+ * @param[in]     opts          Per-call options (e.g. checksum), or NULL.
+ * @return Decompressed size in bytes on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size_t src_size,
+                            void* RESTRICT dst, const size_t dst_capacity,
+                            const zxc_decompress_opts_t* opts) {
+    if (UNLIKELY(!dctx || !src || !dst || src_size < ZXC_FILE_HEADER_SIZE))
+        return ZXC_ERROR_NULL_INPUT;
+
+    /* NULL opts means "do not verify checksums". */
+    const int checksum_enabled = opts ? opts->checksum_enabled : 0;
+
+    const uint8_t* ip = (const uint8_t*)src;
+    const uint8_t* const ip_end = ip + src_size;
+    uint8_t* op = (uint8_t*)dst;
+    const uint8_t* const op_start = op;
+    const uint8_t* const op_end = op + dst_capacity;
+    size_t runtime_chunk_size = 0;
+    int file_has_checksums = 0;
+    uint32_t global_hash = 0;
+
+    /* The header supplies the archive's block size and its checksum flag. */
+    if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums,
+                                      NULL) != ZXC_OK))
+        return ZXC_ERROR_BAD_HEADER;
+
+    /* Static dctx: block_size is locked at workspace init; reject any
+     * archive whose declared block_size would require a re-partition. */
+    if (UNLIKELY(dctx->owns_workspace && runtime_chunk_size != dctx->last_block_size))
+        return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    /* Re-init when block size changed, or when a prior dict-using call (block
+     * API) left the inner context carrying a dict prefix. */
+    if (UNLIKELY(!dctx->initialized || dctx->last_block_size != runtime_chunk_size ||
+                 dctx->last_dict_size != 0)) {
+        if (dctx->initialized) {
+            // LCOV_EXCL_START
+            zxc_cctx_free(&dctx->inner);
+            dctx->initialized = 0;
+            // LCOV_EXCL_STOP
+        }
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&dctx->inner, runtime_chunk_size, 0, 0,
+                                   file_has_checksums && checksum_enabled, 0) != ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        dctx->last_block_size = runtime_chunk_size;
+        dctx->last_dict_size = 0;
+        dctx->initialized = 1;
+    } else {
+        /* Buffers are reusable as-is; only the checksum mode can differ. */
+        dctx->inner.checksum_enabled = file_has_checksums && checksum_enabled;
+    }
+
+    zxc_cctx_t* const ctx = &dctx->inner;
+    ip += ZXC_FILE_HEADER_SIZE;
+
+    /* work_buf was pre-sized to runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD
+     * inside the matching zxc_cctx_init call above; the re-init guard ensures
+     * it stays in sync when chunk_size changes between calls. */
+    const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD;
+
+    /* NOTE(review): if the input runs out exactly on a block boundary without
+     * an EOF block, the loop ends and the bytes decoded so far are returned
+     * without any footer verification - confirm this is intended. */
+    while (ip < ip_end) {
+        const size_t rem_src = (size_t)(ip_end - ip);
+        zxc_block_header_t bh;
+        if (UNLIKELY(zxc_read_block_header(ip, rem_src, &bh) != ZXC_OK))
+            return ZXC_ERROR_BAD_HEADER;
+
+        if (UNLIKELY(bh.block_type == ZXC_BLOCK_EOF)) {
+            /* The EOF block has no payload and is followed by the footer. */
+            if (UNLIKELY(bh.comp_size != 0)) return ZXC_ERROR_BAD_HEADER;
+            if (UNLIKELY(rem_src < ZXC_BLOCK_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE))
+                return ZXC_ERROR_SRC_TOO_SMALL;
+
+            /* Footer starts with the 64-bit original size; it must match what
+             * was actually produced. */
+            const uint8_t* const footer = ip + ZXC_BLOCK_HEADER_SIZE;
+            const uint64_t stored_size = zxc_le64(footer);
+            if (UNLIKELY(stored_size != (uint64_t)(op - op_start))) return ZXC_ERROR_CORRUPT_DATA;
+
+            if (checksum_enabled && file_has_checksums) {
+                /* The archive-wide hash follows the size field. */
+                const uint32_t stored_hash = zxc_le32(footer + sizeof(uint64_t));
+                if (UNLIKELY(stored_hash != global_hash)) return ZXC_ERROR_BAD_CHECKSUM;
+            }
+            break;
+        }
+
+        const size_t rem_cap = (size_t)(op_end - op);
+        int res;
+        if (LIKELY(rem_cap >= work_sz)) {
+            // Fast path: decode directly into dst (enough padding for wild copies).
+            res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, op, rem_cap);
+        } else {
+            // Safe path: decode into bounce buffer, then copy exact result.
+            res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, ctx->work_buf, ctx->work_buf_cap);
+            if (LIKELY(res > 0)) {
+                if (UNLIKELY((size_t)res > rem_cap))
+                    return ZXC_ERROR_DST_TOO_SMALL;  // LCOV_EXCL_LINE
+                ZXC_MEMCPY(op, ctx->work_buf, (size_t)res);
+            }
+        }
+        if (UNLIKELY(res < 0)) return res;
+
+        if (checksum_enabled && file_has_checksums) {
+            /* The per-block hash sits right after the block payload; fold it
+             * into the running archive hash compared against the footer. */
+            const uint32_t block_hash = zxc_le32(ip + ZXC_BLOCK_HEADER_SIZE + bh.comp_size);
+            global_hash = zxc_hash_combine_rotate(global_hash, block_hash);
+        }
+
+        /* Advance past header + payload (+ trailing block checksum when the
+         * archive carries them, whether or not it was verified). */
+        ip += ZXC_BLOCK_HEADER_SIZE + bh.comp_size +
+              (file_has_checksums ? ZXC_BLOCK_CHECKSUM_SIZE : 0);
+        op += res;
+    }
+
+    return (int64_t)(op - op_start);
+}
+
+/* ========================================================================= */
+/*  Block-Level API (no file framing)                                        */
+/* ========================================================================= */
+
+/**
+ * @brief Compresses a single block (no file framing), reusing @p cctx.
+ *
+ * Public API; full contract in @c zxc_buffer.h. Produces one format-conformant
+ * block with no header / EOF / footer, so @p src_size must not exceed
+ * @c ZXC_BLOCK_SIZE_MAX (use the frame or streaming APIs for larger inputs).
+ * With a dictionary in @p opts, [dict | block] is assembled in the cctx-owned
+ * bounce buffer before encoding. Inner buffers are re-initialised when the
+ * effective block size or the dictionary size differs from the previous call.
+ *
+ * The resolved level, effective block size and checksum flag are stored back
+ * into the context as its sticky defaults.
+ *
+ * @param[in,out] cctx          Reusable compression context.
+ * @param[in]     src           Source block bytes.
+ * @param[in]     src_size      Source length (0 < @p src_size <= @c ZXC_BLOCK_SIZE_MAX).
+ * @param[out]    dst           Destination buffer for the block payload.
+ * @param[in]     dst_capacity  Capacity of @p dst in bytes.
+ * @param[in]     opts          Per-call options (level, dict, ...), or NULL.
+ * @return Block payload size in bytes on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_t src_size,
+                           void* RESTRICT dst, const size_t dst_capacity,
+                           const zxc_compress_opts_t* opts) {
+    if (UNLIKELY(!cctx || !src || !dst || src_size == 0 || dst_capacity == 0))
+        return ZXC_ERROR_NULL_INPUT;
+
+    /* Block API processes a single format-conformant block: src_size must not
+     * exceed ZXC_BLOCK_SIZE_MAX. Callers with larger inputs should use the
+     * frame or streaming APIs which chunk transparently. */
+    if (UNLIKELY(src_size > ZXC_BLOCK_SIZE_MAX)) return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    const int checksum_enabled = opts ? opts->checksum_enabled : cctx->stored_checksum;
+    const int level = (opts && opts->level > 0) ? opts->level : cctx->stored_level;
+    /* Requested block size (per-call override or sticky default). It is only a
+     * lower bound: min_bs below raises it when src_size needs more room. */
+    const size_t block_size =
+        (opts && opts->block_size > 0) ? opts->block_size : cctx->stored_block_size;
+    const size_t min_bs = zxc_block_size_ceil(src_size);
+
+    /* Always ensure internal buffers can hold src_size.
+     * When a dictionary is active, offset_bits must accommodate dict + block. */
+    const uint8_t* b_dict = opts ? (const uint8_t*)opts->dict : NULL;
+    const size_t b_dict_size = (opts && opts->dict) ? opts->dict_size : 0;
+    const size_t base_block_size = (block_size > min_bs) ? block_size : min_bs;
+    const size_t effective_block_size =
+        b_dict_size > 0 ? zxc_block_size_ceil(b_dict_size + base_block_size) : base_block_size;
+
+    cctx->stored_level = level;
+    cctx->stored_block_size = effective_block_size;
+    cctx->stored_checksum = checksum_enabled;
+
+    /* Re-init when the block size changed, or when the dictionary size differs
+     * from the one recorded in the inner context: zxc_cctx_init receives
+     * dict_size, so buffers set up for another (possibly zero) dictionary
+     * size must not be reused for the [dict | block] copy below. The decode
+     * side applies the same rule through zxc_dctx_s::last_dict_size.
+     * NOTE(review): unlike zxc_compress_cctx, this path does not consult
+     * owns_workspace before re-initialising - confirm static contexts are
+     * safe here. */
+    if (UNLIKELY(!cctx->initialized || cctx->last_block_size != effective_block_size ||
+                 cctx->inner.dict_size != b_dict_size)) {
+        if (cctx->initialized) {
+            // LCOV_EXCL_START
+            zxc_cctx_free(&cctx->inner);
+            cctx->initialized = 0;
+            // LCOV_EXCL_STOP
+        }
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&cctx->inner, effective_block_size, 1, level, checksum_enabled,
+                                   b_dict_size) != ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        cctx->last_block_size = effective_block_size;
+        cctx->initialized = 1;
+    } else {
+        /* Buffers are reusable as-is: refresh level + checksum in place. */
+        cctx->inner.compression_level = level;
+        cctx->inner.checksum_enabled = checksum_enabled;
+    }
+
+    /* Record the dictionary length used by this call in the inner context. */
+    cctx->inner.dict_size = b_dict_size;
+
+    int res;
+    if (b_dict && b_dict_size > 0) {
+        /* [dict | block] assembled in the cctx-owned dict_buffer */
+        uint8_t* const combined = cctx->inner.dict_buffer;
+        ZXC_MEMCPY(combined, b_dict, b_dict_size);
+        ZXC_MEMCPY(combined + b_dict_size, src, src_size);
+        res = zxc_compress_chunk_wrapper(&cctx->inner, combined, b_dict_size + src_size,
+                                         (uint8_t*)dst, dst_capacity);
+    } else {
+        res = zxc_compress_chunk_wrapper(&cctx->inner, (const uint8_t*)src, src_size, (uint8_t*)dst,
+                                         dst_capacity);
+    }
+    if (UNLIKELY(res < 0)) return res;
+    return (int64_t)res;
+}
+
+/**
+ * @brief Decodes one unframed block into @p dst, reusing @p dctx.
+ *
+ * Public API; full contract in @c zxc_buffer.h. A single format-conformant
+ * block decodes to at most @c ZXC_BLOCK_SIZE_MAX bytes, hence the upper bound
+ * of @c ZXC_BLOCK_SIZE_MAX + @c ZXC_DECOMPRESS_TAIL_PAD on @p dst_capacity.
+ * Three output routes exist:
+ *   - dictionary given: decode behind the dictionary copy in @c dict_buffer,
+ *     then copy the result out;
+ *   - no dictionary and @p dst has room for the tail padding: decode in place;
+ *   - otherwise: decode into @c work_buf, then copy the result out.
+ *
+ * @param[in,out] dctx          Reusable decompression context.
+ * @param[in]     src           Compressed block bytes.
+ * @param[in]     src_size      Source length (>= @c ZXC_BLOCK_HEADER_SIZE).
+ * @param[out]    dst           Destination for the decoded payload.
+ * @param[in]     dst_capacity  Capacity of @p dst in bytes.
+ * @param[in]     opts          Per-call options (dict, checksum), or NULL.
+ * @return Decoded payload size in bytes on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_decompress_block(zxc_dctx* dctx, const void* RESTRICT src, const size_t src_size,
+                             void* RESTRICT dst, const size_t dst_capacity,
+                             const zxc_decompress_opts_t* opts) {
+    if (UNLIKELY(!dctx || !src || !dst || src_size < ZXC_BLOCK_HEADER_SIZE || dst_capacity == 0))
+        return ZXC_ERROR_NULL_INPUT;
+
+    /* One block never decodes to more than ZXC_BLOCK_SIZE_MAX bytes; the extra
+     * ZXC_DECOMPRESS_TAIL_PAD is the slack allowed for wild copies. Larger
+     * outputs belong to the frame or streaming APIs. */
+    if (UNLIKELY(dst_capacity > ZXC_BLOCK_SIZE_MAX + ZXC_DECOMPRESS_TAIL_PAD))
+        return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    int checksum_enabled = 0;
+    const uint8_t* dict = NULL;
+    size_t dict_size = 0;
+    if (opts) {
+        checksum_enabled = opts->checksum_enabled;
+        dict = (const uint8_t*)opts->dict;
+        if (opts->dict) dict_size = opts->dict_size;
+    }
+
+    /* The block size is inferred from dst_capacity (callers know the original
+     * size). */
+    const size_t block_size = zxc_block_size_ceil(dst_capacity);
+    const int reusable = dctx->initialized && dctx->last_block_size == block_size &&
+                         dctx->last_dict_size == dict_size;
+    if (LIKELY(reusable)) {
+        dctx->inner.checksum_enabled = checksum_enabled;
+    } else {
+        if (dctx->initialized) {
+            zxc_cctx_free(&dctx->inner);
+            dctx->initialized = 0;
+        }
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&dctx->inner, block_size, 0, 0, checksum_enabled, dict_size) !=
+                     ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        dctx->last_block_size = block_size;
+        dctx->last_dict_size = dict_size;
+        dctx->initialized = 1;
+    }
+
+    zxc_cctx_t* const ctx = &dctx->inner;
+    ctx->dict_size = dict_size;
+
+    /* Matches the work_buf size requested from zxc_cctx_init above:
+     * block_size + ZXC_DECOMPRESS_TAIL_PAD. */
+    const size_t padded_sz = block_size + ZXC_DECOMPRESS_TAIL_PAD;
+
+    /* Pick the decode target; `bounced` records whether the result must be
+     * copied into dst afterwards. */
+    uint8_t* out = (uint8_t*)dst;
+    size_t out_cap = dst_capacity;
+    int bounced = 0;
+    if (dict && dict_size > 0) {
+        /* [dict | decode] assembled in the cctx-owned dict_buffer */
+        ZXC_MEMCPY(ctx->dict_buffer, dict, dict_size);
+        out = ctx->dict_buffer + dict_size;
+        out_cap = padded_sz;
+        bounced = 1;
+    } else if (UNLIKELY(dst_capacity < padded_sz)) {
+        /* dst cannot absorb wild copies: go through work_buf instead. */
+        out = ctx->work_buf;
+        out_cap = ctx->work_buf_cap;
+        bounced = 1;
+    }
+
+    const int res = zxc_decompress_chunk_wrapper(ctx, (const uint8_t*)src, src_size, out, out_cap);
+    if (bounced && LIKELY(res > 0)) {
+        if (UNLIKELY((size_t)res > dst_capacity)) return ZXC_ERROR_DST_TOO_SMALL;
+        ZXC_MEMCPY(dst, out, (size_t)res);
+    }
+    if (UNLIKELY(res < 0)) return res;
+    return (int64_t)res;
+}
+
+/**
+ * @brief Strict-tail block decompressor: @p dst_capacity may equal the exact
+ *        uncompressed size.
+ *
+ * Public API; full contract in @c zxc_buffer.h. Calls with a dictionary and
+ * blocks whose first byte is @c ZXC_BLOCK_RAW are forwarded to
+ * @ref zxc_decompress_block. Everything else is decoded by the strict safe
+ * decoder directly into @p dst (no bounce buffer, no
+ * +ZXC_DECOMPRESS_TAIL_PAD).
+ *
+ * @param[in,out] dctx          Reusable decompression context.
+ * @param[in]     src           Compressed block bytes.
+ * @param[in]     src_size      Source length (>= @c ZXC_BLOCK_HEADER_SIZE).
+ * @param[out]    dst           Destination for the decoded payload.
+ * @param[in]     dst_capacity  Exact uncompressed size (<= @c ZXC_BLOCK_SIZE_MAX).
+ * @param[in]     opts          Per-call options (dict, checksum), or NULL.
+ * @return Decoded payload size in bytes on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_decompress_block_safe(zxc_dctx* dctx, const void* RESTRICT src, const size_t src_size,
+                                  void* RESTRICT dst, const size_t dst_capacity,
+                                  const zxc_decompress_opts_t* opts) {
+    if (UNLIKELY(!dctx || !src || !dst || src_size < ZXC_BLOCK_HEADER_SIZE || dst_capacity == 0))
+        return ZXC_ERROR_NULL_INPUT;
+
+    /* Exact-size contract: no tail padding is included in the bound. */
+    if (UNLIKELY(dst_capacity > ZXC_BLOCK_SIZE_MAX)) return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    /* Two cases are handed to the bounce-capable API:
+     *  - a dictionary, which needs the [dict|payload] buffer;
+     *  - a RAW block, which never wild-writes past dst_capacity.
+     * The block type is the first source byte and is only read when no
+     * dictionary is in play. */
+    const int has_dict = (opts && opts->dict && opts->dict_size > 0);
+    if (has_dict || ((const uint8_t*)src)[0] == ZXC_BLOCK_RAW) {
+        return zxc_decompress_block(dctx, src, src_size, dst, dst_capacity, opts);
+    }
+
+    /* GLO/GHI: use the strict-tail decoder (no bounce buffer required). */
+    const int checksum_enabled = opts ? opts->checksum_enabled : 0;
+    const size_t block_size = zxc_block_size_ceil(dst_capacity);
+    const int reusable =
+        dctx->initialized && dctx->last_block_size == block_size && dctx->last_dict_size == 0;
+    if (LIKELY(reusable)) {
+        dctx->inner.checksum_enabled = checksum_enabled;
+    } else {
+        if (dctx->initialized) {
+            zxc_cctx_free(&dctx->inner);
+            dctx->initialized = 0;
+        }
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&dctx->inner, block_size, 0, 0, checksum_enabled, 0) != ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        dctx->last_block_size = block_size;
+        dctx->last_dict_size = 0;
+        dctx->initialized = 1;
+    }
+    /* This path never uses a dictionary prefix. */
+    dctx->inner.dict_size = 0;
+
+    const int decoded = zxc_decompress_chunk_wrapper_safe_public(
+        &dctx->inner, (const uint8_t*)src, src_size, (uint8_t*)dst, dst_capacity);
+    return (decoded < 0) ? (int64_t)decoded : (int64_t)decoded;
+}
+
+/*
+ * ============================================================================
+ * STATIC CONTEXT API (caller-allocated workspace)
+ * ============================================================================
+ * Places the public handle struct at the start of the workspace, then carves
+ * the persistent buffer (via zxc_cctx_init_in_workspace) in the remaining
+ * cache-line-aligned tail.  The caller owns the whole workspace; free
+ * functions become no-ops via the owns_workspace flag.
+ */
+
+/* Size occupied by the opaque handle at the start of the workspace, rounded
+ * up to a cache-line boundary so the persistent buffer (which expects 64 B
+ * alignment for the hot zones) starts aligned.
+ * One macro per handle type: compression (zxc_cctx_s) and decompression
+ * (zxc_dctx_s). The rounding only yields an aligned buffer start when the
+ * workspace pointer itself is cache-line aligned. */
+#define ZXC_STATIC_CCTX_HDR_SIZE ZXC_ALIGN_CL(sizeof(struct zxc_cctx_s))
+#define ZXC_STATIC_DCTX_HDR_SIZE ZXC_ALIGN_CL(sizeof(struct zxc_dctx_s))
+
+/**
+ * @brief Reports how large a workspace a static compression context needs.
+ *
+ * Public API; see @c zxc_buffer.h. The result is the cache-line-aligned handle
+ * header plus the persistent buffer size computed for @p block_size and
+ * @p level - the same quantity @ref zxc_init_static_cctx checks against.
+ * Nothing is allocated.
+ *
+ * @param[in] block_size  Block size the context will be pinned to.
+ * @param[in] level       Compression level (@c ZXC_LEVEL_FASTEST ..
+ *                        @c ZXC_LEVEL_DENSITY).
+ * @return Required workspace size in bytes, or 0 if the parameters are invalid.
+ */
+size_t zxc_static_cctx_workspace_size(const size_t block_size, const int level) {
+    const int params_ok = zxc_validate_block_size(block_size) && level >= ZXC_LEVEL_FASTEST &&
+                          level <= ZXC_LEVEL_DENSITY;
+    if (UNLIKELY(!params_ok)) return 0;
+
+    /* A zero inner size signals an error and is passed through as 0. */
+    const size_t buffers_sz = zxc_cctx_compute_workspace_size(block_size, 1, level, 0);
+    return (buffers_sz != 0) ? (ZXC_STATIC_CCTX_HDR_SIZE + buffers_sz) : 0;
+}
+
+/**
+ * @brief Builds a compression context inside memory supplied by the caller.
+ *
+ * Public API; full contract in @c zxc_buffer.h. The opaque handle is written
+ * at the start of @p workspace and the persistent buffer is carved from the
+ * bytes after the cache-line-aligned header, so no heap allocation happens
+ * here. The block size stays pinned for the context's lifetime and
+ * @ref zxc_free_cctx does nothing for such a context (the caller owns
+ * @p workspace).
+ *
+ * @param[in] workspace       Caller buffer (>= @ref zxc_static_cctx_workspace_size).
+ * @param[in] workspace_size  Capacity of @p workspace in bytes.
+ * @param[in] opts            Compression options (non-NULL: level, block_size,
+ *                            checksum).
+ * @return A ready context owned by @p workspace, or NULL on invalid input or an
+ *         undersized workspace.
+ */
+zxc_cctx* zxc_init_static_cctx(void* RESTRICT workspace, const size_t workspace_size,
+                               const zxc_compress_opts_t* RESTRICT opts) {
+    if (UNLIKELY(workspace == NULL || opts == NULL)) return NULL;
+
+    /* Non-positive level / zero block_size select the library defaults. */
+    int level = ZXC_LEVEL_DEFAULT;
+    if (opts->level > 0) level = opts->level;
+    size_t block_size = ZXC_BLOCK_SIZE_DEFAULT;
+    if (opts->block_size > 0) block_size = opts->block_size;
+    const int checksum_enabled = opts->checksum_enabled;
+
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return NULL;
+    if (UNLIKELY(level < ZXC_LEVEL_FASTEST || level > ZXC_LEVEL_DENSITY)) return NULL;
+
+    /* The workspace must hold the aligned handle header plus the buffers. */
+    const size_t buffers_sz = zxc_cctx_compute_workspace_size(block_size, 1, level, 0);
+    if (UNLIKELY(buffers_sz == 0)) return NULL;
+    if (UNLIKELY(workspace_size < ZXC_STATIC_CCTX_HDR_SIZE + buffers_sz)) return NULL;
+
+    zxc_cctx* const handle = (zxc_cctx*)workspace;
+    ZXC_MEMSET(handle, 0, sizeof(*handle));
+
+    uint8_t* const buffers = (uint8_t*)workspace + ZXC_STATIC_CCTX_HDR_SIZE;
+    const int rc = zxc_cctx_init_in_workspace(&handle->inner, buffers, buffers_sz, block_size, 1,
+                                              level, checksum_enabled, 0);
+    if (UNLIKELY(rc != ZXC_OK)) return NULL;
+
+    /* Sticky defaults mirror the options the workspace was carved for. */
+    handle->stored_level = level;
+    handle->stored_block_size = block_size;
+    handle->stored_checksum = checksum_enabled;
+    handle->last_block_size = block_size;
+    handle->initialized = 1;
+    handle->owns_workspace = 1;
+    return handle;
+}
+
+/**
+ * @brief Reports how large a workspace a static decompression context needs.
+ *
+ * Public API; see @c zxc_buffer.h. The result is the cache-line-aligned handle
+ * header plus the persistent buffer size computed for @p block_size - the same
+ * quantity @ref zxc_init_static_dctx checks against. Nothing is allocated.
+ *
+ * @param[in] block_size  Block size the context will be pinned to.
+ * @return Required workspace size in bytes, or 0 if @p block_size is invalid.
+ */
+size_t zxc_static_dctx_workspace_size(const size_t block_size) {
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return 0;
+
+    /* A zero inner size signals an error and is passed through as 0. */
+    const size_t buffers_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0, 0);
+    return (buffers_sz != 0) ? (ZXC_STATIC_DCTX_HDR_SIZE + buffers_sz) : 0;
+}
+
+/**
+ * @brief Builds a decompression context inside memory supplied by the caller.
+ *
+ * Public API; full contract in @c zxc_buffer.h. The opaque handle is written
+ * at the start of @p workspace and the persistent buffer is carved from the
+ * bytes after the cache-line-aligned header, so no heap allocation happens
+ * here. The block size is pinned, so decoded archives must match it;
+ * @ref zxc_free_dctx does nothing for such a context (the caller owns
+ * @p workspace).
+ *
+ * @param[in] workspace       Caller buffer (>= @ref zxc_static_dctx_workspace_size).
+ * @param[in] workspace_size  Capacity of @p workspace in bytes.
+ * @param[in] block_size      Block size to pin the context to.
+ * @return A ready context owned by @p workspace, or NULL on invalid input or an
+ *         undersized workspace.
+ */
+zxc_dctx* zxc_init_static_dctx(void* RESTRICT workspace, const size_t workspace_size,
+                               const size_t block_size) {
+    if (UNLIKELY(workspace == NULL)) return NULL;
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return NULL;
+
+    /* The workspace must hold the aligned handle header plus the buffers. */
+    const size_t buffers_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0, 0);
+    if (UNLIKELY(buffers_sz == 0)) return NULL;
+    if (UNLIKELY(workspace_size < ZXC_STATIC_DCTX_HDR_SIZE + buffers_sz)) return NULL;
+
+    zxc_dctx* const handle = (zxc_dctx*)workspace;
+    ZXC_MEMSET(handle, 0, sizeof(*handle));
+
+    /* mode == 0 init: checksum_enabled is updated per-call from the file
+     * header flags, so it does not need to be locked at workspace init. */
+    uint8_t* const buffers = (uint8_t*)workspace + ZXC_STATIC_DCTX_HDR_SIZE;
+    const int rc =
+        zxc_cctx_init_in_workspace(&handle->inner, buffers, buffers_sz, block_size, 0, 0, 0, 0);
+    if (UNLIKELY(rc != ZXC_OK)) return NULL;
+
+    /* last_dict_size stays 0 from the memset above. */
+    handle->last_block_size = block_size;
+    handle->initialized = 1;
+    handle->owns_workspace = 1;
+    return handle;
+}
diff --git a/thirdparty/zxc/src/lib/zxc_driver.c b/thirdparty/zxc/src/lib/zxc_driver.c
new file mode 100644
index 000000000000..307c338807a9
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_driver.c
@@ -0,0 +1,1257 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_driver.c
+ * @brief Userspace @c FILE*-flavored driver: multi-threaded streaming and
+ *        the seekable @c FILE* open helper.
+ *
+ * Two distinct subsystems live in this translation unit because they share
+ * the same userspace-only host requirements (@c <stdio.h>, threading, and
+ * platform file-descriptor extraction): keeping them together means a
+ * single TU to exclude when building for kernel / freestanding targets.
+ *
+ *   1. Streaming engine: a ring-buffer producer / worker / consumer
+ *      pipeline that parallelises block processing over @c FILE* streams.
+ *      Public API: @ref zxc_stream_compress, @ref zxc_stream_decompress,
+ *      @ref zxc_stream_get_decompressed_size.
+ *
+ *   2. Seekable @c FILE* wrapper: builds a @ref zxc_reader_t whose
+ *      @c read_at uses @c pread / @c ReadFile on the file descriptor
+ *      extracted from a @c FILE*, then delegates to
+ *      @ref zxc_seekable_open_reader.  Public API:
+ *      @ref zxc_seekable_open_file.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "../../include/zxc_buffer.h"
+#include "../../include/zxc_dict.h"
+#include "../../include/zxc_error.h"
+#include "../../include/zxc_seekable.h"
+#include "../../include/zxc_stream.h"
+#include "zxc_internal.h"
+
+/*
+ * ============================================================================
+ * WINDOWS THREADING EMULATION
+ * ============================================================================
+ * Maps POSIX pthread calls to Windows Native API (CriticalSection,
+ * ConditionVariable, Threads). Allows the same threading logic to compile on
+ * Linux/macOS and Windows.
+ */
+#if defined(_WIN32)
+#include <io.h> /* _get_osfhandle, _fileno (used by zxc_seekable_open_file) */
+#include <malloc.h>
+#include <process.h>
+#include <sys/types.h>
+#include <windows.h>
+
+// Map the POSIX fseeko/ftello names used by the code below to the MSVC CRT _fseeki64/_ftelli64
+#define fseeko _fseeki64
+#define ftello _ftelli64
+
+/**
+ * @brief Queries Windows for the processor count; this is what the @c sysconf
+ *        replacement macro for this platform expands to.
+ * @return The @c dwNumberOfProcessors value filled in by @c GetSystemInfo,
+ *         converted to @c int.
+ */
+static int zxc_get_num_procs(void) {
+    SYSTEM_INFO info;
+    GetSystemInfo(&info);
+    return (int)info.dwNumberOfProcessors;
+}
+
+/* pthread type aliases: a mutex is a CRITICAL_SECTION, a condition variable
+ * is a CONDITION_VARIABLE and a thread is a raw HANDLE. */
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef HANDLE pthread_t;
+
+/* Mutex shims. The attribute argument (a) is accepted for source
+ * compatibility and dropped. NOTE(review): these expand straight to the Win32
+ * calls, so they do not produce POSIX-style 0-on-success results; use them as
+ * plain statements only. */
+#define pthread_mutex_init(m, a) InitializeCriticalSection(m)
+#define pthread_mutex_destroy(m) DeleteCriticalSection(m)
+#define pthread_mutex_lock(m) EnterCriticalSection(m)
+#define pthread_mutex_unlock(m) LeaveCriticalSection(m)
+
+/* Condition-variable shims. Destroy expands to a no-op expression; wait
+ * blocks without a timeout (INFINITE) on the given critical section. */
+#define pthread_cond_init(c, a) InitializeConditionVariable(c)
+#define pthread_cond_destroy(c) (void)(0)
+#define pthread_cond_wait(c, m) SleepConditionVariableCS(c, m, INFINITE)
+#define pthread_cond_signal(c) WakeConditionVariable(c)
+#define pthread_cond_broadcast(c) WakeAllConditionVariable(c)
+
+/**
+ * @brief Trampoline payload bridging the POSIX @c void*(*)(void*) worker
+ *        signature to the @c _beginthreadex entry point.
+ *
+ * Heap-allocated by the @c pthread_create shim. On a successful spawn it is
+ * freed by @ref zxc_win_thread_entry, which copies both fields out and
+ * releases the block before it invokes the captured worker; if the spawn
+ * fails, the shim frees it itself.
+ */
+typedef struct {
+    void* (*func)(void*); /* worker to invoke */
+    void* arg;            /* argument forwarded to @c func */
+} zxc_win_thread_arg_t;
+
+/**
+ * @brief Thread start routine handed to @c _beginthreadex by the
+ *        @c pthread_create shim.
+ *
+ * Takes ownership of the heap payload, releases it, and only then calls the
+ * POSIX-style worker it described.
+ *
+ * @param[in] p  Heap @ref zxc_win_thread_arg_t allocated by the creator; it is
+ *               freed here and must not be touched by the creator afterwards.
+ * @return Always 0; the @c void* result of the worker is dropped.
+ */
+static unsigned __stdcall zxc_win_thread_entry(void* p) {
+    /* Copy the payload to the stack so the heap block can be released before
+     * the worker is entered. */
+    const zxc_win_thread_arg_t payload = *(const zxc_win_thread_arg_t*)p;
+    ZXC_FREE(p);
+    (void)payload.func(payload.arg);
+    return 0;
+}
+
+/**
+ * @brief Windows stand-in for @c pthread_create built on @c _beginthreadex.
+ *
+ * Packs @p start_routine and @p arg into a heap payload and starts a thread
+ * at @ref zxc_win_thread_entry, which unpacks and runs them.
+ *
+ * @param[out] thread        Set to the new thread's handle on success; left
+ *                           untouched on failure.
+ * @param[in]  attr          Ignored; present only to mirror the POSIX prototype.
+ * @param[in]  start_routine Function the new thread executes.
+ * @param[in]  arg           Opaque pointer passed to @p start_routine.
+ * @return 0 on success, @ref ZXC_ERROR_MEMORY if the payload cannot be
+ *         allocated or the thread cannot be started.
+ */
+static int pthread_create(pthread_t* thread, const void* attr, void* (*start_routine)(void*),
+                          void* arg) {
+    (void)attr;
+
+    zxc_win_thread_arg_t* const payload = ZXC_MALLOC(sizeof(*payload));
+    if (UNLIKELY(payload == NULL)) return ZXC_ERROR_MEMORY;
+    payload->func = start_routine;
+    payload->arg = arg;
+
+    const uintptr_t raw = _beginthreadex(NULL, 0, zxc_win_thread_entry, payload, 0, NULL);
+    if (LIKELY(raw != 0)) {
+        /* From here on the new thread owns (and frees) the payload. */
+        *thread = (HANDLE)raw;
+        return 0;
+    }
+
+    /* The thread never started, so the payload is still ours to release. */
+    ZXC_FREE(payload);
+    return ZXC_ERROR_MEMORY;
+}
+
+/**
+ * @brief Windows stand-in for @c pthread_join.
+ *
+ * Waits without timeout for @p thread to terminate and then closes the
+ * handle, so the handle must not be used again after this call.
+ *
+ * @param[in] thread  Handle produced by a successful @c pthread_create.
+ * @param[in] retval  Ignored; the worker's return value is not reported.
+ * @return Always 0 (the results of the wait and close calls are not checked).
+ */
+static int pthread_join(pthread_t thread, void** retval) {
+    (void)retval;
+    (void)WaitForSingleObject(thread, INFINITE);
+    (void)CloseHandle(thread);
+    return 0;
+}
+
+/* sysconf shim: the argument is ignored and every call returns the processor
+ * count; _SC_NPROCESSORS_ONLN is only a placeholder so call sites compile. */
+#define sysconf(x) zxc_get_num_procs()
+#define _SC_NPROCESSORS_ONLN 0
+
+#else
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
+/*
+ * ============================================================================
+ * STREAMING ENGINE (Producer / Worker / Consumer)
+ * ============================================================================
+ * Implements a Ring Buffer architecture to parallelize block processing.
+ */
+
+/**
+ * @enum job_status_t
+ * @brief Lifecycle states of a job slot within the ring buffer.
+ *
+ * @var JOB_STATUS_FREE
+ *      The slot is idle. Every slot starts in this state, and the writer
+ *      thread returns a slot to it once the slot's output has been handled.
+ * @var JOB_STATUS_FILLED
+ *      The job slot has been populated with input data and is ready for
+ *      processing by a worker.
+ * @var JOB_STATUS_PROCESSED
+ *      A worker has finished with the slot (successfully or not); the writer
+ *      thread waits for this state before consuming the slot.
+ */
+typedef enum { JOB_STATUS_FREE, JOB_STATUS_FILLED, JOB_STATUS_PROCESSED } job_status_t;
+
+/**
+ * @struct zxc_stream_job_t
+ * @brief Represents a single unit of work (a chunk of data) to be processed.
+ *
+ * One element of the job ring. It holds the input and output buffers of a
+ * chunk together with its processing status. The engine carves all slots and
+ * their buffers out of a single allocation.
+ *
+ * @var zxc_stream_job_t::in_buf
+ *      Pointer to the buffer holding the chunk's input bytes.
+ * @var zxc_stream_job_t::in_cap
+ *      Usable capacity of @c in_buf. The engine sets it to the per-slot input
+ *      allocation minus @c ZXC_PAD_SIZE.
+ * @var zxc_stream_job_t::in_sz
+ *      The actual size of the valid data currently in the input buffer.
+ * @var zxc_stream_job_t::out_buf
+ *      Pointer to the buffer where processed (compressed/decompressed) data is
+ *      stored.
+ * @var zxc_stream_job_t::out_cap
+ *      Usable capacity of @c out_buf. The engine sets it to the per-slot output
+ *      allocation minus @c ZXC_PAD_SIZE.
+ * @var zxc_stream_job_t::result_sz
+ *      Number of valid bytes in @c out_buf. A worker stores 0 here when
+ *      processing fails. The writer thread stops when it reads
+ *      @c (size_t)-1 from this field.
+ * @var zxc_stream_job_t::job_id
+ *      Index of this slot within the ring (assigned once at setup).
+ * @var zxc_stream_job_t::status
+ *      The current state of this job (Free, Filled, or Processed).
+ * @var zxc_stream_job_t::pad
+ *      @c ZXC_CACHE_LINE_SIZE trailing bytes that keep the fields of adjacent
+ *      ring elements apart, to limit false sharing between threads working on
+ *      neighbouring jobs. Note that this appends a full cache line; it does
+ *      not by itself round the structure size to a cache-line multiple.
+ */
+typedef struct {
+    uint8_t* in_buf;
+    size_t in_cap;
+    size_t in_sz;
+    uint8_t* out_buf;
+    size_t out_cap;
+    size_t result_sz;
+    int job_id;
+    ZXC_ATOMIC job_status_t status;  // Atomic for lock-free status updates
+    char pad[ZXC_CACHE_LINE_SIZE];   // Prevent False Sharing
+} zxc_stream_job_t;
+
+/**
+ * @typedef zxc_chunk_processor_t
+ * @brief Function pointer type for processing a chunk of data.
+ *
+ * Signature shared by the per-chunk routines the streaming engine dispatches
+ * to. The same pointer type is used in both compression and decompression
+ * mode; worker threads call it once per job.
+ *
+ * @param ctx     Pointer to the per-worker context containing state and
+ * configuration.
+ * @param in      Pointer to the input data buffer.
+ * @param in_sz   Size of the input data in bytes.
+ * @param out     Pointer to the output buffer where processed data will be
+ * written.
+ * @param out_cap Capacity of the output buffer in bytes.
+ *
+ * @return The number of bytes written to the output buffer on success, or a
+ * negative error code on failure (workers treat any negative value as a
+ * failure of the whole run).
+ */
+typedef int (*zxc_chunk_processor_t)(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT in,
+                                     const size_t in_sz, uint8_t* RESTRICT out,
+                                     const size_t out_cap);
+
+/**
+ * @struct zxc_stream_ctx_t
+ * @brief Shared state of one streaming compression/decompression run.
+ *
+ * A single instance is shared by the thread running the engine (the
+ * producer), the worker threads and the asynchronous writer thread. It holds
+ * the ring buffer of jobs, the worker queue, the synchronization primitives
+ * (mutex and condition variables) and the configuration of the run.
+ *
+ * @var zxc_stream_ctx_t::jobs
+ *      Array of job structures acting as the ring buffer.
+ * @var zxc_stream_ctx_t::ring_size
+ *      The total number of slots in the jobs array (the engine uses four
+ *      slots per worker). The worker queue has the same number of entries.
+ * @var zxc_stream_ctx_t::worker_queue
+ *      A circular queue containing indices of jobs ready to be picked up by
+ *      worker threads.
+ * @var zxc_stream_ctx_t::wq_head
+ *      Enqueue position of the worker queue. NOTE(review): the enqueue code
+ *      is not part of this section of the file; confirm against the producer
+ *      loop.
+ * @var zxc_stream_ctx_t::wq_tail
+ *      Dequeue position of the worker queue: a worker takes the job index
+ *      stored here and then advances it modulo @c ring_size.
+ * @var zxc_stream_ctx_t::wq_count
+ *      Current number of items in the worker queue.
+ * @var zxc_stream_ctx_t::lock
+ *      Mutex used to protect access to shared resources (queue indices, status
+ *      changes) and paired with the condition variables below.
+ * @var zxc_stream_ctx_t::cond_reader
+ *      Signalled by the writer thread whenever it releases a job slot, and
+ *      broadcast by workers on failure. NOTE(review): presumably waited on by
+ *      the producer that refills free slots; confirm against the producer
+ *      loop.
+ * @var zxc_stream_ctx_t::cond_worker
+ *      Condition variable idle workers wait on until the worker queue is
+ *      non-empty or shutdown is requested.
+ * @var zxc_stream_ctx_t::cond_writer
+ *      Condition variable the writer thread waits on. A worker signals it
+ *      when it finishes the job at @c write_idx and broadcasts it on failure.
+ * @var zxc_stream_ctx_t::shutdown_workers
+ *      When non-zero, worker threads terminate once the worker queue is empty.
+ * @var zxc_stream_ctx_t::compression_mode
+ *      Operation mode: 1 for compression, 0 for decompression.
+ * @var zxc_stream_ctx_t::io_error
+ *      Atomic failure flag. Despite the name it is also raised when a worker
+ *      cannot initialise its context or a chunk fails to process, not only on
+ *      I/O errors.
+ * @var zxc_stream_ctx_t::processor
+ *      Function pointer that compresses or decompresses one chunk.
+ * @var zxc_stream_ctx_t::write_idx
+ *      Ring index of the next job the writer thread will consume; the writer
+ *      advances it modulo @c ring_size after each job.
+ * @var zxc_stream_ctx_t::compression_level
+ *      The configured level of compression (trading off speed vs. ratio).
+ * @var zxc_stream_ctx_t::chunk_size
+ *      The size of each data chunk to be processed.
+ * @var zxc_stream_ctx_t::checksum_enabled
+ *      Caller's checksum request. Workers use it as-is when compressing; in
+ *      any other mode they additionally require @c file_has_checksum.
+ * @var zxc_stream_ctx_t::file_has_checksum
+ *      Whether the stream carries checksums: mirrors @c checksum_enabled when
+ *      compressing, otherwise taken from the parsed file header.
+ * @var zxc_stream_ctx_t::progress_cb
+ *     Optional callback function for reporting progress during processing.
+ * @var zxc_stream_ctx_t::progress_user_data
+ *    User data pointer to be passed to the progress callback function.
+ * @var zxc_stream_ctx_t::total_input_bytes
+ *     Value passed as the total to the progress callback. The engine fills it
+ *     from the input file size only when compressing with a callback set;
+ *     otherwise it stays 0.
+ * @var zxc_stream_ctx_t::dict
+ *     Pointer to the optional dictionary buffer used to prime
+ *     compression/decompression, NULL when no dictionary is in use.
+ * @var zxc_stream_ctx_t::dict_size
+ *     Size of the dictionary in bytes, 0 when no dictionary is in use.
+ * @var zxc_stream_ctx_t::dict_huf
+ *     Shared dictionary literal Huffman table (128-byte packed code-lengths
+ *     header), NULL when absent.
+ */
+typedef struct {
+    zxc_stream_job_t* jobs;
+    size_t ring_size;
+    int* worker_queue;
+    int wq_head;
+    int wq_tail;
+    int wq_count;
+    pthread_mutex_t lock;
+    pthread_cond_t cond_reader;
+    pthread_cond_t cond_worker;
+    pthread_cond_t cond_writer;
+    int shutdown_workers;
+    int compression_mode;
+    ZXC_ATOMIC int io_error;
+    zxc_chunk_processor_t processor;
+    int write_idx;
+    int compression_level;
+    size_t chunk_size;
+    int checksum_enabled;
+    int file_has_checksum;
+    zxc_progress_callback_t progress_cb;
+    void* progress_user_data;
+    uint64_t total_input_bytes;
+    const uint8_t* dict;
+    size_t dict_size;
+    const uint8_t* dict_huf; /**< Shared dictionary literal table (128-byte packed
+                                  code-lengths header), NULL when absent. */
+} zxc_stream_ctx_t;
+
+/**
+ * @struct writer_args_t
+ * @brief Arguments and running state of the asynchronous writer thread.
+ *
+ * Passed by pointer to the writer thread, which updates the accumulators in
+ * place. The engine initialises this structure positionally, so the field
+ * order below must not change without updating that initializer.
+ *
+ * @var writer_args_t::ctx
+ * Pointer to the ZXC stream context, holding the state of the
+ * compression/decompression stream.
+ *
+ * @var writer_args_t::f
+ * Output file stream. May be NULL, in which case the writer skips the actual
+ * write but still performs its bookkeeping.
+ *
+ * @var writer_args_t::total_bytes
+ * Running sum of the result sizes handled so far. The engine seeds it with
+ * @c ZXC_FILE_HEADER_SIZE when compressing to an output stream.
+ *
+ * @var writer_args_t::global_hash
+ * Global hash, updated only when compressing with checksums enabled: after
+ * each successful write it is combined with the last
+ * @c ZXC_GLOBAL_CHECKSUM_SIZE bytes of the block.
+ *
+ * @var writer_args_t::bytes_processed
+ * Progress counter reported to the callback: input bytes when compressing,
+ * output bytes otherwise. Only maintained when a progress callback is set.
+ *
+ * @var writer_args_t::seek_comp
+ * Array of compressed block sizes for seek table construction; NULL unless
+ * a seek table was requested in compression mode.
+ *
+ * @var writer_args_t::seek_count
+ * Number of entries in the seek table.
+ *
+ * @var writer_args_t::seek_cap
+ * Capacity of the seek table array (starts at 64 entries and is doubled by
+ * the writer when full).
+ */
+typedef struct {
+    zxc_stream_ctx_t* ctx;
+    FILE* f;
+    int64_t total_bytes;
+    uint32_t global_hash;
+    uint64_t bytes_processed;  // For progress callback
+    uint32_t* seek_comp;
+    uint32_t seek_count;
+    uint32_t seek_cap;
+} writer_args_t;
+
+/**
+ * @brief Entry point of a worker thread of the streaming engine.
+ *
+ * Each worker owns a private `zxc_cctx_t` and repeatedly pulls job indices
+ * from the shared worker queue, runs `ctx->processor` on the job outside the
+ * lock, and publishes the outcome.
+ *
+ * **Steps per worker:**
+ * 1. **Setup:** Initialise the private context (sized for `[dict | block]`
+ *    when compressing with a dictionary) and attach the shared dictionary
+ *    literal table. On failure the worker raises `io_error`, wakes the
+ *    waiters on `cond_writer` and `cond_reader`, and exits.
+ * 2. **Dictionary priming:** If the context exposes a dictionary buffer, the
+ *    dictionary bytes are copied to its start once.
+ * 3. **Dequeue:** Sleep on `cond_worker` until the queue is non-empty or
+ *    shutdown is requested; leave the loop once shutdown is requested and the
+ *    queue has drained.
+ * 4. **Process:** With a dictionary, compression appends the block after the
+ *    dictionary and processes the combined buffer; decompression decodes
+ *    behind the dictionary and copies the result into the job's output
+ *    buffer. Without a dictionary the job buffers are used directly.
+ * 5. **Publish:** Under the lock, store the result size (0 on failure) and
+ *    mark the job `JOB_STATUS_PROCESSED`. A failure raises `io_error` and
+ *    broadcasts to writer and reader; otherwise `cond_writer` is signalled
+ *    only when this job is the one at `ctx->write_idx`.
+ *
+ * @param[in] arg A pointer to the shared stream context (`zxc_stream_ctx_t`).
+ * @return Always returns NULL.
+ */
+static void* zxc_stream_worker(void* arg) {
+    zxc_stream_ctx_t* const ctx = (zxc_stream_ctx_t*)arg;
+    const int mode = ctx->compression_mode;
+    const size_t dict_len = ctx->dict_size;
+    zxc_cctx_t cctx;
+
+    /* Compression honours the caller's checksum flag as-is; any other mode
+     * additionally requires the input file to carry checksums. */
+    int use_checksum = ctx->checksum_enabled;
+    if (mode != 1) use_checksum = ctx->file_has_checksum && ctx->checksum_enabled;
+
+    /* A compressing worker with a dictionary processes [dict | block] as one
+     * buffer, so its context is sized for the combined length. */
+    size_t ctx_chunk = ctx->chunk_size;
+    if (dict_len > 0 && mode == 1) ctx_chunk = zxc_block_size_ceil(dict_len + ctx->chunk_size);
+
+    if (UNLIKELY(zxc_cctx_init(&cctx, ctx_chunk, mode, ctx->compression_level, use_checksum,
+                               dict_len) != ZXC_OK ||
+                 zxc_cctx_attach_dict_huf(&cctx, ctx->dict_huf) != ZXC_OK)) {
+        // LCOV_EXCL_START
+        zxc_cctx_free(&cctx);
+        pthread_mutex_lock(&ctx->lock);
+        ctx->io_error = 1;
+        pthread_cond_broadcast(&ctx->cond_writer);
+        pthread_cond_broadcast(&ctx->cond_reader);
+        pthread_mutex_unlock(&ctx->lock);
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+
+    cctx.compression_level = ctx->compression_level;
+
+    /* Prime the per-worker scratch buffer with the dictionary once; blocks
+     * are placed (or decoded) right behind it. */
+    uint8_t* const dict_buf = cctx.dict_buffer;
+    if (dict_buf != NULL) ZXC_MEMCPY(dict_buf, ctx->dict, dict_len);
+
+    for (;;) {
+        /* --- Dequeue the next job index, or leave once drained. --- */
+        pthread_mutex_lock(&ctx->lock);
+        while (ctx->wq_count == 0 && !ctx->shutdown_workers)
+            pthread_cond_wait(&ctx->cond_worker, &ctx->lock);
+        if (ctx->wq_count == 0) {
+            /* The wait loop only ends with an empty queue on shutdown. */
+            pthread_mutex_unlock(&ctx->lock);
+            break;
+        }
+        const int slot = ctx->worker_queue[ctx->wq_tail];
+        ctx->wq_tail = (ctx->wq_tail + 1) % ctx->ring_size;
+        ctx->wq_count--;
+        zxc_stream_job_t* const job = &ctx->jobs[slot];
+        pthread_mutex_unlock(&ctx->lock);
+
+        /* --- Process the chunk without holding the lock. --- */
+        int rc;
+        if (dict_buf == NULL || (mode != 0 && mode != 1)) {
+            rc = ctx->processor(&cctx, job->in_buf, job->in_sz, job->out_buf, job->out_cap);
+        } else if (mode == 1) {
+            /* Compress [dict | block] as a single input. */
+            ZXC_MEMCPY(dict_buf + dict_len, job->in_buf, job->in_sz);
+            rc = ctx->processor(&cctx, dict_buf, dict_len + job->in_sz, job->out_buf,
+                                job->out_cap);
+        } else {
+            /* Decode right behind the dictionary, then hand the decoded
+             * bytes over to the job's own output buffer. */
+            rc = ctx->processor(&cctx, job->in_buf, job->in_sz, dict_buf + dict_len,
+                                ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD);
+            if (LIKELY(rc > 0)) ZXC_MEMCPY(job->out_buf, dict_buf + dict_len, (size_t)rc);
+        }
+
+        /* --- Publish the outcome. --- */
+        const int failed = (rc < 0);
+        pthread_mutex_lock(&ctx->lock);
+        job->result_sz = failed ? 0 : (size_t)rc;
+        job->status = JOB_STATUS_PROCESSED;
+        if (UNLIKELY(failed)) {
+            ctx->io_error = 1;
+            pthread_cond_broadcast(&ctx->cond_writer);
+            pthread_cond_broadcast(&ctx->cond_reader);
+        } else if (slot == ctx->write_idx) {
+            /* Only wake the writer when its next in-order job is ready. */
+            pthread_cond_signal(&ctx->cond_writer);
+        }
+        pthread_mutex_unlock(&ctx->lock);
+    }
+
+    zxc_cctx_free(&cctx);
+    return NULL;
+}
+
+/**
+ * @brief Asynchronous writer thread function.
+ *
+ * Runs as a separate thread that consumes processed jobs from the ring in
+ * strict ring order and writes their output to the destination stream.
+ *
+ * **Ordering Enforcement:**
+ * The writer only ever looks at the job at `ctx->write_idx`. Even if workers
+ * finish jobs out of order, later jobs are not touched until that job is
+ * `JOB_STATUS_PROCESSED`.
+ *
+ * **Workflow (per job):**
+ * 1. **Wait:** Sleep on `cond_writer` until the job at `ctx->write_idx` is
+ *    processed or `io_error` is raised.
+ * 2. **Abort check:** If the wait ended because of `io_error` while the job
+ *    is not processed, the slot's size and buffer are not valid (a worker may
+ *    still be filling them), so the writer releases the slot and exits
+ *    without reading them.
+ * 3. **Stop marker:** A `result_sz` of `(size_t)-1` ends the loop.
+ * 4. **Write:** Write `out_buf` to `args->f` (skipped when `args->f` is NULL
+ *    or the result is empty). When compressing with checksums, fold the
+ *    block's trailing checksum into `args->global_hash`.
+ * 5. **Bookkeeping:** Add to `total_bytes`, record the block size in the seek
+ *    table (compression with seek table only), report progress.
+ * 6. **Release:** Mark the job `JOB_STATUS_FREE`, advance `ctx->write_idx`
+ *    and signal `cond_reader`.
+ *
+ * On any failure (write error, seek-table growth failure, or `io_error`
+ * raised elsewhere) the writer sets or observes `io_error`, frees the
+ * current slot, signals `cond_reader` and exits without advancing
+ * `write_idx`.
+ *
+ * @param[in] arg Pointer to a `writer_args_t` structure containing the stream
+ * context, the output file handle, and the accumulators updated here.
+ * @return Always returns NULL.
+ */
+static void* zxc_async_writer(void* arg) {
+    writer_args_t* const args = (writer_args_t*)arg;
+    zxc_stream_ctx_t* const ctx = args->ctx;
+    while (1) {
+        zxc_stream_job_t* const job = &ctx->jobs[ctx->write_idx];
+        pthread_mutex_lock(&ctx->lock);
+        while (job->status != JOB_STATUS_PROCESSED && !ctx->io_error)
+            pthread_cond_wait(&ctx->cond_writer, &ctx->lock);
+
+        if (UNLIKELY(job->status != JOB_STATUS_PROCESSED)) {
+            /* The wait ended because of io_error, not because this job is
+             * ready: result_sz may be left over from an earlier use of the
+             * slot and out_buf may still be written by a worker. Release the
+             * slot and stop without reading either. */
+            job->status = JOB_STATUS_FREE;
+            pthread_cond_signal(&ctx->cond_reader);
+            pthread_mutex_unlock(&ctx->lock);
+            break;
+        }
+
+        const size_t result_sz = job->result_sz;
+        const size_t in_sz = job->in_sz;
+        pthread_mutex_unlock(&ctx->lock);
+
+        /* (size_t)-1 marks the job that tells the writer to stop. */
+        if (result_sz == (size_t)-1) break;
+
+        if (args->f && result_sz > 0) {
+            if (fwrite(job->out_buf, 1, result_sz, args->f) != result_sz) {
+                pthread_mutex_lock(&ctx->lock);
+                ctx->io_error = 1;
+                pthread_cond_signal(&ctx->cond_reader);
+                pthread_mutex_unlock(&ctx->lock);
+            } else if (ctx->checksum_enabled && ctx->compression_mode == 1) {
+                // Update Global Hash (Rotation + XOR)
+                if (LIKELY(result_sz >= ZXC_GLOBAL_CHECKSUM_SIZE)) {
+                    uint32_t block_hash =
+                        zxc_le32(job->out_buf + result_sz - ZXC_GLOBAL_CHECKSUM_SIZE);
+                    args->global_hash = zxc_hash_combine_rotate(args->global_hash, block_hash);
+                }
+            }
+        }
+        /* Covers both a failed write above and an error raised by another
+         * thread while this job was being written. */
+        if (UNLIKELY(ctx->io_error)) {
+            pthread_mutex_lock(&ctx->lock);
+            job->status = JOB_STATUS_FREE;
+            pthread_cond_signal(&ctx->cond_reader);
+            pthread_mutex_unlock(&ctx->lock);
+            break;
+        }
+        args->total_bytes += (int64_t)result_sz;
+
+        /* Seekable: record compressed block size */
+        if (args->seek_comp && ctx->compression_mode == 1) {
+            if (UNLIKELY(args->seek_count >= args->seek_cap)) {
+                /* Double the table. The new capacity is committed only after
+                 * a successful reallocation, and growth that would wrap the
+                 * 32-bit capacity or the byte count is treated like an
+                 * allocation failure. */
+                uint32_t* nc = NULL;
+                const uint32_t cur_cap = args->seek_cap;
+                if (LIKELY(cur_cap <= UINT32_MAX / 2 &&
+                           (size_t)cur_cap * 2 <= SIZE_MAX / sizeof(uint32_t))) {
+                    nc = (uint32_t*)ZXC_REALLOC(args->seek_comp,
+                                                (size_t)cur_cap * 2 * sizeof(uint32_t));
+                }
+                // LCOV_EXCL_START
+                if (UNLIKELY(!nc)) {
+                    pthread_mutex_lock(&ctx->lock);
+                    ctx->io_error = 1;
+                    job->status = JOB_STATUS_FREE;
+                    pthread_cond_signal(&ctx->cond_reader);
+                    pthread_mutex_unlock(&ctx->lock);
+                    break;
+                }
+                // LCOV_EXCL_STOP
+                args->seek_comp = nc;
+                args->seek_cap = cur_cap * 2;
+            }
+            args->seek_comp[args->seek_count++] = (uint32_t)result_sz;
+        }
+
+        // Update progress callback
+        if (ctx->progress_cb) {
+            // LCOV_EXCL_START
+            /* Progress counts input bytes when compressing, output bytes
+             * otherwise. */
+            args->bytes_processed += ctx->compression_mode == 1 ? in_sz : result_sz;
+            ctx->progress_cb(args->bytes_processed, ctx->total_input_bytes,
+                             ctx->progress_user_data);
+            // LCOV_EXCL_STOP
+        }
+
+        /* Hand the slot back and move on to the next job in ring order. */
+        pthread_mutex_lock(&ctx->lock);
+        job->status = JOB_STATUS_FREE;
+        ctx->write_idx = (ctx->write_idx + 1) % ctx->ring_size;
+        pthread_cond_signal(&ctx->cond_reader);
+        pthread_mutex_unlock(&ctx->lock);
+    }
+    return NULL;
+}
+
+/**
+ * @brief Orchestrates the multithreaded streaming compression or decompression
+ * engine.
+ *
+ * This function initializes the stream context, allocates the necessary ring
+ * buffer memory for jobs and I/O buffers, and spawns the worker threads and the
+ * asynchronous writer thread. It acts as the main "producer" (reader) loop.
+ *
+ * **Architecture: Producer-Consumer with Ring Buffer**
+ * - **Ring Buffer:** A fixed-size array of `zxc_stream_job_t` structures.
+ * - **Producer (Main Thread):** Reads chunks from `f_in` and fills "Free" slots
+ *   in the ring buffer. It blocks if no slots are free (backpressure).
+ * - **Workers:** Pick up "Filled" jobs from a queue, process them, and mark
+ * them as "Processed".
+ * - **Consumer (Writer Thread):** Waits for the *next sequential* job to be
+ *   "Processed", writes it to `f_out`, and marks the slot as "Free".
+ *
+ * **Double-Buffering & Zero-Copy:**
+ * We allocate `alloc_in` and `alloc_out` buffers for each job. The reader reads
+ * directly into `in_buf`, and the writer writes directly from `out_buf`,
+ * minimizing memory copies.
+ *
+ * @param[in]  f_in             Input file stream (source).
+ * @param[out] f_out            Output file stream (destination).
+ * @param[in]  n_threads        Worker thread count; 0 or less auto-detects the
+ *                              number of online processors.
+ * @param[in]  mode             1 for compression, 0 for decompression.
+ * @param[in]  level            Compression level (compression mode only).
+ * @param[in]  block_size       Block size in bytes (compression mode).
+ * @param[in]  checksum_enabled Non-zero to generate / verify checksums.
+ * @param[in]  seekable         Non-zero to emit a seek table (compression mode).
+ * @param[in]  func             Chunk processor (compression or decompression).
+ * @param[in]  progress_cb      Optional progress callback, or NULL.
+ * @param[in]  user_data        Opaque pointer passed to @p progress_cb.
+ * @param[in]  dict             Optional dictionary content, or NULL.
+ * @param[in]  dict_size        Dictionary length in bytes (0 if none).
+ * @param[in]  dict_huf         Optional shared literal Huffman table, or NULL.
+ * @return Total bytes written to the output on success, or a negative
+ *         @ref zxc_error_t code.
+ */
+static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_threads, const int mode,
+                                     const int level, const size_t block_size,
+                                     const int checksum_enabled, const int seekable,
+                                     zxc_chunk_processor_t func,
+                                     zxc_progress_callback_t progress_cb, void* user_data,
+                                     const uint8_t* dict, const size_t dict_size,
+                                     const uint8_t* dict_huf) {
+    zxc_stream_ctx_t ctx;
+    ZXC_MEMSET(&ctx, 0, sizeof(ctx));
+
+    size_t runtime_chunk_sz = (block_size > 0) ? block_size : ZXC_BLOCK_SIZE_DEFAULT;
+    int file_has_chk = 0;
+
+    // Try to get input file size for progress tracking (compression mode only)
+    // For decompression, the CLI precomputes the size and passes it via user_data
+    uint64_t total_file_size = 0;
+    if (mode == 1 && progress_cb) {
+        // LCOV_EXCL_START
+        const long long saved_pos = ftello(f_in);
+        if (saved_pos >= 0 && fseeko(f_in, 0, SEEK_END) == 0) {
+            const long long size = ftello(f_in);
+            if (size > 0) total_file_size = (uint64_t)size;
+            fseeko(f_in, saved_pos, SEEK_SET);
+        }
+        // LCOV_EXCL_STOP
+    }
+
+    if (mode == 0) {
+        // Decompression Mode: Read and validate file header
+        uint8_t h[ZXC_FILE_HEADER_SIZE];
+        uint32_t header_dict_id = 0;
+        if (UNLIKELY(fread(h, 1, ZXC_FILE_HEADER_SIZE, f_in) != ZXC_FILE_HEADER_SIZE ||
+                     zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, &file_has_chk,
+                                          &header_dict_id) != ZXC_OK))
+            return ZXC_ERROR_BAD_HEADER;
+
+        if (header_dict_id != 0) {
+            if (UNLIKELY(!dict || dict_size == 0)) return ZXC_ERROR_DICT_REQUIRED;
+            if (UNLIKELY(zxc_dict_id(dict, dict_size, dict_huf) != header_dict_id))
+                return ZXC_ERROR_DICT_MISMATCH;
+        }
+    }
+
+    int num_threads = (n_threads > 0) ? n_threads : (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (num_threads > ZXC_MAX_THREADS) num_threads = ZXC_MAX_THREADS;
+    // Reserve 1 thread for Writer/Reader overhead if possible
+    const int num_workers = (num_threads > 1) ? num_threads - 1 : 1;
+
+    ctx.compression_mode = mode;
+    ctx.processor = func;
+    ctx.io_error = 0;
+    ctx.compression_level = level;
+    ctx.ring_size = (size_t)num_workers * 4U;
+    ctx.chunk_size = runtime_chunk_sz;
+    ctx.checksum_enabled = checksum_enabled;
+    ctx.file_has_checksum = mode == 1 ? checksum_enabled : file_has_chk;
+    ctx.progress_cb = progress_cb;
+    ctx.progress_user_data = user_data;
+    ctx.total_input_bytes = total_file_size;
+    ctx.dict = dict;
+    ctx.dict_size = dict_size;
+    ctx.dict_huf = dict_huf;
+
+    uint32_t d_global_hash = 0;
+
+    const uint64_t max_out = zxc_compress_bound(runtime_chunk_sz);
+    const size_t raw_alloc_in = (size_t)((mode ? runtime_chunk_sz : max_out) + ZXC_PAD_SIZE);
+    const size_t alloc_in = (raw_alloc_in + ZXC_ALIGNMENT_MASK) & ~ZXC_ALIGNMENT_MASK;
+
+    const size_t raw_alloc_out =
+        (size_t)((mode ? max_out : runtime_chunk_sz + ZXC_DECOMPRESS_TAIL_PAD) + ZXC_PAD_SIZE);
+    const size_t alloc_out = (raw_alloc_out + ZXC_ALIGNMENT_MASK) & ~ZXC_ALIGNMENT_MASK;
+
+    const size_t per_job_sz = sizeof(zxc_stream_job_t) + sizeof(int) + alloc_in + alloc_out;
+    const size_t alloc_size = ctx.ring_size * per_job_sz;
+    uint8_t* const mem_block = ZXC_ALIGNED_MALLOC(alloc_size, ZXC_CACHE_LINE_SIZE);
+    if (UNLIKELY(!mem_block || per_job_sz > SIZE_MAX / ctx.ring_size)) {
+        // LCOV_EXCL_START
+        ZXC_ALIGNED_FREE(mem_block);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+
+    uint8_t* ptr = mem_block;
+    ctx.jobs = (zxc_stream_job_t*)ptr;
+    ptr += ctx.ring_size * sizeof(zxc_stream_job_t);
+    ctx.worker_queue = (int*)ptr;
+    ptr += ctx.ring_size * sizeof(int);
+    uint8_t* buf_in = ptr;
+    ptr += ctx.ring_size * alloc_in;
+    uint8_t* buf_out = ptr;
+
+    ZXC_MEMSET(mem_block, 0, alloc_size);
+
+    for (size_t i = 0; i < ctx.ring_size; i++) {
+        ctx.jobs[i].job_id = (int)i;
+        ctx.jobs[i].status = JOB_STATUS_FREE;
+        ctx.jobs[i].in_buf = buf_in + (i * alloc_in);
+        ctx.jobs[i].in_cap = alloc_in - ZXC_PAD_SIZE;
+        ctx.jobs[i].in_sz = 0;
+        ctx.jobs[i].out_buf = buf_out + (i * alloc_out);
+        ctx.jobs[i].out_cap = alloc_out - ZXC_PAD_SIZE;
+        ctx.jobs[i].result_sz = 0;
+    }
+
+    pthread_mutex_init(&ctx.lock, NULL);
+    pthread_cond_init(&ctx.cond_reader, NULL);
+    pthread_cond_init(&ctx.cond_worker, NULL);
+    pthread_cond_init(&ctx.cond_writer, NULL);
+
+    pthread_t* const workers = ZXC_MALLOC((size_t)num_workers * sizeof(pthread_t));
+    if (UNLIKELY(!workers)) {
+        // LCOV_EXCL_START
+        ZXC_ALIGNED_FREE(mem_block);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+    int started_workers = 0;
+    for (int i = 0; i < num_workers; i++) {
+        if (UNLIKELY(pthread_create(&workers[i], NULL, zxc_stream_worker, &ctx) != 0)) break;
+        started_workers++;
+    }
+    if (UNLIKELY(started_workers == 0)) {
+        // LCOV_EXCL_START
+        pthread_cond_destroy(&ctx.cond_writer);
+        pthread_cond_destroy(&ctx.cond_worker);
+        pthread_cond_destroy(&ctx.cond_reader);
+        pthread_mutex_destroy(&ctx.lock);
+        ZXC_FREE(workers);
+        ZXC_ALIGNED_FREE(mem_block);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+
+    writer_args_t w_args = {&ctx, f_out, 0, 0, 0, NULL, 0, 0};
+
+    /* Seekable: allocate initial block-size tracking array */
+    if (mode == 1 && seekable) {
+        w_args.seek_cap = 64;
+        w_args.seek_comp = (uint32_t*)ZXC_MALLOC(w_args.seek_cap * sizeof(uint32_t));
+        // LCOV_EXCL_START
+        if (UNLIKELY(!w_args.seek_comp)) {
+            pthread_mutex_lock(&ctx.lock);
+            ctx.shutdown_workers = 1;
+            pthread_cond_broadcast(&ctx.cond_worker);
+            pthread_mutex_unlock(&ctx.lock);
+            for (int i = 0; i < started_workers; i++) pthread_join(workers[i], NULL);
+            pthread_cond_destroy(&ctx.cond_writer);
+            pthread_cond_destroy(&ctx.cond_worker);
+            pthread_cond_destroy(&ctx.cond_reader);
+            pthread_mutex_destroy(&ctx.lock);
+            ZXC_FREE(workers);
+            ZXC_ALIGNED_FREE(mem_block);
+            return ZXC_ERROR_MEMORY;
+        }
+        // LCOV_EXCL_STOP
+    }
+
+    if (mode == 1 && f_out) {
+        uint8_t h[ZXC_FILE_HEADER_SIZE];
+        zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled,
+                              (dict && dict_size) ? zxc_dict_id(dict, dict_size, dict_huf) : 0);
+        if (UNLIKELY(fwrite(h, 1, ZXC_FILE_HEADER_SIZE, f_out) != ZXC_FILE_HEADER_SIZE))
+            ctx.io_error = 1;
+
+        w_args.total_bytes = ZXC_FILE_HEADER_SIZE;
+    }
+    pthread_t writer_th;
+    if (UNLIKELY(pthread_create(&writer_th, NULL, zxc_async_writer, &w_args) != 0)) {
+        // LCOV_EXCL_START
+        pthread_mutex_lock(&ctx.lock);
+        ctx.shutdown_workers = 1;
+        pthread_cond_broadcast(&ctx.cond_worker);
+        pthread_mutex_unlock(&ctx.lock);
+        for (int i = 0; i < started_workers; i++) pthread_join(workers[i], NULL);
+        pthread_cond_destroy(&ctx.cond_writer);
+        pthread_cond_destroy(&ctx.cond_worker);
+        pthread_cond_destroy(&ctx.cond_reader);
+        pthread_mutex_destroy(&ctx.lock);
+        ZXC_FREE(workers);
+        ZXC_ALIGNED_FREE(mem_block);
+        return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+    }
+
+    int read_idx = 0;
+    int read_eof = 0;
+    uint64_t total_src_bytes = 0;
+
+    // Reader Loop: Reads from file, prepares jobs, pushes to worker queue.
+    while (!read_eof && !ctx.io_error) {
+        zxc_stream_job_t* const job = &ctx.jobs[read_idx];
+        pthread_mutex_lock(&ctx.lock);
+        while (job->status != JOB_STATUS_FREE && !ctx.io_error)
+            pthread_cond_wait(&ctx.cond_reader, &ctx.lock);
+        pthread_mutex_unlock(&ctx.lock);
+
+        if (UNLIKELY(ctx.io_error)) break;
+
+        size_t read_sz = 0;
+        if (mode == 1) {
+            read_sz = fread(job->in_buf, 1, runtime_chunk_sz, f_in);
+            total_src_bytes += read_sz;
+            if (UNLIKELY(read_sz == 0)) read_eof = 1;
+        } else {
+            uint8_t bh_buf[ZXC_BLOCK_HEADER_SIZE];
+            size_t h_read = fread(bh_buf, 1, ZXC_BLOCK_HEADER_SIZE, f_in);
+            if (UNLIKELY(h_read < ZXC_BLOCK_HEADER_SIZE)) {
+                read_eof = 1;
+            } else {
+                zxc_block_header_t bh;
+                if (UNLIKELY(zxc_read_block_header(bh_buf, ZXC_BLOCK_HEADER_SIZE, &bh) != ZXC_OK)) {
+                    read_eof = 1;
+                    goto _job_prepared;
+                }
+
+                if (bh.block_type == ZXC_BLOCK_EOF) {
+                    if (UNLIKELY(bh.comp_size != 0)) {
+                        ctx.io_error = 1;
+                        goto _job_prepared;
+                    }
+                    read_eof = 1;
+                    read_sz = 0;
+                    goto _job_prepared;
+                }
+
+                const int has_crc = ctx.file_has_checksum;
+                const size_t checksum_sz = (has_crc ? ZXC_BLOCK_CHECKSUM_SIZE : 0);
+                const size_t body_total = bh.comp_size + checksum_sz;
+                const size_t total_len = ZXC_BLOCK_HEADER_SIZE + body_total;
+
+                if (UNLIKELY(total_len > job->in_cap)) {
+                    ctx.io_error = 1;
+                    break;
+                }
+
+                ZXC_MEMCPY(job->in_buf, bh_buf, ZXC_BLOCK_HEADER_SIZE);
+
+                // Single fread for body + checksum (reduces syscalls)
+                const size_t body_read =
+                    fread(job->in_buf + ZXC_BLOCK_HEADER_SIZE, 1, body_total, f_in);
+
+                if (UNLIKELY(body_read != body_total)) {
+                    ctx.io_error = 1;
+                    break;
+                } else if (has_crc) {
+                    // Update Global Hash for Decompression
+                    const uint32_t b_crc =
+                        zxc_le32(job->in_buf + ZXC_BLOCK_HEADER_SIZE + bh.comp_size);
+                    d_global_hash = zxc_hash_combine_rotate(d_global_hash, b_crc);
+                }
+                read_sz = ZXC_BLOCK_HEADER_SIZE + body_read;
+            }
+        }
+    _job_prepared:
+        if (UNLIKELY(read_eof && read_sz == 0)) break;
+
+        job->in_sz = read_sz;
+        pthread_mutex_lock(&ctx.lock);
+        job->status = JOB_STATUS_FILLED;
+        ctx.worker_queue[ctx.wq_head] = read_idx;
+        ctx.wq_head = (ctx.wq_head + 1) % ctx.ring_size;
+        ctx.wq_count++;
+        read_idx = (read_idx + 1) % ctx.ring_size;
+        pthread_cond_signal(&ctx.cond_worker);
+        pthread_mutex_unlock(&ctx.lock);
+
+        if (UNLIKELY(read_sz < runtime_chunk_sz && mode == 1)) read_eof = 1;
+    }
+
+    zxc_stream_job_t* const end_job = &ctx.jobs[read_idx];
+    pthread_mutex_lock(&ctx.lock);
+    while (end_job->status != JOB_STATUS_FREE && !ctx.io_error)
+        pthread_cond_wait(&ctx.cond_reader, &ctx.lock);
+    end_job->result_sz = (size_t)-1;
+    end_job->status = JOB_STATUS_PROCESSED;
+    pthread_cond_broadcast(&ctx.cond_writer);
+    pthread_mutex_unlock(&ctx.lock);
+
+    pthread_join(writer_th, NULL);
+    pthread_mutex_lock(&ctx.lock);
+    ctx.shutdown_workers = 1;
+    pthread_cond_broadcast(&ctx.cond_worker);
+    pthread_mutex_unlock(&ctx.lock);
+    for (int i = 0; i < started_workers; i++) pthread_join(workers[i], NULL);
+
+    pthread_cond_destroy(&ctx.cond_writer);
+    pthread_cond_destroy(&ctx.cond_worker);
+    pthread_cond_destroy(&ctx.cond_reader);
+    pthread_mutex_destroy(&ctx.lock);
+
+    // Write EOF Block + optional Seek Table + Footer if compression and no error
+    if (mode == 1 && !ctx.io_error && w_args.total_bytes >= 0) {
+        /* EOF block */
+        uint8_t eof_buf[ZXC_BLOCK_HEADER_SIZE];
+        const zxc_block_header_t eof_bh = {
+            .block_type = ZXC_BLOCK_EOF, .block_flags = 0, .reserved = 0, .comp_size = 0};
+        zxc_write_block_header(eof_buf, ZXC_BLOCK_HEADER_SIZE, &eof_bh);
+        if (UNLIKELY(f_out &&
+                     fwrite(eof_buf, 1, ZXC_BLOCK_HEADER_SIZE, f_out) != ZXC_BLOCK_HEADER_SIZE))
+            ctx.io_error = 1;
+        else
+            w_args.total_bytes += ZXC_BLOCK_HEADER_SIZE;
+
+        /* Seekable: write SEK block between EOF and footer */
+        if (!ctx.io_error && w_args.seek_comp && w_args.seek_count > 0) {
+            const size_t st_size = zxc_seek_table_size(w_args.seek_count);
+            uint8_t* const st_buf = (uint8_t*)ZXC_MALLOC(st_size);
+            if (st_buf) {
+                const int64_t st_val =
+                    zxc_write_seek_table(st_buf, st_size, w_args.seek_comp, w_args.seek_count);
+                if (st_val > 0 && f_out &&
+                    fwrite(st_buf, 1, (size_t)st_val, f_out) == (size_t)st_val)
+                    w_args.total_bytes += st_val;
+                ZXC_FREE(st_buf);
+            }
+        }
+
+        /* Footer */
+        uint8_t footer_buf[ZXC_FILE_FOOTER_SIZE];
+        zxc_write_file_footer(footer_buf, ZXC_FILE_FOOTER_SIZE, total_src_bytes, w_args.global_hash,
+                              checksum_enabled);
+        if (UNLIKELY(f_out &&
+                     fwrite(footer_buf, 1, ZXC_FILE_FOOTER_SIZE, f_out) != ZXC_FILE_FOOTER_SIZE))
+            ctx.io_error = 1;
+        else
+            w_args.total_bytes += ZXC_FILE_FOOTER_SIZE;
+    } else if (mode == 0 && !ctx.io_error) {
+        /*
+         * After the EOF block, the stream may contain:
+         *   (a) [FOOTER 12B]                  - no seekable table
+         *   (b) [SEK header 8B] [payload] [FOOTER 12B] - seekable archive
+         */
+        uint8_t peek_buf[ZXC_BLOCK_HEADER_SIZE];
+        uint8_t footer[ZXC_FILE_FOOTER_SIZE];
+
+        if (UNLIKELY(fread(peek_buf, 1, ZXC_BLOCK_HEADER_SIZE, f_in) != ZXC_BLOCK_HEADER_SIZE)) {
+            ctx.io_error = 1;
+        } else {
+            zxc_block_header_t peek_bh;
+            const int is_sek =
+                (zxc_read_block_header(peek_buf, ZXC_BLOCK_HEADER_SIZE, &peek_bh) == ZXC_OK &&
+                 peek_bh.block_type == ZXC_BLOCK_SEK);
+
+            if (is_sek) {
+                /* Drain the SEK payload (read + discard) */
+                size_t remaining = (size_t)peek_bh.comp_size;
+                uint8_t discard[512];
+                while (remaining > 0 && !ctx.io_error) {
+                    const size_t chunk = remaining < sizeof(discard) ? remaining : sizeof(discard);
+                    if (UNLIKELY(fread(discard, 1, chunk, f_in) != chunk)) ctx.io_error = 1;
+                    remaining -= chunk;
+                }
+                /* Read full 12-byte footer */
+                if (!ctx.io_error &&
+                    UNLIKELY(fread(footer, 1, ZXC_FILE_FOOTER_SIZE, f_in) != ZXC_FILE_FOOTER_SIZE))
+                    ctx.io_error = 1;
+            } else {
+                /* peek_buf contains the first 8 bytes of the 12-byte footer.
+                 * Read the remaining 4 bytes and assemble. */
+                ZXC_MEMCPY(footer, peek_buf, ZXC_BLOCK_HEADER_SIZE);
+                const size_t tail = ZXC_FILE_FOOTER_SIZE - ZXC_BLOCK_HEADER_SIZE; /* 4 */
+                if (UNLIKELY(fread(footer + ZXC_BLOCK_HEADER_SIZE, 1, tail, f_in) != tail))
+                    ctx.io_error = 1;
+            }
+        }
+
+        /* Verify Footer Content: Source Size and Global Checksum */
+        if (!ctx.io_error) {
+            int valid = (zxc_le64(footer) == (uint64_t)w_args.total_bytes);
+            if (valid && checksum_enabled && ctx.file_has_checksum)
+                valid = (zxc_le32(footer + sizeof(uint64_t)) == d_global_hash);
+            if (UNLIKELY(!valid)) ctx.io_error = 1;
+        }
+    }
+
+    ZXC_FREE(w_args.seek_comp);
+    ZXC_FREE(workers);
+    ZXC_ALIGNED_FREE(mem_block);
+
+    if (UNLIKELY(ctx.io_error)) return ZXC_ERROR_IO;
+
+    return w_args.total_bytes;
+}
+
+/**
+ * @brief Stream-to-stream compression entry point (@c FILE* in, @c FILE* out).
+ *
+ * Public API; the full contract lives in @c zxc_stream.h. Each option (threads,
+ * level, block size, checksums, seekable, dictionary) falls back to its default
+ * when unset, then @ref zxc_stream_engine_run is driven in compression mode
+ * with the compress chunk processor.
+ *
+ * @param[in]  f_in   Input stream (must be non-NULL).
+ * @param[out] f_out  Output stream (NULL performs a dry run / size estimate).
+ * @param[in]  opts   Compression options, or NULL for all defaults.
+ * @return Total bytes written on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_stream_compress(FILE* f_in, FILE* f_out, const zxc_compress_opts_t* opts) {
+    if (UNLIKELY(f_in == NULL)) return ZXC_ERROR_NULL_INPUT;
+
+    /* NULL opts behaves like an all-zero struct: every field then takes its default. */
+    static const zxc_compress_opts_t zero_opts = {0};
+    const zxc_compress_opts_t* const o = (opts != NULL) ? opts : &zero_opts;
+
+    const int level = (o->level > 0) ? o->level : ZXC_LEVEL_DEFAULT;
+    const size_t block_size = (o->block_size > 0) ? o->block_size : ZXC_BLOCK_SIZE_DEFAULT;
+    if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE;
+
+    /* Dictionary size and Huffman table are honoured only when a dictionary is given. */
+    const uint8_t* const dict = (const uint8_t*)o->dict;
+    const size_t dict_size = (dict != NULL) ? o->dict_size : 0;
+    const uint8_t* const dict_huf = (dict != NULL) ? (const uint8_t*)o->dict_huf : NULL;
+    if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+    const int n_threads = o->n_threads, seekable = o->seekable;
+    const int checksum_enabled = o->checksum_enabled;
+    return zxc_stream_engine_run(f_in, f_out, n_threads, 1, level, block_size, checksum_enabled,
+                                 seekable, zxc_compress_chunk_wrapper, o->progress_cb,
+                                 o->user_data, dict, dict_size, dict_huf);
+}
+
+/**
+ * @brief Stream-to-stream decompression entry point (@c FILE* in, @c FILE* out).
+ *
+ * Public API; the full contract lives in @c zxc_stream.h. Resolves the options
+ * (threads, checksums, dictionary) and drives @ref zxc_stream_engine_run in
+ * decompression mode with the decompress chunk processor. Block size and level
+ * are passed as 0 here; they are recovered from the archive header, not @p opts.
+ *
+ * @param[in]  f_in   Input (compressed) stream (must be non-NULL).
+ * @param[out] f_out  Output (decompressed) stream.
+ * @param[in]  opts   Decompression options, or NULL for all defaults.
+ * @return Total bytes written on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts_t* opts) {
+    if (UNLIKELY(f_in == NULL)) return ZXC_ERROR_NULL_INPUT;
+
+    /* NULL opts behaves like an all-zero struct: every field then takes its default. */
+    static const zxc_decompress_opts_t zero_opts = {0};
+    const zxc_decompress_opts_t* const o = (opts != NULL) ? opts : &zero_opts;
+    /* Dictionary size and Huffman table are honoured only when a dictionary is given. */
+    const uint8_t* const dict = (const uint8_t*)o->dict;
+    const size_t dict_size = (dict != NULL) ? o->dict_size : 0;
+    const uint8_t* const dict_huf = (dict != NULL) ? (const uint8_t*)o->dict_huf : NULL;
+    const int n_threads = o->n_threads, checksum_enabled = o->checksum_enabled;
+    return zxc_stream_engine_run(f_in, f_out, n_threads, 0, 0, 0, checksum_enabled, 0,
+                                 (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper,
+                                 o->progress_cb, o->user_data, dict, dict_size, dict_huf);
+}
+
+/**
+ * @brief Reads the total decompressed size from an archive's footer.
+ *
+ * Public API; see @c zxc_stream.h. Validates the file magic, reads the 64-bit
+ * decompressed-size field from the footer, and restores the caller's original
+ * stream position before returning. Does not decompress any data.
+ *
+ * @param[in] f_in  Compressed stream (must be non-NULL and seekable).
+ * @return Decompressed size in bytes, or a negative @ref zxc_error_t:
+ *         @ref ZXC_ERROR_NULL_INPUT, @ref ZXC_ERROR_BAD_MAGIC,
+ *         @ref ZXC_ERROR_SRC_TOO_SMALL, @ref ZXC_ERROR_CORRUPT_DATA (footer size
+ *         exceeds @c INT64_MAX) or @ref ZXC_ERROR_IO (seek, tell or read failed).
+ */
+int64_t zxc_stream_get_decompressed_size(FILE* f_in) {
+    if (UNLIKELY(!f_in)) return ZXC_ERROR_NULL_INPUT;
+
+    const long long saved_pos = ftello(f_in);
+    if (UNLIKELY(saved_pos < 0)) return ZXC_ERROR_IO;
+
+    /* Every outcome below funnels into the single position restore at the end. */
+    int64_t result = ZXC_ERROR_IO;
+    uint8_t header[ZXC_FILE_HEADER_SIZE];
+    uint8_t footer[ZXC_FILE_FOOTER_SIZE];
+    long long file_size = -1;
+    if (fseeko(f_in, 0, SEEK_END) == 0) file_size = ftello(f_in);
+
+    if (UNLIKELY(file_size < 0)) {
+        /* Seek or tell failed: an I/O error, not a "file too small" condition. */
+        result = ZXC_ERROR_IO;
+    } else if (UNLIKELY(file_size < (long long)(ZXC_FILE_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE))) {
+        result = ZXC_ERROR_SRC_TOO_SMALL;
+    } else if (UNLIKELY(fseeko(f_in, 0, SEEK_SET) != 0 ||
+                        fread(header, 1, ZXC_FILE_HEADER_SIZE, f_in) != ZXC_FILE_HEADER_SIZE)) {
+        result = ZXC_ERROR_IO;
+    } else if (UNLIKELY(zxc_le32(header) != ZXC_MAGIC_WORD)) {
+        result = ZXC_ERROR_BAD_MAGIC;
+    } else if (UNLIKELY(fseeko(f_in, file_size - ZXC_FILE_FOOTER_SIZE, SEEK_SET) != 0 ||
+                        fread(footer, 1, ZXC_FILE_FOOTER_SIZE, f_in) != ZXC_FILE_FOOTER_SIZE)) {
+        result = ZXC_ERROR_IO;
+    } else {
+        /* Above INT64_MAX the cast would go negative and pass for an error code. */
+        const uint64_t raw_size = zxc_le64(footer);
+        result = (raw_size > (uint64_t)INT64_MAX) ? ZXC_ERROR_CORRUPT_DATA : (int64_t)raw_size;
+    }
+
+    /* The contract promises the original position: report a failed restore as well. */
+    if (UNLIKELY(fseeko(f_in, saved_pos, SEEK_SET) != 0) && result >= 0) result = ZXC_ERROR_IO;
+    return result;
+}
+
+/*
+ * ============================================================================
+ * SEEKABLE FILE* WRAPPER
+ * ============================================================================
+ * Adapts a FILE* into a thread-safe zxc_reader_t (pread on POSIX, ReadFile +
+ * OVERLAPPED on Windows) and delegates to zxc_seekable_open_reader.  Keeping
+ * this entry point alongside the stream driver, rather than in the kernel-
+ * safe zxc_seekable.c, means zxc_seekable.c stays freestanding.
+ */
+
+#if defined(_WIN32)
+/** @brief Reader context for the Win32 @c FILE* adapter (OS file handle + size). */
+typedef struct {
+    HANDLE handle; /* OS handle from _get_osfhandle(_fileno(f)); borrowed, not duplicated */
+    uint64_t size; /* file size in bytes, measured once when the archive is opened */
+} zxc_stdio_ctx_t;
+
+/**
+ * @brief Thread-safe positioned read backing the seekable @c FILE* reader.
+ *
+ * Win32 implementation: a positioned @c ReadFile via @c OVERLAPPED, so
+ * concurrent worker threads never race on a shared file cursor.
+ * @param[in]  vctx    @ref zxc_stdio_ctx_t carrying the file handle.
+ * @param[out] dst     Destination buffer (at least @p len bytes).
+ * @param[in]  len     Bytes to read; more than 0xFFFFFFFF is refused (DWORD count).
+ * @param[in]  offset  Absolute byte offset to read from.
+ * @return @p len on a full read, otherwise @ref ZXC_ERROR_IO.
+ */
+// LCOV_EXCL_START - Windows I/O path, not reachable on POSIX CI
+static int64_t zxc_stdio_read_at(void* vctx, void* dst, size_t len, uint64_t offset) {
+    zxc_stdio_ctx_t* const ctx = (zxc_stdio_ctx_t*)vctx;
+    if (UNLIKELY((uint64_t)len > 0xFFFFFFFFu)) return ZXC_ERROR_IO; /* ReadFile count is 32-bit */
+    OVERLAPPED ov;
+    ZXC_MEMSET(&ov, 0, sizeof(ov));
+    ov.Offset = (DWORD)(offset & 0xFFFFFFFFu);
+    ov.OffsetHigh = (DWORD)(offset >> 32);
+    DWORD bytes_read = 0;
+    if (!ReadFile(ctx->handle, dst, (DWORD)len, &bytes_read, &ov)) return ZXC_ERROR_IO;
+    return (bytes_read == (DWORD)len) ? (int64_t)len : ZXC_ERROR_IO;
+}
+// LCOV_EXCL_STOP
+
+#else  /* POSIX */
+/** @brief Reader context for the POSIX @c FILE* adapter (file descriptor + size). */
+typedef struct {
+    int fd;        /* descriptor from fileno(f); not dup()ed, valid only while f stays open */
+    uint64_t size; /* file size in bytes, measured once when the archive is opened */
+} zxc_stdio_ctx_t;
+
+/**
+ * @brief Thread-safe positioned read backing the seekable @c FILE* reader (POSIX).
+ *
+ * Issues @c pread, which carries its own offset, until all @p len bytes starting at
+ * absolute @p offset are in @p dst; a short read is resumed, not treated as failure.
+ * @param[in] vctx  @ref zxc_stdio_ctx_t carrying the file descriptor.
+ * @return @p len on a full read; @ref ZXC_ERROR_IO on EOF or any @c pread error.
+ */
+static int64_t zxc_stdio_read_at(void* vctx, void* dst, size_t len, uint64_t offset) {
+    zxc_stdio_ctx_t* const ctx = (zxc_stdio_ctx_t*)vctx;
+    for (size_t done = 0; done < len;) {
+        const ssize_t r = pread(ctx->fd, (uint8_t*)dst + done, len - done, (off_t)(offset + done));
+        if (UNLIKELY(r <= 0)) return ZXC_ERROR_IO; /* 0 = EOF before len bytes, -1 = error */
+        done += (size_t)r;
+    }
+    return (int64_t)len;
+}
+#endif /* _WIN32 */
+
+/**
+ * @brief Builds a seekable-archive handle on top of an already open @c FILE*.
+ *
+ * Public API; full contract in @c zxc_stream.h. Saves the stream position,
+ * measures the file by seeking to its end, puts the position back (best
+ * effort), then wraps the file in a positioned reader (@c pread on POSIX,
+ * @c ReadFile + @c OVERLAPPED on Windows) for @ref zxc_seekable_open_reader.
+ * On success the heap-allocated reader context is attached to the handle via
+ * @ref zxc_seekable_attach_owned_ctx, so @ref zxc_seekable_free releases it.
+ *
+ * @param[in] f  Open, seekable file handle.
+ * @return A handle to release with @ref zxc_seekable_free, or NULL on bad input,
+ *         an I/O error, or a missing / malformed seek table.
+ */
+zxc_seekable* zxc_seekable_open_file(FILE* f) {
+    if (UNLIKELY(f == NULL)) return NULL;
+
+    /* Remember where the caller's stream was; it is put back once the size is known. */
+    const long long origin = ftello(f);
+    if (UNLIKELY(origin < 0)) return NULL;  // LCOV_EXCL_LINE
+
+    // LCOV_EXCL_START - ftello/fseeko failure paths not reachable in CI
+    if (UNLIKELY(fseeko(f, 0, SEEK_END) != 0)) return NULL;
+    const long long total = ftello(f);
+    (void)fseeko(f, origin, SEEK_SET);
+    if (UNLIKELY(total <= 0)) return NULL;
+    // LCOV_EXCL_STOP
+
+    zxc_stdio_ctx_t* const rctx = (zxc_stdio_ctx_t*)ZXC_MALLOC(sizeof(zxc_stdio_ctx_t));
+    if (UNLIKELY(rctx == NULL)) return NULL;  // LCOV_EXCL_LINE
+
+    rctx->size = (uint64_t)total;
+#if defined(_WIN32)
+    rctx->handle = (HANDLE)(intptr_t)_get_osfhandle(_fileno(f));  // LCOV_EXCL_LINE
+#else
+    rctx->fd = fileno(f);
+#endif
+
+    /* Positioned-read adapter over the descriptor / handle captured above. */
+    const zxc_reader_t reader = {
+        .ctx = rctx, .size = rctx->size, .read_at = zxc_stdio_read_at};
+
+    zxc_seekable* const handle = zxc_seekable_open_reader(&reader);
+    if (handle != NULL) {
+        /* Success: the context's lifetime now belongs to the seekable handle. */
+        zxc_seekable_attach_owned_ctx(handle, rctx);
+    } else {
+        ZXC_FREE(rctx);
+    }
+    return handle;
+}
diff --git a/thirdparty/zxc/src/lib/zxc_huffman.c b/thirdparty/zxc/src/lib/zxc_huffman.c
new file mode 100644
index 000000000000..4a263ffcd5bd
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_huffman.c
@@ -0,0 +1,1008 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ */
+
+/**
+ * @file zxc_huffman.c
+ * @brief Canonical, length-limited Huffman codec for the GLO literal stream.
+ *
+ * Canonical, length-limited (ZXC_HUF_MAX_CODE_LEN) Huffman codec for the GLO literal
+ * stream at compression level >= 6. Codes are emitted LSB-first; the
+ * decoder uses a 2048-entry multi-symbol lookup table (11-bit lookup,
+ * 1 or 2 symbols per lookup depending on the cumulative code length)
+ * and a 4-way interleaved hot loop. Public declarations live in
+ * zxc_internal.h; the rest is private to this translation unit.
+ */
+
+/*
+ * Function Multi-Versioning Support
+ * If ZXC_FUNCTION_SUFFIX is defined (e.g. _avx2, _neon), rename the public
+ * entry points so each variant TU produces its own copy under a unique symbol
+ * (e.g. zxc_huf_decode_section_avx2). The runtime dispatcher in
+ * zxc_compress.c / zxc_decompress.c routes to the matching variant.
+ * Without ZXC_FUNCTION_SUFFIX nothing is renamed and the plain names are kept.
+ * The defines sit before zxc_internal.h so the header's prototypes are
+ * rewritten with the same suffix as the definitions below.
+ */
+#ifdef ZXC_FUNCTION_SUFFIX
+#define ZXC_CAT_IMPL(x, y) x##y          /* raw token paste */
+#define ZXC_CAT(x, y) ZXC_CAT_IMPL(x, y) /* extra level so the suffix macro expands first */
+#define zxc_huf_build_code_lengths ZXC_CAT(zxc_huf_build_code_lengths, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_encode_section ZXC_CAT(zxc_huf_encode_section, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_decode_section ZXC_CAT(zxc_huf_decode_section, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_encode_section_dict ZXC_CAT(zxc_huf_encode_section_dict, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_decode_section_dict ZXC_CAT(zxc_huf_decode_section_dict, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_build_dec_table ZXC_CAT(zxc_huf_build_dec_table, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_pack_lengths ZXC_CAT(zxc_huf_pack_lengths, ZXC_FUNCTION_SUFFIX)
+#define zxc_huf_unpack_lengths ZXC_CAT(zxc_huf_unpack_lengths, ZXC_FUNCTION_SUFFIX)
+#endif /* ZXC_FUNCTION_SUFFIX */
+
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/* Packs one decoder lookup entry. The entry type (zxc_huf_dec_entry_t) lives in
+ * zxc_internal.h so the compression context can carry a prebuilt table for the
+ * shared dictionary literal table. Arguments are not masked; bit layout:
+ * sym1(0..7) | sym2(8..15) | len1(16..19) | len_total(20..23) | n_extra(24). */
+#define ZXC_HUF_ENTRY(sym1, sym2, len1, len_total, n_extra)                  \
+    ((uint32_t)(sym1) | ((uint32_t)(sym2) << 8) | ((uint32_t)(len1) << 16) | \
+     ((uint32_t)(len_total) << 20) | ((uint32_t)(n_extra) << 24))
+
+/* ===========================================================================
+ * Length-limited Huffman: boundary package-merge
+ * ===========================================================================
+ *
+ * Builds optimal length-limited Huffman code lengths (max length
+ * ZXC_HUF_MAX_CODE_LEN) on 256-symbol alphabets. Package-merge is run for
+ * ZXC_HUF_MAX_CODE_LEN levels; each level holds up to 2N items (leaves +
+ * paired packages). Selection of the cheapest 2N - 2 items at level
+ * ZXC_HUF_MAX_CODE_LEN gives the appearance count of each leaf, which is
+ * its code length.
+ */
+
+typedef zxc_huf_pm_item_t pm_item_t; /* package-merge item: weight, left, right, sym */
+
+typedef struct {
+    uint32_t w;  /* leaf weight: the symbol's frequency */
+    int16_t sym; /* index of the symbol this leaf stands for */
+} pm_leaf_t;
+
+typedef zxc_huf_pm_frame_t frame_t; /* traceback stack frame: lvl, idx */
+
+/**
+ * @brief Order a `pm_leaf_t` array by ascending weight, then ascending symbol.
+ *
+ * A bucket sort keyed on `floor(log2(weight))` (32 buckets) groups the leaves
+ * by magnitude; an insertion sort then finishes each bucket. This replaces a
+ * libc `qsort` call, whose indirect comparator call dominated: frequency
+ * distributions cluster into ~10-14 magnitude buckets, so the per-bucket
+ * lists stay short and insertion sort is branch-friendly. The deterministic
+ * tie-break on `sym` happens inside the insertion sort.
+ *
+ * Precondition: @p n <= ZXC_HUF_NUM_SYMBOLS and every weight is > 0 (the caller
+ * filters out zero-frequency symbols before calling).
+ *
+ * @param[in,out] leaves  Leaf array, sorted in place (ascending weight, then
+ *                        ascending @c sym on ties).
+ * @param[in]     n       Number of leaves; @c n < 2 is effectively a no-op.
+ */
+static void pm_leaves_sort(pm_leaf_t* RESTRICT leaves, const int n) {
+    /* floor(log2(w)) of a non-zero 32-bit weight is 0..31: one bucket per value. */
+    enum { NUM_BUCKETS = 32 };
+    int start[NUM_BUCKETS + 1]; /* start[b + 1] doubles as the end of bucket b. */
+    int fill[NUM_BUCKETS];      /* histogram first, then per-bucket write cursor. */
+    uint8_t bucket_of[ZXC_HUF_NUM_SYMBOLS];
+    pm_leaf_t sorted[ZXC_HUF_NUM_SYMBOLS];
+
+    /* Pass 1: histogram of magnitudes, remembering every leaf's bucket. */
+    ZXC_MEMSET(fill, 0, sizeof(fill));
+    for (int i = 0; i < n; i++) {
+        const unsigned magnitude = zxc_log2_u32(leaves[i].w);
+        bucket_of[i] = (uint8_t)magnitude;
+        fill[magnitude]++;
+    }
+
+    /* Pass 2: running sum -> bucket start offsets; cursors rewind to the starts. */
+    start[0] = 0;
+    for (int b = 0; b < NUM_BUCKETS; b++) {
+        start[b + 1] = start[b] + fill[b];
+        fill[b] = start[b];
+    }
+
+    /* Pass 3: stable scatter of the leaves into bucket order. */
+    for (int i = 0; i < n; i++) {
+        const int slot = fill[bucket_of[i]]++;
+        sorted[slot] = leaves[i];
+    }
+
+    /* Pass 4: insertion sort inside each bucket, keyed on (w, sym). */
+    for (int b = 0; b < NUM_BUCKETS; b++) {
+        const int lo = start[b];
+        const int hi = start[b + 1];
+        for (int i = lo + 1; i < hi; i++) {
+            const pm_leaf_t cur = sorted[i];
+            int j = i;
+            for (; j > lo; j--) {
+                const pm_leaf_t* const prev = &sorted[j - 1];
+                if (prev->w < cur.w || (prev->w == cur.w && prev->sym <= cur.sym)) break;
+                sorted[j] = *prev;
+            }
+            sorted[j] = cur;
+        }
+    }
+    ZXC_MEMCPY(leaves, sorted, (size_t)n * sizeof(pm_leaf_t));
+}
+
+/**
+ * @brief Build length-limited canonical Huffman code lengths.
+ *
+ * Runs the package-merge algorithm capped at `ZXC_HUF_MAX_CODE_LEN` levels.
+ * Symbols with `freq[i] == 0` get `code_len[i] == 0`; every other symbol
+ * receives a length in `[1, ZXC_HUF_MAX_CODE_LEN]`. The single-present-symbol
+ * case is handled as a degenerate code of length 1.
+ *
+ * @param[in]     freq     Frequency table indexed by symbol (0..255).
+ * @param[out]    code_len Output code-length array, written in full.
+ * @param[in,out] scratch  Optional scratch of `ZXC_HUF_BUILD_SCRATCH_SIZE` bytes,
+ *                         overwritten as items / counts / stack regions. If `NULL`,
+ *                         working memory is allocated and freed within the call.
+ * @return `ZXC_OK` on success; `ZXC_ERROR_CORRUPT_DATA` when every frequency is
+ *         zero (`code_len` is then all zero); `ZXC_ERROR_MEMORY` when the
+ *         fallback allocation fails.
+ */
+int zxc_huf_build_code_lengths(const uint32_t* RESTRICT freq, uint8_t* RESTRICT code_len,
+                               void* RESTRICT scratch) {
+    ZXC_MEMSET(code_len, 0, ZXC_HUF_NUM_SYMBOLS); /* absent symbols keep length 0 */
+
+    pm_leaf_t leaves[ZXC_HUF_NUM_SYMBOLS];
+    int n = 0; /* number of symbols with a non-zero frequency */
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i++) {
+        if (freq[i] > 0) {
+            leaves[n].w = freq[i];
+            leaves[n].sym = (int16_t)i;
+            n++;
+        }
+    }
+    if (UNLIKELY(n == 0)) return ZXC_ERROR_CORRUPT_DATA; /* nothing to code */
+    if (n == 1) {
+        code_len[leaves[0].sym] = 1;
+        return ZXC_OK;
+    }
+
+    pm_leaves_sort(leaves, n);
+
+    /* n <= 256 <= 2^ZXC_HUF_MAX_CODE_LEN, so length-limit is always feasible. */
+    const int max_per_level = 2 * n; /* a level holds n leaves plus fewer than n packages */
+
+    /* Working buffers: either carve from caller-provided scratch (sized for
+     * the worst-case alphabet) or fall back to per-call malloc/free. */
+    pm_item_t* items;
+    int* counts;
+    frame_t* stack;
+    pm_item_t* owned_items = NULL;
+    int* owned_counts = NULL;
+    frame_t* owned_stack = NULL;
+    if (scratch) {
+        uint8_t* p = (uint8_t*)scratch;
+        items = (pm_item_t*)p; /* NOTE(review): relies on ZXC_HUF_PM_LEVEL_BOUND >= 2n; confirm */
+        p += (size_t)ZXC_HUF_MAX_CODE_LEN * (size_t)ZXC_HUF_PM_LEVEL_BOUND * sizeof(pm_item_t);
+        p = (uint8_t*)(((uintptr_t)p + 7u) & ~(uintptr_t)7u); /* round up to 8 bytes */
+        counts = (int*)p;
+        ZXC_MEMSET(counts, 0, (size_t)ZXC_HUF_MAX_CODE_LEN * sizeof(int));
+        p += (size_t)ZXC_HUF_MAX_CODE_LEN * sizeof(int);
+        p = (uint8_t*)(((uintptr_t)p + 7u) & ~(uintptr_t)7u); /* round up to 8 bytes */
+        stack = (frame_t*)p;
+    } else {
+        owned_items = (pm_item_t*)ZXC_MALLOC((size_t)ZXC_HUF_MAX_CODE_LEN * (size_t)max_per_level *
+                                             sizeof(pm_item_t));
+        owned_counts = (int*)ZXC_CALLOC((size_t)ZXC_HUF_MAX_CODE_LEN, sizeof(int));
+        owned_stack = (frame_t*)ZXC_MALLOC((size_t)ZXC_HUF_MAX_CODE_LEN * (size_t)max_per_level *
+                                           sizeof(frame_t));
+        if (UNLIKELY(!owned_items || !owned_counts || !owned_stack)) {
+            ZXC_FREE(owned_items);
+            ZXC_FREE(owned_counts);
+            ZXC_FREE(owned_stack);
+            return ZXC_ERROR_MEMORY;
+        }
+        items = owned_items;
+        counts = owned_counts;
+        stack = owned_stack;
+    }
+#define ITEM(k, i) items[(size_t)(k) * (size_t)max_per_level + (size_t)(i)] /* item i, level k */
+
+    /* Level 0 (logical level 1): the leaves themselves, already sorted. */
+    for (int i = 0; i < n; i++) {
+        ITEM(0, i).weight = leaves[i].w;
+        ITEM(0, i).left = -1;
+        ITEM(0, i).right = -1;
+        ITEM(0, i).sym = leaves[i].sym;
+    }
+    counts[0] = n;
+
+    /* Levels 1..ZXC_HUF_MAX_CODE_LEN-1: merge sorted leaves with sorted packages from the previous
+     * level. */
+    for (int k = 1; k < ZXC_HUF_MAX_CODE_LEN; k++) {
+        const int prev = counts[k - 1];
+        const int packs = prev / 2; /* adjacent pairs of the previous level */
+        int li = 0;
+        int pi = 0;
+        int n_lvl = 0;
+        while (li < n || pi < packs) {
+            const uint32_t wl = (li < n) ? leaves[li].w : UINT32_MAX;
+            const uint32_t wp =
+                (pi < packs)
+                    ? (uint32_t)(ITEM(k - 1, 2 * pi).weight + ITEM(k - 1, 2 * pi + 1).weight)
+                    : UINT32_MAX;
+            if (wl <= wp && li < n) { /* ties go to the leaf */
+                ITEM(k, n_lvl).weight = wl;
+                ITEM(k, n_lvl).left = -1;
+                ITEM(k, n_lvl).right = -1;
+                ITEM(k, n_lvl).sym = leaves[li].sym;
+                li++;
+            } else {
+                ITEM(k, n_lvl).weight = wp;
+                ITEM(k, n_lvl).left = (int16_t)(2 * pi);
+                ITEM(k, n_lvl).right = (int16_t)(2 * pi + 1);
+                ITEM(k, n_lvl).sym = -1;
+                pi++;
+            }
+            n_lvl++;
+        }
+        counts[k] = n_lvl;
+    }
+
+    /* Selection: take the first 2n-2 items at level ZXC_HUF_MAX_CODE_LEN-1 and trace them back;
+     * every appearance of a leaf adds one bit to that symbol's code length. */
+    int n_take = 2 * n - 2;
+    if (n_take > counts[ZXC_HUF_MAX_CODE_LEN - 1]) n_take = counts[ZXC_HUF_MAX_CODE_LEN - 1];
+
+    /* Worst case stack depth: (ZXC_HUF_MAX_CODE_LEN * n_take) frames; bounded by
+     * ZXC_HUF_MAX_CODE_LEN * 2n. `stack` was set up earlier from scratch (or
+     * the local malloc fallback). */
+    int sp = 0;
+    for (int i = 0; i < n_take; i++) {
+        stack[sp].lvl = (int8_t)(ZXC_HUF_MAX_CODE_LEN - 1);
+        stack[sp].idx = (int16_t)i;
+        sp++;
+    }
+    while (sp > 0) {
+        frame_t f = stack[--sp];
+        const pm_item_t* it = &ITEM(f.lvl, f.idx);
+        if (it->sym >= 0) {
+            code_len[it->sym]++; /* leaf reached: one more bit for this symbol */
+        } else { /* package: expand both halves one level down */
+            stack[sp].lvl = (int8_t)(f.lvl - 1);
+            stack[sp].idx = it->left;
+            sp++;
+            stack[sp].lvl = (int8_t)(f.lvl - 1);
+            stack[sp].idx = it->right;
+            sp++;
+        }
+    }
+
+    if (owned_items) {
+        ZXC_FREE(owned_items);
+        ZXC_FREE(owned_counts);
+        ZXC_FREE(owned_stack);
+    }
+#undef ITEM
+    return ZXC_OK;
+}
+
+/* ===========================================================================
+ * Canonical code construction (LSB-first by bit-reversing canonical MSB codes)
+ * =========================================================================*/
+
+/**
+ * @brief Reverse the low @p n bits of @p v.
+ *
+ * Used to convert MSB-first canonical Huffman codes (the natural form
+ * produced by the canonical-code construction) into LSB-first codes that
+ * can be packed into the bit writer with a single shift-or.
+ *
+ * @param[in] v Value whose low @p n bits will be reversed; higher bits are ignored.
+ * @param[in] n Number of significant bits in @p v (1..32); n <= 0 skips the loop and yields 0.
+ * @return The bit-reversed value, with bits above position @p n set to 0.
+ */
+static uint32_t reverse_bits(uint32_t v, const int n) {
+    uint32_t r = 0;
+    for (int i = 0; i < n; i++) {
+        r = (r << 1) | (v & 1u); /* append the current lowest bit of v to r */
+        v >>= 1;                 /* expose the next bit of v */
+    }
+    return r;
+}
+
+/**
+ * @brief Build the canonical LSB-first Huffman codes for a length table.
+ *
+ * Generates MSB-first canonical codes following RFC 1951 3.2.2, then
+ * bit-reverses each so the encoder can emit them with a plain
+ * `accum |= code << bits` step. Absent symbols (length 0) receive code 0.
+ *
+ * @param[in]  code_len ZXC_HUF_NUM_SYMBOLS lengths, each <= ZXC_HUF_MAX_CODE_LEN (not checked).
+ * @param[out] codes    ZXC_HUF_NUM_SYMBOLS LSB-first canonical codes (0 for absent symbols).
+ */
+static void build_canonical_codes(const uint8_t* RESTRICT code_len, uint32_t* RESTRICT codes) {
+    uint32_t bl_count[ZXC_HUF_MAX_CODE_LEN + 1] = {0};
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i++) {
+        bl_count[code_len[i]]++; /* the length indexes bl_count directly */
+    }
+    bl_count[0] = 0; /* absent symbols must not offset the first real code */
+
+    uint32_t next_code[ZXC_HUF_MAX_CODE_LEN + 2] = {0};
+    uint32_t code = 0;
+    for (int k = 1; k <= ZXC_HUF_MAX_CODE_LEN + 1; k++) {
+        code = (code + bl_count[k - 1]) << 1;
+        next_code[k] = code; /* first MSB-first code of length k */
+    }
+
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i++) {
+        const int l = code_len[i];
+        if (l == 0) {
+            codes[i] = 0;
+        } else {
+            const uint32_t msb_code = next_code[l]++; /* equal lengths get consecutive codes */
+            codes[i] = reverse_bits(msb_code, l);
+        }
+    }
+}
+
+/* ===========================================================================
+ * 128-byte length header: 256 x 4-bit lengths, low nibble first.
+ * =========================================================================*/
+
+/**
+ * @brief Pack 256 4-bit code lengths into the 128-byte section header.
+ *
+ * The packing is little-endian within each byte: low nibble holds
+ * `code_len[2*i]`, high nibble holds `code_len[2*i + 1]`. The function
+ * silently truncates any length > 15 to its low 4 bits; callers must enforce
+ * the cap of `ZXC_HUF_MAX_CODE_LEN` (<= 15) before calling.
+ *
+ * @param[in]  code_len Per-symbol code lengths (length `ZXC_HUF_NUM_SYMBOLS`).
+ * @param[out] out      Output header buffer of `ZXC_HUF_TABLE_SIZE` bytes.
+ */
+static void pack_lengths_header(const uint8_t* RESTRICT code_len, uint8_t* RESTRICT out) {
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i += 2) {
+        const uint8_t lo = code_len[i] & 0x0F;     /* even symbol -> low nibble */
+        const uint8_t hi = code_len[i + 1] & 0x0F; /* odd symbol -> high nibble */
+        out[i >> 1] = (uint8_t)(lo | (hi << 4));   /* one output byte per symbol pair */
+    }
+}
+
+/**
+ * @brief Decode the 128-byte length header back into 256 code lengths.
+ *
+ * Inverts ::pack_lengths_header and checks two structural invariants: no length exceeds
+ * `ZXC_HUF_MAX_CODE_LEN` and at least one symbol is present. Kraft validity is not checked
+ * here, and @p code_len is fully written before the checks run, even on failure.
+ *
+ * @param[in]  in       Input header buffer of `ZXC_HUF_TABLE_SIZE` bytes.
+ * @param[out] code_len Output code-length array of length `ZXC_HUF_NUM_SYMBOLS`.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` if a length is too
+ *         large or the table is empty.
+ */
+static int unpack_lengths_header(const uint8_t* RESTRICT in, uint8_t* RESTRICT code_len) {
+    int max_len = 0;
+    int n_present = 0;
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i += 2) {
+        const uint8_t b = in[i >> 1]; /* one header byte carries two symbols */
+        const uint8_t lo = b & 0x0F;
+        const uint8_t hi = (uint8_t)(b >> 4);
+        code_len[i] = lo;
+        code_len[i + 1] = hi;
+        if (lo > max_len) max_len = lo;
+        if (hi > max_len) max_len = hi;
+        if (lo) n_present++; /* a non-zero length marks a present symbol */
+        if (hi) n_present++;
+    }
+    if (UNLIKELY(max_len > ZXC_HUF_MAX_CODE_LEN)) return ZXC_ERROR_CORRUPT_DATA;
+    if (UNLIKELY(n_present == 0)) return ZXC_ERROR_CORRUPT_DATA;
+    return ZXC_OK;
+}
+
+/* ===========================================================================
+ * Bit writer (LSB-first)
+ * =========================================================================*/
+
+typedef struct {
+    uint8_t* ptr;   /* next output byte to write */
+    uint8_t* end;   /* one past the last writable byte */
+    uint64_t accum; /* pending bits, oldest bit in the LSB */
+    int bits;       /* number of valid bits in accum */
+    int err;        /* set to 1 by bw_put when the buffer is exhausted */
+} bit_writer_t;
+
+/**
+ * @brief Initialise an LSB-first bit writer over a caller-owned buffer.
+ *
+ * @param[out] bw  Writer to initialise; every field is overwritten.
+ * @param[out] dst Output buffer (writer takes no ownership).
+ * @param[in]  cap Capacity of @p dst in bytes.
+ */
+static ZXC_ALWAYS_INLINE void bw_init(bit_writer_t* RESTRICT bw, uint8_t* RESTRICT dst,
+                                      const size_t cap) {
+    bw->ptr = dst;
+    bw->end = dst + cap; /* one past the last writable byte */
+    bw->accum = 0;
+    bw->bits = 0;
+    bw->err = 0; /* no overflow seen yet */
+}
+
+/**
+ * @brief Append the low @p len bits of @p code to the writer's bitstream.
+ *
+ * Bits are consumed from the LSB end. Every call stores the accumulator at `ptr`
+ * (8 bytes when that much room remains, otherwise 1 byte) and then advances `ptr`
+ * past the bytes that are complete. If no room is left the writer's `err` flag is
+ * set; subsequent ::bw_finish reports `ZXC_ERROR_DST_TOO_SMALL`.
+ *
+ * @param[in,out] bw   Writer state.
+ * @param[in]     code Code bits to emit; not masked, so bits at or above @p len must be 0.
+ * @param[in]     len  Number of bits to emit (1..ZXC_HUF_MAX_CODE_LEN).
+ */
+static ZXC_ALWAYS_INLINE void bw_put(bit_writer_t* RESTRICT bw, const uint32_t code,
+                                     const int len) {
+    bw->accum |= ((uint64_t)code) << bw->bits;
+    bw->bits += len;
+    if (LIKELY((size_t)(bw->end - bw->ptr) >= sizeof(uint64_t))) {
+        zxc_store_le64(bw->ptr, bw->accum); /* speculative store; only n bytes are committed */
+    } else {
+        if (UNLIKELY(bw->ptr >= bw->end)) {
+            bw->err = 1;
+            bw->bits = 0; /* drop pending bits so bw_finish does not try to write them */
+            return;
+        }
+        *bw->ptr = (uint8_t)bw->accum; /* near the end: store just the low byte */
+    }
+    const int n = bw->bits >> 3; /* 0 or 1 full byte to flush (bits <= 7 on entry, needs len <= 8) */
+    bw->ptr += n;
+    bw->accum >>= n << 3;
+    bw->bits &= 7;
+}
+
+/**
+ * @brief Flush any partial trailing byte and finalise the bit writer.
+ *
+ * Writes the (zero-padded) trailing byte if the accumulator holds any bits.
+ *
+ * @param[in,out] bw Writer state; `ptr` ends one past the last byte written.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_DST_TOO_SMALL` if the buffer was
+ *         exhausted at any point (including for the trailing byte).
+ */
+static ZXC_ALWAYS_INLINE int bw_finish(bit_writer_t* RESTRICT bw) {
+    if (bw->bits > 0) {
+        if (UNLIKELY(bw->ptr >= bw->end)) return ZXC_ERROR_DST_TOO_SMALL;
+        *bw->ptr++ = (uint8_t)bw->accum; /* unused high bits of this byte are 0 */
+        bw->accum = 0;
+        bw->bits = 0;
+    }
+    return UNLIKELY(bw->err) ? ZXC_ERROR_DST_TOO_SMALL : ZXC_OK; /* overflow seen by bw_put */
+}
+
+/* ===========================================================================
+ * Encoder
+ * =========================================================================*/
+
+/**
+ * @brief Shared encoder body: 6-byte sub-stream sizes header + 4 sub-streams stored
+ *        back to back, written at @p dst. The 128-byte lengths header, when
+ *        wanted, is the caller's business (see the two public wrappers).
+ *
+ * @param[in]  literals    Source literal bytes (must not alias @p dst).
+ * @param[in]  n_literals  Number of source bytes (must be > 0).
+ * @param[in]  code_len    Per-symbol code lengths for the canonical codes.
+ * @param[out] dst         Destination for the sizes header + sub-streams.
+ * @param[in]  dst_cap     Capacity of @p dst in bytes.
+ * @return Bytes written (>= ZXC_HUF_STREAM_SIZES_HEADER_SIZE) on success, negative `zxc_error_t`
+ *         on failure (empty input, literal without a code, @p dst full, or a size above 0xFFFF).
+ */
+static int zxc_huf_encode_streams(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                  const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                  const size_t dst_cap) {
+    if (UNLIKELY(n_literals == 0)) return ZXC_ERROR_CORRUPT_DATA;
+    if (UNLIKELY(dst_cap < (size_t)ZXC_HUF_STREAM_SIZES_HEADER_SIZE))
+        return ZXC_ERROR_DST_TOO_SMALL;
+
+    /* 1. Build canonical codes (LSB-first via bit-reversal). */
+    uint32_t codes[ZXC_HUF_NUM_SYMBOLS];
+    build_canonical_codes(code_len, codes);
+
+    /* 2. Reserve 6 bytes for sub-stream sizes; encode 4 sub-streams after them. */
+    uint8_t* const sizes_hdr = dst;
+    uint8_t* const stream_base = sizes_hdr + ZXC_HUF_STREAM_SIZES_HEADER_SIZE;
+    const uint8_t* const stream_end = dst + dst_cap;
+
+    const size_t Q = (n_literals + ZXC_HUF_NUM_STREAMS - 1) / ZXC_HUF_NUM_STREAMS; /* ceil */
+
+    bit_writer_t bw;
+    uint8_t* p = stream_base;
+    size_t s_sizes[ZXC_HUF_NUM_STREAMS];
+
+    for (int s = 0; s < ZXC_HUF_NUM_STREAMS; s++) {
+        const size_t start = (size_t)s * Q;
+        size_t stop = start + Q;
+        if (stop > n_literals) stop = n_literals; /* trailing streams may be short or empty */
+
+        const uint8_t* const stream_start = p;
+        bw_init(&bw, p, (size_t)(stream_end - p)); /* each stream gets all remaining room */
+        for (size_t i = start; i < stop; i++) {
+            const uint8_t sym = literals[i];
+            const int len = code_len[sym];
+            if (UNLIKELY(len == 0)) return ZXC_ERROR_CORRUPT_DATA; /* symbol absent from table */
+            bw_put(&bw, codes[sym], len);
+        }
+        const int rc = bw_finish(&bw);
+        if (UNLIKELY(rc != ZXC_OK)) return rc;
+        s_sizes[s] = (size_t)(bw.ptr - stream_start);
+        p = bw.ptr; /* the next stream starts right after this one */
+    }
+
+    /* 3. Persist the 3 explicit sub-stream sizes (s4 is implied). */
+    for (int s = 0; s < ZXC_HUF_NUM_STREAMS - 1; s++) {
+        if (UNLIKELY(s_sizes[s] > 0xFFFFu)) return ZXC_ERROR_DST_TOO_SMALL; /* 16-bit field */
+        zxc_store_le16(sizes_hdr + 2 * s, (uint16_t)s_sizes[s]);
+    }
+
+    return (int)(p - dst);
+}
+
+/**
+ * @brief Encode the literal stream into a full Huffman section payload.
+ *
+ * Packs the 128-byte lengths header, then delegates to @ref zxc_huf_encode_streams for the
+ * 6-byte sub-stream sizes and the 4 LSB-first bit-streams. The lengths header is written
+ * before the streams are encoded, so @p dst may be modified even when an error is returned.
+ *
+ * @param[in]  literals    Source literal bytes (must not alias @p dst).
+ * @param[in]  n_literals  Number of source bytes (must be > 0).
+ * @param[in]  code_len    Per-symbol code lengths (see @ref zxc_huf_build_code_lengths).
+ * @param[out] dst         Destination buffer for the section payload.
+ * @param[in]  dst_cap     Capacity of @p dst in bytes.
+ * @return Total bytes written on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_encode_section(const uint8_t* RESTRICT literals, const size_t n_literals,
+                           const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                           const size_t dst_cap) {
+    if (UNLIKELY(n_literals == 0)) return ZXC_ERROR_CORRUPT_DATA;
+    if (UNLIKELY(dst_cap < ZXC_HUF_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    /* Pack the 128-byte length header, then the streams after it. */
+    pack_lengths_header(code_len, dst);
+    const int rc = zxc_huf_encode_streams(literals, n_literals, code_len, dst + ZXC_HUF_TABLE_SIZE,
+                                          dst_cap - ZXC_HUF_TABLE_SIZE);
+    return (rc < 0) ? rc : rc + ZXC_HUF_TABLE_SIZE; /* count the lengths header too */
+}
+
+/**
+ * @brief Encode a literal section using supplied code lengths, WITHOUT the
+ *        128-byte lengths header (shared dictionary table).
+ *
+ * Emits only the 6-byte sub-stream sizes header + 4 sub-streams (a thin pass
+ * through @ref zxc_huf_encode_streams); the lengths live in the dictionary.
+ *
+ * @param[in]  literals    Source literal bytes (must not alias @p dst).
+ * @param[in]  n_literals  Number of source bytes (must be > 0).
+ * @param[in]  code_len    Per-symbol code lengths from the shared dict table.
+ * @param[out] dst         Destination buffer for the section payload.
+ * @param[in]  dst_cap     Capacity of @p dst in bytes.
+ * @return Bytes written on success, negative `zxc_error_t` on failure (incl.
+ *         `ZXC_ERROR_CORRUPT_DATA` if a literal has no code or @p n_literals is 0).
+ */
+int zxc_huf_encode_section_dict(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                const size_t dst_cap) {
+    return zxc_huf_encode_streams(literals, n_literals, code_len, dst, dst_cap); /* pass-through */
+}
+
+/* ===========================================================================
+ * Decoder table builder + 4-way interleaved decoder
+ * =========================================================================*/
+
+/**
+ * @brief Build the 2048-entry multi-symbol decoder lookup table.
+ *
+ * Strategy: build a temporary 256-entry single-symbol (8-bit) table, then
+ * use it to populate the 2048-entry (11-bit) multi-symbol table. For each
+ * 11-bit prefix p:
+ *   1. (sym1, len1) = ss[p & 0xFF]   -- always valid, 1 <= len1 <= 8.
+ *   2. rem = 11 - len1, in [3, 10]: bits that remain after consuming the first code.
+ *   3. (sym2_cand, len2_cand) = ss[(p >> len1) & 0xFF]. If len2_cand <= rem,
+ *      both codes fit in 11 bits -> encode 2-symbol entry. Otherwise the
+ *      second code's bit window extends past the lookup width -> keep only
+ *      the first symbol and let the next iteration handle the rest.
+ *
+ * Validates Kraft equality (or the single-present-symbol degenerate case).
+ *
+ * @param[in]  code_len Per-symbol code lengths from the section header.
+ * @param[out] table    Destination 2048-entry lookup table (caller-aligned).
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on validation failure (table untouched).
+ */
+static int build_decode_table(const uint8_t* RESTRICT code_len,
+                              zxc_huf_dec_entry_t* RESTRICT table) {
+    uint32_t bl_count[ZXC_HUF_MAX_CODE_LEN + 1] = {0};
+    int n_present = 0;
+    for (int i = 0; i < ZXC_HUF_NUM_SYMBOLS; i++) {
+        const uint8_t l = code_len[i];
+        if (UNLIKELY(l > ZXC_HUF_MAX_CODE_LEN)) return ZXC_ERROR_CORRUPT_DATA;
+        bl_count[l]++;
+        if (l) n_present++;
+    }
+    if (UNLIKELY(n_present == 0)) return ZXC_ERROR_CORRUPT_DATA;
+    bl_count[0] = 0; /* absent symbols take no part in code assignment */
+
+    /* Validate Kraft equality on the ZXC_HUF_MAX_CODE_LEN axis. */
+    {
+        uint64_t kraft = 0;
+        for (int k = 1; k <= ZXC_HUF_MAX_CODE_LEN; k++) {
+            kraft += (uint64_t)bl_count[k] << (ZXC_HUF_MAX_CODE_LEN - k);
+        }
+        /* Degenerate: single symbol with length 1 (Kraft sum =
+         * 2^(ZXC_HUF_MAX_CODE_LEN-1)). Otherwise: full Kraft equality
+         * on the ZXC_HUF_MAX_CODE_LEN axis. */
+        const int kraft_ok = (n_present == 1) ? (bl_count[1] == 1)
+                                              : (kraft == ((uint64_t)1 << ZXC_HUF_MAX_CODE_LEN));
+        if (UNLIKELY(!kraft_ok)) return ZXC_ERROR_CORRUPT_DATA;
+    }
+
+    uint32_t next_code[ZXC_HUF_MAX_CODE_LEN + 2] = {0};
+    {
+        uint32_t code = 0;
+        for (int k = 1; k <= ZXC_HUF_MAX_CODE_LEN + 1; k++) {
+            code = (code + bl_count[k - 1]) << 1;
+            next_code[k] = code; /* first MSB-first code of length k */
+        }
+    }
+
+    /* Single-symbol intermediate (ZXC_HUF_MAX_CODE_LEN-bit lookup). Layout:
+     * low byte = sym, high byte = len. Filled by replicating each canonical
+     * code across all ZXC_HUF_MAX_CODE_LEN-bit windows that share its low
+     * `len` bits. */
+#define ZXC_HUF_SS_SIZE (1u << ZXC_HUF_MAX_CODE_LEN)
+#define ZXC_HUF_SS_MASK ((uint32_t)(ZXC_HUF_SS_SIZE - 1))
+    uint16_t ss[ZXC_HUF_SS_SIZE] = {0};
+
+    for (int sym = 0; sym < ZXC_HUF_NUM_SYMBOLS; sym++) {
+        const int l = code_len[sym];
+        if (l == 0) continue;
+        const uint32_t msb_code = next_code[l]++;
+        const uint32_t lsb_code = reverse_bits(msb_code, l);
+        const uint16_t entry = (uint16_t)((unsigned)l << 8 | (unsigned)sym);
+        const uint32_t step = (uint32_t)1 << l; /* windows sharing the low l bits are 2^l apart */
+        for (uint32_t fill = lsb_code; fill < ZXC_HUF_SS_SIZE; fill += step) {
+            ss[fill] = entry;
+        }
+    }
+
+    /* Single-symbol degenerate (Kraft sum = 2^(ZXC_HUF_MAX_CODE_LEN-1)): replicate the one
+     * valid entry across every slot. */
+    if (UNLIKELY(n_present == 1)) {
+        uint16_t valid = 0;
+        for (uint32_t i = 0; i < ZXC_HUF_SS_SIZE; i++) {
+            if (ss[i] != 0) { /* an entry is never 0 once filled: its length byte is >= 1 */
+                valid = ss[i];
+                break;
+            }
+        }
+        for (uint32_t i = 0; i < ZXC_HUF_SS_SIZE; i++) {
+            if (ss[i] == 0) ss[i] = valid;
+        }
+    }
+
+    /* Build the multi-symbol table. */
+    for (uint32_t p = 0; p < ZXC_HUF_DEC_TABLE_SIZE; p++) {
+        const uint16_t e1 = ss[p & ZXC_HUF_SS_MASK];
+        const uint8_t sym1 = (uint8_t)e1;
+        const int len1 = e1 >> 8;
+        const int rem = ZXC_HUF_LOOKUP_BITS - len1; /* window bits left after the first code */
+
+        uint8_t sym2 = 0;
+        int len_total = len1;
+        int n_extra = 0;
+
+        const uint16_t e2 = ss[(p >> len1) & ZXC_HUF_SS_MASK];
+        const int len2 = e2 >> 8;
+        if (len2 <= rem) { /* the second code lies entirely inside the lookup window */
+            sym2 = (uint8_t)e2;
+            len_total = len1 + len2;
+            n_extra = 1;
+        }
+
+        table[p].entry = ZXC_HUF_ENTRY(sym1, sym2, len1, len_total, n_extra);
+    }
+#undef ZXC_HUF_SS_SIZE
+#undef ZXC_HUF_SS_MASK
+
+    return ZXC_OK;
+}
+
+/**
+ * @brief Shared decoder body: parses the 6-byte sub-stream sizes header at
+ *        @p payload and runs the 4-way interleaved decode with @p table.
+ *        The 128-byte lengths header, when present, has already been consumed
+ *        by the caller (see the two public wrappers).
+ *
+ * @param[in]  payload       Sizes header followed by the 4 sub-streams.
+ * @param[in]  payload_size  Size of @p payload in bytes.
+ * @param[out] dst           Destination for the decoded literals (@p n_literals bytes).
+ * @param[in]  n_literals    Number of literals to decode (must be > 0).
+ * @param[in]  table         Multi-symbol decode table built for this section.
+ * @return @c ZXC_OK, or @c ZXC_ERROR_CORRUPT_DATA for bad sizes; the decode loops report no errors.
+ */
+static int zxc_huf_decode_streams(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                  uint8_t* RESTRICT dst, const size_t n_literals,
+                                  const zxc_huf_dec_entry_t* RESTRICT table) {
+    if (UNLIKELY(payload_size < (size_t)ZXC_HUF_STREAM_SIZES_HEADER_SIZE || n_literals == 0))
+        return ZXC_ERROR_CORRUPT_DATA;
+
+    /* 1. Parse sub-stream sizes (s4 is whatever remains of the payload). */
+    const uint8_t* const sizes_hdr = payload;
+    const uint16_t s1 = zxc_le16(sizes_hdr + 0);
+    const uint16_t s2 = zxc_le16(sizes_hdr + 2);
+    const uint16_t s3 = zxc_le16(sizes_hdr + 4);
+
+    const size_t streams_total = payload_size - ZXC_HUF_STREAM_SIZES_HEADER_SIZE;
+    const size_t s123 = (size_t)s1 + (size_t)s2 + (size_t)s3;
+    if (UNLIKELY(s123 > streams_total)) return ZXC_ERROR_CORRUPT_DATA;
+    const size_t s4 = streams_total - s123;
+
+    const uint8_t* const stream_base = payload + ZXC_HUF_STREAM_SIZES_HEADER_SIZE;
+    const size_t off[ZXC_HUF_NUM_STREAMS] = {0, s1, (size_t)s1 + s2, s123};
+    const size_t sz[ZXC_HUF_NUM_STREAMS] = {s1, s2, s3, s4};
+
+    /* 2. Initialise 4 bit readers. */
+    zxc_bit_reader_t br[ZXC_HUF_NUM_STREAMS];
+    for (int s = 0; s < ZXC_HUF_NUM_STREAMS; s++) {
+        zxc_br_init(&br[s], stream_base + off[s], sz[s]);
+    }
+
+    /* 3. 4-way interleaved multi-symbol decode. Each sub-stream owns a
+     * contiguous slice of dst: stream s covers literal indices
+     * [min(s*Q, N), min((s+1)*Q, N)) with Q = ceil(N/4). For small N the
+     * trailing streams can be short or empty; the clamps below handle that. */
+    const size_t Q = (n_literals + ZXC_HUF_NUM_STREAMS - 1) / ZXC_HUF_NUM_STREAMS;
+    size_t s_count[ZXC_HUF_NUM_STREAMS];
+    uint8_t* s_dst[ZXC_HUF_NUM_STREAMS];
+    for (int s = 0; s < ZXC_HUF_NUM_STREAMS; s++) {
+        size_t start = (size_t)s * Q;
+        size_t stop = start + Q;
+        if (start > n_literals) start = n_literals;
+        if (stop > n_literals) stop = n_literals;
+        s_count[s] = stop - start;
+        s_dst[s] = dst + start;
+    }
+
+    /* Batched multi-symbol decode. Each ZXC_HUF_BATCH iterations consume
+     * <= ZXC_HUF_BATCH_BITS bits per stream, fitting under the 57-bit cap
+     * an 8-byte refill can guarantee.
+     *
+     * Each iter speculatively writes 2 bytes per stream and advances by 1
+     * or 2. If only 1 symbol was decoded, the spec byte is overwritten by
+     * the next iter, except at end-of-stream where it would corrupt the
+     * adjacent stream. The batched loop therefore requires
+     * ZXC_HUF_SAFE_MARGIN bytes of headroom per stream. */
+
+    uint8_t* d0 = s_dst[0];
+    uint8_t* d1 = s_dst[1];
+    uint8_t* d2 = s_dst[2];
+    uint8_t* d3 = s_dst[3];
+
+    const uint8_t* const dend0 = s_dst[0] + s_count[0];
+    const uint8_t* const dend1 = s_dst[1] + s_count[1];
+    const uint8_t* const dend2 = s_dst[2] + s_count[2];
+    const uint8_t* const dend3 = s_dst[3] + s_count[3];
+
+    /* Hoist all four bit-reader hot fields into locals. They live in
+     * registers for the full duration of the batched loop. */
+    uint64_t a0 = br[0].accum;
+    uint64_t a1 = br[1].accum;
+    uint64_t a2 = br[2].accum;
+    uint64_t a3 = br[3].accum;
+    int bb0 = br[0].bits;
+    int bb1 = br[1].bits;
+    int bb2 = br[2].bits;
+    int bb3 = br[3].bits;
+    const uint8_t* p0 = br[0].ptr;
+    const uint8_t* p1 = br[1].ptr;
+    const uint8_t* p2 = br[2].ptr;
+    const uint8_t* p3 = br[3].ptr;
+    const uint8_t* const e0 = br[0].end;
+    const uint8_t* const e1 = br[1].end;
+    const uint8_t* const e2 = br[2].end;
+    const uint8_t* const e3 = br[3].end;
+
+    /* Refill the bit accumulator with up to (ZXC_HUF_ACCUM_BITS - nbits) more
+     * bits read from src. Fast path reads 8 bytes at once (LE u64 load); slow
+     * path reads byte-by-byte while at least one byte of free room remains. */
+#define REFILL(accum, nbits, src, src_end)                                                   \
+    do {                                                                                     \
+        if (LIKELY((nbits) < ZXC_HUF_BATCH_BITS && (src) + sizeof(uint64_t) <= (src_end))) { \
+            (accum) |= zxc_le64(src) << (nbits);                                             \
+            const int _n = (ZXC_HUF_ACCUM_BITS - (nbits)) / CHAR_BIT;                        \
+            (src) += _n;                                                                     \
+            (nbits) += _n * CHAR_BIT;                                                        \
+        } else {                                                                             \
+            while ((nbits) <= ZXC_HUF_ACCUM_BITS - CHAR_BIT && (src) < (src_end)) {          \
+                (accum) |= ((uint64_t)*(src)++) << (nbits);                                  \
+                (nbits) += CHAR_BIT;                                                         \
+            }                                                                                \
+        }                                                                                    \
+    } while (0)
+
+    /* Decode one 11-bit window per stream. Always writes 2 bytes per stream
+     * (sym1 + spec sym2); advances d_s by 1 + n_extra; advances accum by
+     * len_total. Per-stream length accumulators sl0..sl3 collect consumed
+     * bits across the batch and are folded into bb_s once at end of batch. */
+#define DECODE_ONE()                                             \
+    do {                                                         \
+        const uint32_t _e0 = table[a0 & ZXC_HUF_TBL_MASK].entry; \
+        const uint32_t _e1 = table[a1 & ZXC_HUF_TBL_MASK].entry; \
+        const uint32_t _e2 = table[a2 & ZXC_HUF_TBL_MASK].entry; \
+        const uint32_t _e3 = table[a3 & ZXC_HUF_TBL_MASK].entry; \
+        zxc_store_le16(d0, (uint16_t)_e0);                       \
+        zxc_store_le16(d1, (uint16_t)_e1);                       \
+        zxc_store_le16(d2, (uint16_t)_e2);                       \
+        zxc_store_le16(d3, (uint16_t)_e3);                       \
+        const int _t0 = (int)((_e0 >> 20) & 0xF);                \
+        const int _t1 = (int)((_e1 >> 20) & 0xF);                \
+        const int _t2 = (int)((_e2 >> 20) & 0xF);                \
+        const int _t3 = (int)((_e3 >> 20) & 0xF);                \
+        d0 += 1 + (int)((_e0 >> 24) & 1);                        \
+        d1 += 1 + (int)((_e1 >> 24) & 1);                        \
+        d2 += 1 + (int)((_e2 >> 24) & 1);                        \
+        d3 += 1 + (int)((_e3 >> 24) & 1);                        \
+        a0 >>= _t0;                                              \
+        a1 >>= _t1;                                              \
+        a2 >>= _t2;                                              \
+        a3 >>= _t3;                                              \
+        sl0 += _t0;                                              \
+        sl1 += _t1;                                              \
+        sl2 += _t2;                                              \
+        sl3 += _t3;                                              \
+    } while (0)
+
+    while ((size_t)(dend0 - d0) >= ZXC_HUF_SAFE_MARGIN &&
+           (size_t)(dend1 - d1) >= ZXC_HUF_SAFE_MARGIN &&
+           (size_t)(dend2 - d2) >= ZXC_HUF_SAFE_MARGIN &&
+           (size_t)(dend3 - d3) >= ZXC_HUF_SAFE_MARGIN) {
+        REFILL(a0, bb0, p0, e0);
+        REFILL(a1, bb1, p1, e1);
+        REFILL(a2, bb2, p2, e2);
+        REFILL(a3, bb3, p3, e3);
+
+        int sl0 = 0;
+        int sl1 = 0;
+        int sl2 = 0;
+        int sl3 = 0;
+        DECODE_ONE();
+        DECODE_ONE();
+        DECODE_ONE();
+        DECODE_ONE();
+        DECODE_ONE();
+        bb0 -= sl0;
+        bb1 -= sl1;
+        bb2 -= sl2;
+        bb3 -= sl3;
+    }
+
+    /* Per-stream scalar tail: finishes whatever the batched loop left, which can
+     * exceed ZXC_HUF_SAFE_MARGIN - 1 symbols for a stream when another stream ended
+     * the loop first. Reads sym1 + len1 only and advances by 1 byte, no spec write. */
+#define TAIL_ONE(accum, nbits, src, src_end, dst)                    \
+    do {                                                             \
+        REFILL(accum, nbits, src, src_end);                          \
+        const uint32_t _e = table[(accum) & ZXC_HUF_TBL_MASK].entry; \
+        *(dst)++ = (uint8_t)_e;                                      \
+        const int _l1 = (int)((_e >> 16) & 0xF);                     \
+        (accum) >>= _l1;                                             \
+        (nbits) -= _l1;                                              \
+    } while (0)
+
+    while (d0 < dend0) TAIL_ONE(a0, bb0, p0, e0, d0);
+    while (d1 < dend1) TAIL_ONE(a1, bb1, p1, e1, d1);
+    while (d2 < dend2) TAIL_ONE(a2, bb2, p2, e2, d2);
+    while (d3 < dend3) TAIL_ONE(a3, bb3, p3, e3, d3);
+
+#undef TAIL_ONE
+#undef DECODE_ONE
+#undef REFILL
+    return ZXC_OK;
+}
+
+/**
+ * @brief Decode a full Huffman literal section payload.
+ *
+ * Unpacks the 128-byte lengths header, builds the multi-symbol decode table,
+ * then runs the 4-way interleaved decode, writing exactly @p n_literals bytes.
+ *
+ * @param[in]  payload       Section payload (lengths header + sizes + 4 sub-streams).
+ * @param[in]  payload_size  Total payload length in bytes.
+ * @param[out] dst           Destination buffer of >= @p n_literals bytes (must not alias @p payload).
+ * @param[in]  n_literals    Expected number of decoded bytes (must be > 0).
+ * @return `ZXC_OK` on success, negative `zxc_error_t` on failure.
+ */
+int zxc_huf_decode_section(const uint8_t* RESTRICT payload, const size_t payload_size,
+                           uint8_t* RESTRICT dst, const size_t n_literals) {
+    if (UNLIKELY(payload_size < ZXC_HUF_HEADER_SIZE || n_literals == 0))
+        return ZXC_ERROR_CORRUPT_DATA;
+
+    /* 1. Parse length header. */
+    uint8_t code_len[ZXC_HUF_NUM_SYMBOLS];
+    {
+        const int rc = unpack_lengths_header(payload, code_len);
+        if (UNLIKELY(rc != ZXC_OK)) return rc;
+    }
+
+    /* 2. Build the 2048-entry multi-symbol decode table. Cache-line
+     * aligned: the LUT spans 128 lines (8 KB / 64 B) and is hammered every
+     * symbol, landing it on a 64-byte boundary avoids any cross-line
+     * load split on the per-iteration entry fetch. */
+    ZXC_ALIGN(ZXC_CACHE_LINE_SIZE) zxc_huf_dec_entry_t table[ZXC_HUF_DEC_TABLE_SIZE];
+    {
+        const int rc = build_decode_table(code_len, table); /* also validates Kraft equality */
+        if (UNLIKELY(rc != ZXC_OK)) return rc;
+    }
+
+    /* 3. Decode the 4 interleaved sub-streams that follow the lengths header. */
+    return zxc_huf_decode_streams(payload + ZXC_HUF_TABLE_SIZE, payload_size - ZXC_HUF_TABLE_SIZE,
+                                  dst, n_literals, table);
+}
+
+/**
+ * @brief Decode a literal section that carries no lengths header, using a
+ *        prebuilt decode table (shared dictionary table).
+ *
+ * @param[in]  payload       Section payload (6-byte sizes header + 4 sub-streams).
+ * @param[in]  payload_size  Total payload length in bytes.
+ * @param[out] dst           Destination buffer of >= @p n_literals bytes (must not alias @p payload).
+ * @param[in]  n_literals    Expected number of decoded bytes (must be > 0).
+ * @param[in]  table         Prebuilt @ref ZXC_HUF_DEC_TABLE_SIZE-entry decode table.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` if @p table is NULL or
+ *         the sizes header is malformed.
+ */
+int zxc_huf_decode_section_dict(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                uint8_t* RESTRICT dst, const size_t n_literals,
+                                const zxc_huf_dec_entry_t* RESTRICT table) {
+    if (UNLIKELY(table == NULL)) return ZXC_ERROR_CORRUPT_DATA; /* only the pointer is checked */
+    return zxc_huf_decode_streams(payload, payload_size, dst, n_literals, table);
+}
+
+/**
+ * @brief Build the @ref ZXC_HUF_DEC_TABLE_SIZE-entry decode table from
+ *        per-symbol code lengths. Validates Kraft equality.
+ *
+ * Public wrapper around @ref build_decode_table; a lone present symbol must have length 1.
+ *
+ * @param[in]  code_len  Per-symbol code lengths (`ZXC_HUF_NUM_SYMBOLS` entries).
+ * @param[out] table     Destination decode table (caller-aligned).
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on invalid lengths.
+ */
+int zxc_huf_build_dec_table(const uint8_t* RESTRICT code_len, zxc_huf_dec_entry_t* RESTRICT table) {
+    return build_decode_table(code_len, table);
+}
+
+/**
+ * @brief Pack per-symbol code lengths into the 128-byte (4-bit nibble) header.
+ *
+ * @param[in]  code_len  Per-symbol code lengths (one byte each; only the low 4 bits are stored).
+ * @param[out] out       Destination 128-byte packed header (even symbol in the low nibble).
+ */
+void zxc_huf_pack_lengths(const uint8_t* RESTRICT code_len, uint8_t* RESTRICT out) {
+    pack_lengths_header(code_len, out);
+}
+
+/**
+ * @brief Unpack and structurally validate a 128-byte packed lengths header.
+ *
+ * @param[in]  in        128-byte packed lengths header.
+ * @param[out] code_len  Destination per-symbol code lengths (written even when validation fails).
+ * @return `ZXC_OK`, or `ZXC_ERROR_CORRUPT_DATA` if a length is too large or every length is 0.
+ */
+int zxc_huf_unpack_lengths(const uint8_t* RESTRICT in, uint8_t* RESTRICT code_len) {
+    return unpack_lengths_header(in, code_len);
+}
diff --git a/thirdparty/zxc/src/lib/zxc_internal.h b/thirdparty/zxc/src/lib/zxc_internal.h
new file mode 100644
index 000000000000..455b492c72c3
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_internal.h
@@ -0,0 +1,1846 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_internal.h
+ * @brief Internal definitions, constants, SIMD helpers, and utility functions.
+ *
+ * This header is **not** part of the public API.  It is shared across the
+ * library's translation units and contains:
+ * - Platform detection and SIMD intrinsic includes.
+ * - Compiler-abstraction macros (LIKELY, PREFETCH, MEMCPY, ALIGN, ...).
+ * - Endianness detection and byte-swap helpers.
+ * - File-format constants (magic word, header sizes, block sizes, ...).
+ * - Inline helpers for hashing, endian-safe loads/stores, bit manipulation,
+ *   aligned allocation, and bitstream reading.
+ * - Internal function prototypes for chunk-level compression/decompression.
+ *
+ * @warning Do not include this header from user code; use the public headers
+ *          zxc_buffer.h or zxc_stream.h instead.
+ */
+
+#ifndef ZXC_INTERNAL_H
+#define ZXC_INTERNAL_H
+
+#include "zxc_deps.h" /* libc deps: <limits.h>, <stdint.h>, <stdlib.h>, <string.h>,
+                        and the ZXC_MALLOC / ZXC_ALIGNED_MALLOC macros.
+                        Vendor this file to retarget non-libc environments. */
+
+#include "../../include/zxc_buffer.h"
+#include "../../include/zxc_constants.h"
+#include "../../include/zxc_error.h"
+#include "../../include/zxc_seekable.h"
+#include "rapidhash.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup internal Internal Helpers
+ * @brief Platform abstractions, constants, and utility functions (private).
+ * @{
+ */
+
+/**
+ * @name Atomic Qualifier
+ * @brief Provides a portable atomic / volatile qualifier.
+ *
+ * When built as C11 or later with atomics support, @c ZXC_ATOMIC expands to
+ * @c _Atomic; otherwise (including any C++ build) it falls back to @c volatile.
+ * @{
+ */
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && \
+    !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>
+#define ZXC_ATOMIC _Atomic
+#define ZXC_USE_C11_ATOMICS 1
+#else
+#define ZXC_ATOMIC volatile
+#define ZXC_USE_C11_ATOMICS 0
+#endif
+/** @} */ /* end of Atomic Qualifier */
+
+/**
+ * @name SIMD Intrinsics & Compiler Macros
+ * @brief Auto-detected SIMD feature macros for x86 (SSE/AVX) and ARM (NEON).
+ *
+ * Depending on the target architecture and compiler flags the following macros
+ * may be defined:
+ * - @c ZXC_USE_AVX512 - AVX-512F + AVX-512BW available.
+ * - @c ZXC_USE_AVX2   - AVX2 available.
+ * - @c ZXC_USE_SSE2   - SSE2 (x86-64 baseline) available.
+ * - @c ZXC_USE_NEON64 - AArch64 NEON available.
+ * - @c ZXC_USE_NEON32 - ARMv7 NEON available.
+ *
+ * Note: @c -mavx2 / @c -mavx512f imply @c __SSE2__, so @c ZXC_USE_SSE2 is
+ * also defined in the AVX variants. The hand-written SIMD code paths therefore
+ * order their preprocessor branches AVX512 -> AVX2 -> SSE2 so the widest
+ * available path wins; the SSE2 branch is the active one only in the dedicated
+ * @c _sse2 variant (no AVX2/AVX512 flags). SSE2 is the x86-64 baseline, so this
+ * tier covers every 64-bit x86 CPU (and i686 with @c -msse2). The handful of
+ * operations that would otherwise require SSE4.1 (@c _mm_max_epu32,
+ * @c _mm_blendv_epi8, @c _mm_packus_epi32) or SSSE3 (@c _mm_shuffle_epi8) are
+ * emulated with pure SSE2 instruction sequences or fall back to scalar code.
+ *
+ * Define @c ZXC_DISABLE_SIMD to gate all hand-written SIMD paths (intrinsics,
+ * inline assembly).  Compiler auto-vectorisation is unaffected.
+ * @{
+ */
+#ifndef ZXC_DISABLE_SIMD
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#include <immintrin.h>
+#include <nmmintrin.h>
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+#ifndef ZXC_USE_AVX512
+#define ZXC_USE_AVX512
+#endif
+#endif
+#if defined(__AVX2__)
+#ifndef ZXC_USE_AVX2
+#define ZXC_USE_AVX2
+#endif
+#endif
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#ifndef ZXC_USE_SSE2
+#define ZXC_USE_SSE2
+#endif
+#endif
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) || \
+       defined(ZXC_USE_NEON32) || defined(ZXC_USE_NEON64))
+#if !defined(_MSC_VER)
+#include <arm_acle.h>
+#endif
+#include <arm_neon.h>
+#if defined(__aarch64__) || defined(_M_ARM64)
+#ifndef ZXC_USE_NEON64
+#define ZXC_USE_NEON64
+#endif
+#else
+#ifndef ZXC_USE_NEON32
+#define ZXC_USE_NEON32
+#endif
+#endif
+#endif
+#endif    /* ZXC_DISABLE_SIMD */
+/** @} */ /* end of SIMD Intrinsics */
+
+/**
+ * @name Compiler Abstractions
+ * @brief Portable wrappers for branch hints, prefetch, memory ops, alignment,
+ *        and forced inlining.
+ * @{
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+/** @def LIKELY
+ * @brief Branch prediction hint: expression is likely true.
+ * @param x Expression to evaluate.
+ */
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+
+/** @def UNLIKELY
+ * @brief Branch prediction hint: expression is unlikely to be true.
+ * @param x Expression to evaluate.
+ */
+#define UNLIKELY(x) (__builtin_expect(!!(x), 0))
+
+/** @def RESTRICT
+ * @brief Pointer aliasing hint (maps to __restrict__).
+ */
+#define RESTRICT __restrict__
+
+/** @def ZXC_PREFETCH_READ
+ * @brief Prefetch data for reading.
+ * @param ptr Pointer to data to prefetch.
+ */
+#define ZXC_PREFETCH_READ(ptr) __builtin_prefetch((const void*)(ptr), 0, 3)
+
+/** @def ZXC_PREFETCH_WRITE
+ * @brief Prefetch data for writing.
+ * @param ptr Pointer to data to prefetch.
+ */
+#define ZXC_PREFETCH_WRITE(ptr) __builtin_prefetch((const void*)(ptr), 1, 3)
+
+/** @def ZXC_MEMCPY
+ * @brief Optimized memory copy using compiler built-in.
+ */
+#define ZXC_MEMCPY(dst, src, n) __builtin_memcpy(dst, src, n)
+
+/** @def ZXC_MEMSET
+ * @brief Optimized memory set using compiler built-in.
+ */
+#define ZXC_MEMSET(dst, val, n) __builtin_memset(dst, val, n)
+
+/** @def ZXC_ALIGN
+ * @brief Specifies memory alignment for a variable or structure.
+ * @param x Alignment boundary in bytes (must be a power of 2).
+ */
+#define ZXC_ALIGN(x) __attribute__((aligned(x)))
+
+/** @def ZXC_ALWAYS_INLINE
+ * @brief Forces a function to be inlined at all optimization levels.
+ */
+#define ZXC_ALWAYS_INLINE inline __attribute__((always_inline))
+
+/** @def ZXC_NOINLINE
+ * @brief Prevents a function from being inlined into its callers.
+ */
+#define ZXC_NOINLINE __attribute__((noinline))
+
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64)
+#include <xmmintrin.h>
+#define ZXC_PREFETCH_READ(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#define ZXC_PREFETCH_WRITE(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#else
+#define ZXC_PREFETCH_READ(ptr) __prefetch((const void*)(ptr))
+#define ZXC_PREFETCH_WRITE(ptr) __prefetch((const void*)(ptr))
+#endif
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#define RESTRICT __restrict
+#pragma intrinsic(memcpy, memset)
+#define ZXC_MEMCPY(dst, src, n) memcpy(dst, src, n)
+#define ZXC_MEMSET(dst, val, n) memset(dst, val, n)
+
+/** @def ZXC_ALIGN
+ * @brief Specifies memory alignment for a variable or structure (MSVC).
+ * @param x Alignment boundary in bytes (must be a power of 2).
+ */
+#define ZXC_ALIGN(x) __declspec(align(x))
+
+/** @def ZXC_ALWAYS_INLINE
+ * @brief Forces a function to be inlined at all optimization levels (MSVC).
+ */
+#define ZXC_ALWAYS_INLINE __forceinline
+
+/** @def ZXC_NOINLINE
+ * @brief Prevents a function from being inlined into its callers (MSVC).
+ */
+#define ZXC_NOINLINE __declspec(noinline)
+#pragma intrinsic(_BitScanReverse)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#define RESTRICT
+#define ZXC_PREFETCH_READ(ptr)
+#define ZXC_PREFETCH_WRITE(ptr)
+#define ZXC_MEMCPY(dst, src, n) memcpy(dst, src, n)
+#define ZXC_MEMSET(dst, val, n) memset(dst, val, n)
+
+/** @def ZXC_ALWAYS_INLINE
+ * @brief Forces a function to be inlined (fallback for non-GCC/Clang/MSVC compilers).
+ */
+#define ZXC_ALWAYS_INLINE inline
+
+/** @def ZXC_NOINLINE
+ * @brief Prevents inlining (best-effort no-op fallback for unknown compilers).
+ */
+#define ZXC_NOINLINE
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#include <stdalign.h>
+/** @def ZXC_ALIGN
+ * @brief Specifies memory alignment using C11 _Alignas.
+ * @param x Alignment boundary in bytes (must be a power of 2).
+ */
+#define ZXC_ALIGN(x) _Alignas(x)
+#else
+/** @def ZXC_ALIGN
+ * @brief No-op alignment macro for compilers without alignment support.
+ * @param x Ignored (alignment not supported).
+ */
+#define ZXC_ALIGN(x)
+#endif
+#endif
+/** @} */ /* end of Compiler Abstractions */
+
+/* Heap allocator and cache-line-aligned allocator macros are now defined
+ * in @c zxc_deps.h (included at the top of this header), so non-libc
+ * targets can override them by vendoring that single file. */
+
+/**
+ * @name Endianness Detection
+ * @brief Compile-time detection of host byte order.
+ *
+ * Defines exactly one of @c ZXC_LITTLE_ENDIAN or @c ZXC_BIG_ENDIAN unless one is predefined.
+ * @{
+ */
+#if !defined(ZXC_LITTLE_ENDIAN) && !defined(ZXC_BIG_ENDIAN) /* keep a predefined choice */
+#if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define ZXC_LITTLE_ENDIAN
+#elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define ZXC_BIG_ENDIAN
+#else
+#warning "Endianness not detected, defaulting to little-endian"
+#define ZXC_LITTLE_ENDIAN
+#endif
+#endif
+/** @} */ /* end of Endianness Detection */
+
+/**
+ * @name Byte-Swap Helpers
+ * @brief 16/32/64-bit byte-swap macros (only defined under @c ZXC_BIG_ENDIAN).
+ * @{
+ */
+#ifdef ZXC_BIG_ENDIAN
+#if defined(__GNUC__) || defined(__clang__)
+#define ZXC_BSWAP16(x) __builtin_bswap16(x)
+#define ZXC_BSWAP32(x) __builtin_bswap32(x)
+#define ZXC_BSWAP64(x) __builtin_bswap64(x)
+#elif defined(_MSC_VER)
+#define ZXC_BSWAP16(x) _byteswap_ushort(x)
+#define ZXC_BSWAP32(x) _byteswap_ulong(x)
+#define ZXC_BSWAP64(x) _byteswap_uint64(x)
+#else
+#define ZXC_BSWAP16(x) ((uint16_t)(((x) >> 8) | ((x) << 8)))
+#define ZXC_BSWAP32(x) \
+    ((uint32_t)(((x) >> 24) | (((x) >> 8) & 0xFF00) | (((x) << 8) & 0xFF0000) | ((x) << 24)))
+#define ZXC_BSWAP64(x) \
+    ((uint64_t)(((uint64_t)ZXC_BSWAP32((uint32_t)(x)) << 32) | ZXC_BSWAP32((uint32_t)((x) >> 32))))
+#endif
+#endif
+/** @} */ /* end of Byte-Swap Helpers */
+
+/**
+ * @name File Format Constants
+ * @brief Magic words, header sizes, block sizes, and related constants.
+ * @{
+ */
+
+/** @brief Magic word identifying ZXC files (little-endian 0x9CB02EF5). */
+#define ZXC_MAGIC_WORD 0x9CB02EF5U
+/** @brief Current on-disk file format version. The decoder accepts only this
+ *  version; older versions are rejected with ZXC_ERROR_BAD_VERSION. */
+#define ZXC_FILE_FORMAT_VERSION 6
+
+/** @brief Safety padding appended to buffers to tolerate overruns. */
+#define ZXC_PAD_SIZE 32
+/**
+ * @brief Tail padding required on the decompression destination buffer.
+ *
+ * The decoder's fast path uses speculative wild-copy writes and gates
+ * fast-loop entry on @c d_end - ZXC_DECOMPRESS_TAIL_PAD. Sizing
+ * @c dst_capacity to @c uncompressed_size + ZXC_DECOMPRESS_TAIL_PAD
+ * guarantees the fast path is reachable and that tail bounds checks
+ * never spuriously reject the last literals of a valid block.
+ *
+ * @see zxc_decompress_block_bound()
+ */
+#define ZXC_DECOMPRESS_TAIL_PAD (ZXC_PAD_SIZE * 66)
+/** @brief Assumed CPU cache line size for alignment. */
+#define ZXC_CACHE_LINE_SIZE 64
+/** @brief Bitmask for cache-line alignment checks. */
+#define ZXC_ALIGNMENT_MASK (ZXC_CACHE_LINE_SIZE - 1)
+/** @brief Round @p x up to the next cache-line boundary. */
+#define ZXC_ALIGN_CL(x) (((x) + ZXC_ALIGNMENT_MASK) & ~(size_t)ZXC_ALIGNMENT_MASK)
+
+/**
+ * @brief Number of @c uint64_t words needed to hold a bitmap of @p n_bits.
+ *
+ * Equivalent to @c ceil(n_bits / 64).
+ */
+#define ZXC_BITMAP_WORDS(n_bits) (((n_bits) + 63) / 64)
+
+/** @brief Bit flag in the Flags byte indicating checksum presence (bit 7). */
+#define ZXC_FILE_FLAG_HAS_CHECKSUM 0x80U
+/** @brief Bit flag in the Flags byte indicating a dictionary is required (bit 6). */
+#define ZXC_FILE_FLAG_HAS_DICTIONARY 0x40U
+/** @brief Mask for the checksum algorithm id (bits 0-3). */
+#define ZXC_FILE_CHECKSUM_ALGO_MASK 0x0FU
+
+/** @brief Magic word identifying ZXC dictionary files (.zxd). */
+#define ZXC_DICT_MAGIC 0x9CB0D1C7U
+/** @brief Current dictionary file format version. A 128-byte packed Huffman
+ *         code-lengths table (shared literal table) always follows the
+ *         dictionary content. */
+#define ZXC_DICT_VERSION 1
+/** @brief K-gram length scanned by the dictionary trainer. Aligned on the LZ
+ *         minimum match length so trained patterns are matchable at encode time. */
+#define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN
+/** @brief Address bits for the dictionary trainer's k-gram frequency table. */
+#define ZXC_DICT_HASH_BITS 16
+/** @brief Maximum number of candidate segments the dictionary trainer keeps. */
+#define ZXC_DICT_MAX_SEGMENTS (1U << 16)
+/** @brief Target number of sampled k-gram positions for the trainer's frequency
+ *  estimate. Bounds the count so 16-bit counters stay unsaturated on large
+ *  corpora; the trainer strides the corpus to hit roughly this many positions. */
+#define ZXC_DICT_SAMPLE_TARGET (1U << 19)
+/** @brief Number of buckets in the dictionary trainer's frequency table. */
+#define ZXC_DICT_HASH_SIZE (1U << ZXC_DICT_HASH_BITS)
+/** @brief Training block size for the shared-table literal statistics. */
+#define ZXC_DICT_HUF_TRAIN_BLOCK 4096U
+/** @brief Cap on the corpus bytes compressed by the literal-table trainer: the
+ *         histogram converges early, so past it slices are strided evenly instead. */
+#define ZXC_DICT_HUF_SAMPLE_BUDGET (8U << 20)
+
+/** @brief Block header size: Type(1)+Flags(1)+Reserved(1)+CRC(1)+CompSize(4). */
+#define ZXC_BLOCK_HEADER_SIZE 8
+/** @brief Size of the per-block checksum field in bytes. */
+#define ZXC_BLOCK_CHECKSUM_SIZE 4
+/** @brief Binary size of a GLO block sub-header. */
+#define ZXC_GLO_HEADER_BINARY_SIZE 16
+/** @brief Binary size of a GHI block sub-header. */
+#define ZXC_GHI_HEADER_BINARY_SIZE 16
+
+/** @brief Worst-case format overhead inside a single block beyond the outer
+ *  8-byte block header and the optional 4-byte checksum.
+ *
+ *  Covers the inner GLO/GHI sub-header (16 B) plus four section descriptors
+ *  (4 x 8 = 32 B) = 48 B, with a 16 B safety margin for future format
+ *  evolution. Used by zxc_compress_block_bound() and zxc_compress_bound()
+ *  to size the destination buffer in the worst (incompressible) case. */
+#define ZXC_BLOCK_FORMAT_OVERHEAD 64
+
+/** @brief Binary size of a section descriptor (comp_size + raw_size). */
+#define ZXC_SECTION_DESC_BINARY_SIZE 8
+/** @brief 32-bit mask for extracting sizes from a section descriptor. */
+#define ZXC_SECTION_SIZE_MASK 0xFFFFFFFFU
+/** @brief Number of sections in a GLO block. */
+#define ZXC_GLO_SECTIONS 4
+/** @brief Number of sections in a GHI block. */
+#define ZXC_GHI_SECTIONS 3
+
+/** @brief Checksum algorithm id for RapidHash (default, sole implementation). */
+#define ZXC_CHECKSUM_RAPIDHASH 0
+
+/** @brief Size of the global checksum appended after EOF block (4 bytes). */
+#define ZXC_GLOBAL_CHECKSUM_SIZE 4
+
+/** @name Seekable Format Constants
+ *  @brief Seek table block appended between EOF block and footer.
+ *
+ *  The seek table is optional (opt-in at compression time) and allows
+ *  random-access decompression by recording per-block compressed and
+ *  decompressed sizes.  It uses a standard ZXC block header with
+ *  @c block_type = @c ZXC_BLOCK_SEK.
+ *
+ *  Detection from the end of the file: the reader derives @c num_blocks
+ *  from the file footer (total decompressed size) and file header (block size).
+ *  It then seeks backward to validate the SEK block header.
+ *  @{ */
+/** @brief Per-block entry size: comp_size(4) only.  decomp_size is derived
+ *  from the file header's block_size (all blocks except the last are full). */
+#define ZXC_SEEK_ENTRY_SIZE 4
+/** @} */ /* end of Seekable Format Constants */
+
+/** @name GLO Token Constants
+ *  @brief 4-bit literal length / 4-bit match length / 16-bit offset.
+ *  @{ */
+/** @brief Bits for Literal Length in a GLO token. */
+#define ZXC_TOKEN_LIT_BITS 4
+/** @brief Bits for Match Length in a GLO token. */
+#define ZXC_TOKEN_ML_BITS 4
+/** @brief Mask to extract Literal Length from a GLO token. */
+#define ZXC_TOKEN_LL_MASK ((1U << ZXC_TOKEN_LIT_BITS) - 1)
+/** @brief Mask to extract Match Length from a GLO token. */
+#define ZXC_TOKEN_ML_MASK ((1U << ZXC_TOKEN_ML_BITS) - 1)
+/** @} */
+
+/** @name GHI Sequence Constants
+ *  @brief 8-bit literal length / 8-bit match length / 16-bit offset.
+ *  @{ */
+/** @brief Bits for Literal Length in a GHI sequence. */
+#define ZXC_SEQ_LL_BITS 8
+/** @brief Bits for Match Length in a GHI sequence. */
+#define ZXC_SEQ_ML_BITS 8
+/** @brief Bits for Offset in a GHI sequence. */
+#define ZXC_SEQ_OFF_BITS 16
+/** @brief Mask to extract Literal Length from a GHI sequence. */
+#define ZXC_SEQ_LL_MASK ((1U << ZXC_SEQ_LL_BITS) - 1)
+/** @brief Mask to extract Match Length from a GHI sequence. */
+#define ZXC_SEQ_ML_MASK ((1U << ZXC_SEQ_ML_BITS) - 1)
+/** @brief Mask to extract Offset from a GHI sequence. */
+#define ZXC_SEQ_OFF_MASK ((1U << ZXC_SEQ_OFF_BITS) - 1)
+/** @} */
+
+/** @name Literal Stream Encoding
+ *  @{ */
+/** @brief Flag bit indicating an RLE run in the literal stream (0x80). */
+#define ZXC_LIT_RLE_FLAG 0x80U
+/** @brief Mask to extract the run/literal length (lower 7 bits). */
+#define ZXC_LIT_LEN_MASK (ZXC_LIT_RLE_FLAG - 1)
+/** @} */
+
+/** @name LZ77 Constants
+ *  @brief Hash table geometry, sliding window, and match parameters.
+ *
+ *  The hash table uses a split layout with 15-bit addressing (32 768 buckets):
+ *  - `hash_table[]`: uint32_t, stores `(epoch << offset_bits) | position` (128 KB).
+ *  - `hash_tags[]`:      uint8_t, stores an 8-bit tag for fast rejection (32 KB).
+ *  Total: 160 KB.  The tag table fits in L1 cache, enabling a
+ *  "filter-first" access pattern that avoids cold loads into hash_table
+ *  on the ~60-75% of lookups where the tag mismatches.
+ *  The 64 KB sliding window allows `chain_table` to use `uint16_t`.
+ *  @{ */
+/** @brief Address bits for the LZ77 hash table (2^15 = 32 768 buckets). */
+#define ZXC_LZ_HASH_BITS 15
+/** @brief Marsaglia multiplicative hash constant for 4-byte hashing. */
+#define ZXC_LZ_HASH_PRIME1 0x2D35182DU
+/** @brief Marsaglia/Vigna xorshift* multiplier for 5-byte hashing. */
+#define ZXC_LZ_HASH_PRIME2 0x2545F4914F6CDD1DULL
+/** @brief Maximum number of entries in the hash table. */
+#define ZXC_LZ_HASH_SIZE (1U << ZXC_LZ_HASH_BITS)
+/** @brief Sliding window size (64 KB). */
+#define ZXC_LZ_WINDOW_SIZE (1U << 16)
+/** @brief Mask for ring-buffer indexing into chain_table (power-of-two window). */
+#define ZXC_LZ_WINDOW_MASK (ZXC_LZ_WINDOW_SIZE - 1U)
+/** @brief Minimum match length for an LZ77 match. */
+#define ZXC_LZ_MIN_MATCH_LEN 5
+/** @brief Maximum legitimate value a varint can decode to.
+ *
+ * A varint value represents (ll - MASK) or (ml - MASK) and is therefore always
+ * strictly less than ZXC_BLOCK_SIZE_MAX (enforced by the Block API entry
+ * points). The cap is set to (ZXC_BLOCK_SIZE_MAX - 1), which fits cleanly in a
+ * 3-byte varint (21 bits): the decoder rejects any 4- or 5-byte encoding, and
+ * the encoder refuses to emit values above this bound. Together they bound the
+ * varint surface to exactly the format-defined block size limit. */
+#define ZXC_MAX_VARINT_VALUE ((uint32_t)(ZXC_BLOCK_SIZE_MAX - 1U))
+/** @brief Maximum decoded output of a single sequence with INLINE ll/ml
+ *         (non-varint). Used by 4x decoder bounds checks to reserve space for
+ *         subsequent inline sequences in the same batch when the current
+ *         sequence has a varint-extended ml. */
+#define ZXC_GLO_MAX_INLINE_OUT_PER_SEQ \
+    ((ZXC_TOKEN_LL_MASK - 1U) + (ZXC_TOKEN_ML_MASK - 1U) + ZXC_LZ_MIN_MATCH_LEN) /* 33 */
+#define ZXC_GHI_MAX_INLINE_OUT_PER_SEQ \
+    ((ZXC_SEQ_LL_MASK - 1U) + (ZXC_SEQ_ML_MASK - 1U) + ZXC_LZ_MIN_MATCH_LEN) /* 513 */
+/** @brief Base bias added to encoded offsets (stored = actual - bias). */
+#define ZXC_LZ_OFFSET_BIAS 1
+/** @brief Maximum allowed offset distance. */
+#define ZXC_LZ_MAX_DIST (ZXC_LZ_WINDOW_SIZE - 1)
+/** @brief Bytes at the block end where match search stops (left as literals).
+ *  Equals the 8-byte word the finder reads at each probe, so @c ip+8<=iend. */
+#define ZXC_LZ_SEARCH_MARGIN (sizeof(uint64_t))
+/** @} */
+
+/** @name Optimal Parser Tuning (level >= 6)
+ *  @brief Static prices and complexity guards used by the level-6 optimal
+ *         LZ77 parser DP.
+ *  @{ */
+/** @brief Static price (bits) of a match token before varint extras: 1 byte
+ *         token + 2 byte offset. */
+#define ZXC_OPT_MATCH_COST_BASE ((uint32_t)(3U * CHAR_BIT))
+/** @brief Threshold above which `find_best_match` is skipped at intra-match
+ *         positions, keeping the parser O(N) on highly repetitive data. */
+#define ZXC_OPT_LONG_MATCH_SKIP ((size_t)256)
+/** @brief Minimum literal count for the sample-based Huffman cost estimator
+ *         used by the optimal parser. Below this, the strided sample is too
+ *         small for the resulting code-lengths to be statistically reliable,
+ *         so the estimator falls back to RAW cost (8 bits/byte). */
+#define ZXC_OPT_LIT_SAMPLE_MIN 1024
+
+/** @} */
+
+/** @name Hash Prime Constants
+ *  @brief Mixing primes used by internal hash functions.
+ *  @{ */
+/** @brief Hash prime 1. */
+#define ZXC_HASH_PRIME1 0x9E3779B97F4A7C15ULL
+/** @brief Hash prime 2. */
+#define ZXC_HASH_PRIME2 0xD2D84A61D2D84A61ULL
+/** @} */
+
+/** @name Huffman Codec Constants
+ *  @brief Length-limited canonical Huffman codec for the GLO literal stream
+ *         (active at compression level >= 6).
+ *
+ *  On-disk section payload layout:
+ *  - @c ZXC_HUF_TABLE_SIZE bytes: @c ZXC_HUF_NUM_SYMBOLS code lengths
+ *    packed two per byte (4 bits each). The same packed table is used as the
+ *    per-block lengths header (enc_lit=2) and as the shared table carried by
+ *    a .zxd dictionary (enc_lit=3) -- hence the public constant.
+ *  - @c ZXC_HUF_STREAM_SIZES_HEADER_SIZE bytes: the first
+ *    `ZXC_HUF_NUM_STREAMS - 1` sub-stream sizes as little-endian @c uint16_t;
+ *    the last sub-stream size is derived from the enclosing section length.
+ *  - Payload: @c ZXC_HUF_NUM_STREAMS concatenated LSB-first bit-streams,
+ *    each covering an equal share of the literal indices (the last absorbs
+ *    the remainder).
+ *
+ *  The decoder uses a single lookup table of @c ZXC_HUF_DEC_TABLE_SIZE entries
+ *  (width @c ZXC_HUF_LOOKUP_BITS) that yields 1 or 2 symbols per lookup,
+ *  feeding a `ZXC_HUF_NUM_STREAMS`-way interleaved hot loop.
+ *  @{ */
+/** @brief Maximum code length, in bits. Capped well below the package-merge
+ *         algorithmic ceiling (14) to keep the decoder LUT small. */
+#define ZXC_HUF_MAX_CODE_LEN 8
+/** @brief Decoder LUT width: each lookup consumes this many bits and yields
+ *         1 or 2 symbols. */
+#define ZXC_HUF_LOOKUP_BITS 11
+/** @brief Number of entries in the multi-symbol decoder lookup table. */
+#define ZXC_HUF_DEC_TABLE_SIZE (1U << ZXC_HUF_LOOKUP_BITS)
+/** @brief Alphabet size: one entry per possible byte value. */
+#define ZXC_HUF_NUM_SYMBOLS 256
+/** @brief Interleaved bit-stream count for parallel decoding. */
+#define ZXC_HUF_NUM_STREAMS 4
+/** @brief Sub-stream sizes header: `(ZXC_HUF_NUM_STREAMS - 1)` little-endian
+ *         @c uint16_t values; the last sub-stream size is derived from the
+ *         enclosing section length. */
+#define ZXC_HUF_STREAM_SIZES_HEADER_SIZE ((int)((ZXC_HUF_NUM_STREAMS - 1) * sizeof(uint16_t)))
+/** @brief Total Huffman header size: packed code lengths + sub-stream sizes. */
+#define ZXC_HUF_HEADER_SIZE (ZXC_HUF_TABLE_SIZE + ZXC_HUF_STREAM_SIZES_HEADER_SIZE)
+/** @brief Absolute floor below which Huffman cannot beat RAW even with
+ *         zero-entropy literals after the 3 % savings margin. Above this
+ *         floor, the precise size accounting at the call site decides per
+ *         block, so the threshold is corpus-agnostic.
+ *
+ *         Derivation: the call site requires `huf_total < baseline * 31/32`
+ *         (3 % margin = `baseline >> 5`). At zero-entropy literals the
+ *         payload vanishes and `huf_total = HEADER`, giving
+ *         `N > HEADER x 32/31`. The `+30` is the standard ceiling-division
+ *         offset (`b - 1` with `b = 31`). Constants:
+ *           - 32 = inverse of the 3 % margin (`1/32`)
+ *           - 31 = `32 - 1`, the fraction kept after the margin
+ *           - 30 = `31 - 1`, ceiling-division rounding offset */
+#define ZXC_HUF_MIN_LITERALS ((ZXC_HUF_HEADER_SIZE * 32 + 30) / 31)
+/** @brief Width of the decoder bit accumulator, in bits
+ *         (`sizeof(uint64_t) * CHAR_BIT`). */
+#define ZXC_HUF_ACCUM_BITS 64
+/** @brief Decoder batch size: lookups per stream between two refills. */
+#define ZXC_HUF_BATCH 5
+/** @brief Worst-case bits consumed per stream per batch. Must stay <= 57 so
+ *         that an 8-byte refill always brings the bit accumulator back to
+ *         >= 56 bits before the next batch. */
+#define ZXC_HUF_BATCH_BITS (ZXC_HUF_BATCH * ZXC_HUF_LOOKUP_BITS)
+/** @brief Mask for indexing into the multi-symbol decoder lookup table. */
+#define ZXC_HUF_TBL_MASK ((uint64_t)(ZXC_HUF_DEC_TABLE_SIZE - 1))
+/** @brief Per-stream output headroom required to enter the batched fast loop:
+ *         each iteration speculatively writes 2 bytes per stream and runs
+ *         @c ZXC_HUF_BATCH iterations before re-checking the bound. */
+#define ZXC_HUF_SAFE_MARGIN ((size_t)(2 * ZXC_HUF_BATCH))
+
+/**
+ * @brief Multi-symbol decoder lookup table entry. Bit layout:
+ *   bits  0..7   sym1       - first decoded symbol
+ *   bits  8..15  sym2       - second decoded symbol (junk if n_extra == 0)
+ *   bits 16..19  len1       - bit length of sym1's code (1..8)
+ *   bits 20..23  len_total  - total bits consumed (1..11)
+ *   bit  24      n_extra    - 0 if 1 symbol, 1 if 2 symbols decoded
+ *
+ * Lives here (not in zxc_huffman.c) so a prebuilt table can be carried by the
+ * compression context for the shared dictionary literal table.
+ */
+typedef struct {
+    uint32_t entry; /**< Packed fields per the bit layout above; bits 25..31 are not listed. */
+} zxc_huf_dec_entry_t;
+
+/**
+ * @brief Boundary package-merge work item.
+ *
+ * Each level holds at most ::ZXC_HUF_PM_LEVEL_BOUND (`2 * ZXC_HUF_NUM_SYMBOLS`)
+ * of these; exposed so callers can size scratch via ::ZXC_HUF_BUILD_SCRATCH_SIZE.
+ */
+typedef struct {
+    uint32_t weight; /**< Accumulated weight (summed frequency) of the package. */
+    int16_t left;    /**< Left child index, or -1 for a leaf. */
+    int16_t right;   /**< Right child index, or -1 for a leaf. */
+    int16_t sym;     /**< Symbol index for a leaf, or -1 for an internal node. */
+} zxc_huf_pm_item_t;
+
+/** @brief Trace-back stack frame (level + item index) for package-merge length recovery. */
+typedef struct {
+    int8_t lvl;  /**< Package-merge level being traced back. */
+    int16_t idx; /**< Item index within that level. */
+} zxc_huf_pm_frame_t;
+
+/** @brief Per-level item bound: at most leaves + paired packages from the
+ *         previous level. */
+#define ZXC_HUF_PM_LEVEL_BOUND (2 * ZXC_HUF_NUM_SYMBOLS)
+
+/** @brief Worst-case scratch size (bytes) for ::zxc_huf_build_code_lengths.
+ *         Carved by the function into items / counts / stack regions; sized
+ *         for the worst-case alphabet (n = `ZXC_HUF_NUM_SYMBOLS`). Includes
+ *         a small alignment slack between regions. */
+#define ZXC_HUF_BUILD_SCRATCH_SIZE                                                               \
+    ((size_t)ZXC_HUF_MAX_CODE_LEN * (size_t)ZXC_HUF_PM_LEVEL_BOUND * sizeof(zxc_huf_pm_item_t) + \
+     8U + (size_t)ZXC_HUF_MAX_CODE_LEN * sizeof(int) + 8U +                                      \
+     (size_t)ZXC_HUF_MAX_CODE_LEN * (size_t)ZXC_HUF_PM_LEVEL_BOUND * sizeof(zxc_huf_pm_frame_t))
+/** @} */
+
+/** @name Block Size Helpers
+ *  @brief Runtime helpers for variable block sizes.
+ *  @{ */
+
+/**
+ * @brief Index of the highest set bit of a 32-bit value.
+ * @param v Value to inspect; expected to be a power of two (zero yields 0).
+ * @return floor(log2(v)), which is exact for a power of two; 0 when @p v is 0.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_log2_u32(const uint32_t v) {
+#ifdef _MSC_VER
+    unsigned long msb = 0;
+    return (v != 0 && _BitScanReverse(&msb, v)) ? (uint32_t)msb : 0;
+#else
+    return (v != 0) ? (uint32_t)(31 - __builtin_clz(v)) : 0;
+#endif
+}
+
+/**
+ * @brief Rounds a size up to a power of two usable as a block size.
+ *
+ * Smears the highest set bit of `v - 1` into every lower position, then adds
+ * one, which yields the smallest power of two >= @p v. Results smaller than
+ * @ref ZXC_BLOCK_SIZE_MIN are raised to that minimum.
+ *
+ * @param[in] v Input size (must be > 0).
+ * @return Smallest power of two >= @p v, clamped up to @ref ZXC_BLOCK_SIZE_MIN.
+ */
+static ZXC_ALWAYS_INLINE size_t zxc_block_size_ceil(const size_t v) {
+    uint64_t smear = (uint64_t)v - 1;
+    /* Shifts 1, 2, 4, 8, 16, 32 propagate the top bit across all 64 bits. */
+    for (unsigned shift = 1; shift < 64; shift <<= 1) smear |= smear >> shift;
+    const size_t pow2 = (size_t)(smear + 1);
+    return (pow2 >= ZXC_BLOCK_SIZE_MIN) ? pow2 : ZXC_BLOCK_SIZE_MIN;
+}
+
+/**
+ * @brief Checks whether a block size is acceptable.
+ * Accepted sizes are powers of two in [ZXC_BLOCK_SIZE_MIN, ZXC_BLOCK_SIZE_MAX].
+ * @param[in] bs Candidate block size.
+ * @return 1 if valid, 0 otherwise.
+ */
+static ZXC_ALWAYS_INLINE int zxc_validate_block_size(const size_t bs) {
+    return !(bs < ZXC_BLOCK_SIZE_MIN || bs > ZXC_BLOCK_SIZE_MAX || (bs & (bs - 1)) != 0);
+}
+/** @} */
+
+/** @} */ /* end of File Format Constants */
+
+/**
+ * @struct zxc_lz77_params_t
+ * @brief Search parameters for LZ77 compression levels.
+ *
+ * Each level maps to one parameter set trading compression speed against ratio:
+ * deeper searches and lazy matching improve ratio at the expense of throughput,
+ * larger steps speed up literal scanning but may miss short matches. Keep the
+ * member order stable: zxc_get_lz77_params() fills it with positional initializers.
+ */
+typedef struct {
+    /** Maximum number of candidates explored in the hash chain per position.
+     *  Higher values find better matches but increase CPU cost linearly. */
+    int search_depth;
+
+    /** "Good enough" match length: once a match reaches this threshold the
+     *  chain walk stops immediately, avoiding wasted effort on an already
+     *  excellent match. */
+    int sufficient_len;
+
+    /** Enable lazy matching.  When set, after finding a match at position
+     *  @c ip the compressor probes @c ip+1 (and @c ip+2 for level >= 4) to
+     *  see if a longer match exists.  If so, a literal is emitted and the
+     *  better match is taken instead.  Improves ratio but costs extra work. */
+    int use_lazy;
+
+    /** Maximum number of candidates explored during lazy evaluation (same
+     *  semantics as @ref search_depth but applied to the ip+1 / ip+2 probes).
+     *  Only meaningful when @ref use_lazy is non-zero. */
+    int lazy_attempts;
+
+    /** Skip lazy evaluation when the current match length already reaches
+     *  this threshold: a match this long is unlikely to be beaten at the
+     *  next byte.  Set to 0 when @ref use_lazy is disabled. */
+    int lazy_len_threshold;
+
+    /** Base step size when advancing through unmatched literals.
+     *  1 = test every byte (best ratio), 4 = skip aggressively (fastest). */
+    uint32_t step_base;
+
+    /** Acceleration factor for step size: @c step = step_base + (distance >> step_shift).
+     *  A larger value keeps the step conservative (grows slowly with distance);
+     *  a smaller value ramps up quickly, skipping more in long literal runs. */
+    uint32_t step_shift;
+} zxc_lz77_params_t;
+
+/**
+ * @brief Selects the LZ77 search preset for a compression level.
+ *
+ * Levels at or above @c ZXC_LEVEL_DENSITY share a single preset. Lower levels
+ * are raised to at least @c ZXC_LEVEL_FASTEST, then index a six-entry table.
+ *
+ * @param[in] level Requested compression level.
+ * @return The preset for @p level, returned by value.
+ */
+static ZXC_ALWAYS_INLINE zxc_lz77_params_t zxc_get_lz77_params(const int level) {
+    /* Columns: search_depth, sufficient_len, use_lazy, lazy_attempts, lazy_len_threshold,
+     * step_base, step_shift. */
+    static const zxc_lz77_params_t presets[6] = {
+        {3, 16, 0, 0, 0, 4, 4},      /* fallback */
+        {3, 16, 0, 0, 0, 4, 4},      /* level 1 */
+        {3, 18, 0, 0, 0, 3, 6},      /* level 2 */
+        {3, 16, 1, 4, 128, 1, 4},    /* level 3 */
+        {3, 18, 1, 4, 128, 1, 5},    /* level 4 */
+        {64, 256, 1, 16, 128, 1, 8}  /* level 5 */
+    };
+    if (level >= ZXC_LEVEL_DENSITY) return (zxc_lz77_params_t){64, 256, 0, 0, 0, 1, 8};
+    return presets[(level > ZXC_LEVEL_FASTEST) ? level : ZXC_LEVEL_FASTEST];
+}
+
+/**
+ * @enum zxc_block_type_t
+ * @brief Defines the different types of data blocks supported by the ZXC
+ * format.
+ *
+ * This enumeration categorizes blocks based on the compression strategy
+ * applied:
+ * - `ZXC_BLOCK_RAW` (0): No compression. Used when data is incompressible (high
+ * entropy) or when compression would expand the data size.
+ * - `ZXC_BLOCK_GLO` (1): General-purpose compression (LZ77 + Bitpacking). This
+ * is the default for most data (text, binaries, JSON, etc.). Includes 4 section descriptors.
+ * - `ZXC_BLOCK_GHI` (2): General-purpose high-velocity mode using LZ77 with advanced
+ * techniques (lazy matching, step skipping) for maximum ratio. Includes 3 section descriptors.
+ * - `ZXC_BLOCK_SEK` (254): Seek table block. Contains per-block compressed/decompressed sizes
+ *   for random-access decompression. Placed between the EOF block and the file footer.
+ * - `ZXC_BLOCK_EOF` (255): End of file marker.
+ */
+typedef enum {
+    ZXC_BLOCK_RAW = 0,
+    ZXC_BLOCK_GLO = 1,
+    ZXC_BLOCK_GHI = 2,
+    ZXC_BLOCK_SEK = 254,
+    ZXC_BLOCK_EOF = 255
+} zxc_block_type_t;
+
+/**
+ * @enum zxc_section_encoding_t
+ * @brief Specifies the encoding methods used for internal data sections.
+ *
+ * These modes determine how specific components (like literals, match lengths,
+ * or offsets) are stored within a block.
+ * - `ZXC_SECTION_ENCODING_RAW`: Data is stored uncompressed.
+ * - `ZXC_SECTION_ENCODING_RLE`: Run-Length Encoding.
+ * - `ZXC_SECTION_ENCODING_HUFFMAN`: Canonical Huffman, 4-way interleaved
+ *   sub-streams, max 8-bit codes, LSB-first. Only valid for the literal
+ *   stream (`enc_lit`) of GLO blocks. Produced exclusively at level >= 6.
+ * - `ZXC_SECTION_ENCODING_HUFFMAN_DICT`: same bitstream layout as HUFFMAN but
+ *   the 128-byte code-lengths header is omitted: codes come from the shared
+ *   table carried by the dictionary (.zxd). Only valid for `enc_lit` of GLO
+ *   blocks in dictionary-compressed archives; requires the same dictionary
+ *   (content + table, bound by dict_id) at decode time.
+ */
+typedef enum {
+    ZXC_SECTION_ENCODING_RAW = 0,          /**< Stored uncompressed. */
+    ZXC_SECTION_ENCODING_RLE = 1,          /**< Run-length encoded. */
+    ZXC_SECTION_ENCODING_HUFFMAN = 2,      /**< Huffman; code lengths carried in the section. */
+    ZXC_SECTION_ENCODING_HUFFMAN_DICT = 3  /**< Huffman; code lengths come from the dictionary. */
+} zxc_section_encoding_t;
+
+/**
+ * @struct zxc_gnr_header_t
+ * @brief Header specific to General (LZ-based) compression blocks.
+ *
+ * This header follows the main block header when the block type is GLO/GHI. It
+ * describes the layout of sequences and literals.
+ *
+ * @var zxc_gnr_header_t::n_sequences
+ * The total count of LZ sequences in the block.
+ * @var zxc_gnr_header_t::n_literals
+ * The total count of literal bytes.
+ * @var zxc_gnr_header_t::enc_lit
+ * Encoding method used for the literal stream.
+ * @var zxc_gnr_header_t::enc_litlen
+ * Encoding method used for the literal lengths stream.
+ * @var zxc_gnr_header_t::enc_mlen
+ * Encoding method used for the match lengths stream.
+ * @var zxc_gnr_header_t::enc_off
+ * Encoding method for the offset stream (unused in Token format; kept for alignment).
+ */
+typedef struct {
+    uint32_t n_sequences;  // Number of sequences
+    uint32_t n_literals;   // Number of literals
+    uint8_t enc_lit;       // Literal encoding
+    uint8_t enc_litlen;    // Literal lengths encoding
+    uint8_t enc_mlen;      // Match lengths encoding
+    uint8_t enc_off;       // Offset encoding (Unused in Token format, kept for alignment)
+} zxc_gnr_header_t;
+
+/**
+ * @struct zxc_section_desc_t
+ * @brief Describes the size attributes of a specific data section.
+ *
+ * Used to track the compressed and uncompressed sizes of sub-components
+ * (e.g., a literal stream or offset stream) within a block.
+ */
+typedef struct {
+    uint64_t sizes; /**< Both sizes in one word: compressed size in bits 0-31, raw size in bits 32-63. */
+} zxc_section_desc_t;
+
+/**
+ * @struct zxc_bit_reader_t
+ * @brief Internal bit reader structure for ZXC compression/decompression.
+ *
+ * This structure maintains the state of the bit stream reading operation.
+ * It buffers bits from the input byte stream into an accumulator to allow
+ * reading variable-length bit sequences.
+ */
+typedef struct {
+    const uint8_t* ptr; /**< Next input byte not yet loaded into @c accum. */
+    const uint8_t* end; /**< One past the last byte of the input stream (src + size). */
+    uint64_t accum;     /**< Bit accumulator holding buffered bits (64-bit buffer). */
+    int bits;           /**< Number of valid bits currently in the accumulator. */
+} zxc_bit_reader_t;
+
+/**
+ * ============================================================================
+ * MEMORY & ENDIANNESS HELPERS
+ * ============================================================================
+ * Functions to handle unaligned memory access and Little Endian conversion.
+ */
+
+/**
+ * @brief Reads a 16-bit unsigned integer from memory in little-endian format.
+ *
+ * This function interprets the bytes at the given memory address as a
+ * little-endian 16-bit integer, regardless of the host system's endianness.
+ * It is marked as always inline for performance critical paths.
+ *
+ * @param[in] p Pointer to the memory location to read from.
+ * @return The 16-bit unsigned integer value read from memory.
+ */
+static ZXC_ALWAYS_INLINE uint16_t zxc_le16(const void* p) {
+    uint16_t word;
+    // Copy instead of dereferencing a cast pointer: `p` may be unaligned.
+    ZXC_MEMCPY(&word, p, sizeof(word));
+#ifdef ZXC_BIG_ENDIAN
+    word = ZXC_BSWAP16(word);  // source bytes are little-endian: swap into host order
+#endif
+    return word;
+}
+
+/**
+ * @brief Reads a 32-bit unsigned integer from memory in little-endian format.
+ *
+ * This function interprets the bytes at the given pointer address as a
+ * little-endian 32-bit integer, regardless of the host system's endianness.
+ * It is marked as always inline for performance critical paths.
+ *
+ * @param[in] p Pointer to the memory location to read from.
+ * @return The 32-bit unsigned integer value read from memory.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_le32(const void* p) {
+    uint32_t word;
+    // Copy instead of dereferencing a cast pointer: `p` may be unaligned.
+    ZXC_MEMCPY(&word, p, sizeof(word));
+#ifdef ZXC_BIG_ENDIAN
+    word = ZXC_BSWAP32(word);  // source bytes are little-endian: swap into host order
+#endif
+    return word;
+}
+
+/**
+ * @brief Reads a 64-bit unsigned integer from memory in little-endian format.
+ *
+ * This function interprets the bytes at the given memory address as a
+ * little-endian 64-bit integer, regardless of the host system's endianness.
+ * It is marked as always inline for performance critical paths.
+ *
+ * @param[in] p Pointer to the memory location to read from.
+ * @return The 64-bit unsigned integer value read from memory.
+ */
+static ZXC_ALWAYS_INLINE uint64_t zxc_le64(const void* p) {
+    uint64_t word;
+    // Copy instead of dereferencing a cast pointer: `p` may be unaligned.
+    ZXC_MEMCPY(&word, p, sizeof(word));
+#ifdef ZXC_BIG_ENDIAN
+    word = ZXC_BSWAP64(word);  // source bytes are little-endian: swap into host order
+#endif
+    return word;
+}
+
+/**
+ * @brief Stores a 16-bit integer in memory using little-endian byte order.
+ *
+ * The value is written through ZXC_MEMCPY rather than a pointer cast, which
+ * avoids strict aliasing violations and lets the destination be unaligned.
+ * Exactly 2 bytes are written.
+ *
+ * @note Host endianness is handled here: when ZXC_BIG_ENDIAN is defined the
+ * value is byte-swapped with ZXC_BSWAP16 before the copy; otherwise it is
+ * copied unchanged.
+ *
+ * @param[out] p Pointer to the destination memory where the value will be stored.
+ *          Must point to a valid memory region of at least 2 bytes.
+ * @param[in] v The 16-bit unsigned integer value to store.
+ */
+static ZXC_ALWAYS_INLINE void zxc_store_le16(void* p, const uint16_t v) {
+#ifdef ZXC_BIG_ENDIAN
+    const uint16_t wire = ZXC_BSWAP16(v);  // swap host order into little-endian
+#else
+    const uint16_t wire = v;  // host order is already little-endian
+#endif
+    ZXC_MEMCPY(p, &wire, sizeof(wire));
+}
+
+/**
+ * @brief Stores a 32-bit unsigned integer in little-endian format at the specified memory location.
+ *
+ * This function writes the 32-bit value `v` to the memory pointed to by `p`.
+ * It uses `ZXC_MEMCPY` to ensure safe memory access, avoiding potential alignment issues
+ * that could occur with direct pointer casting on some architectures.
+ *
+ * @note This function is marked as `ZXC_ALWAYS_INLINE` to minimize function call overhead.
+ *
+ * @param[out] p Pointer to the destination memory where the value will be stored.
+ * @param[in] v The 32-bit unsigned integer value to store.
+ */
+static ZXC_ALWAYS_INLINE void zxc_store_le32(void* p, const uint32_t v) {
+#ifdef ZXC_BIG_ENDIAN
+    const uint32_t wire = ZXC_BSWAP32(v);  // swap host order into little-endian
+#else
+    const uint32_t wire = v;  // host order is already little-endian
+#endif
+    ZXC_MEMCPY(p, &wire, sizeof(wire));
+}
+
+/**
+ * @brief Stores a 64-bit unsigned integer in little-endian format at the specified memory location.
+ *
+ * The value is written through `ZXC_MEMCPY` rather than a pointer dereference,
+ * which avoids alignment problems on architectures that fault on unaligned
+ * stores. Exactly 8 bytes are written.
+ *
+ * @note Host endianness is handled here: when `ZXC_BIG_ENDIAN` is defined the
+ * value is byte-swapped with `ZXC_BSWAP64` before the copy; otherwise it is
+ * copied unchanged.
+ *
+ * @param[out] p Pointer to the destination memory where the value will be stored.
+ * @param[in] v The 64-bit unsigned integer value to store.
+ */
+static ZXC_ALWAYS_INLINE void zxc_store_le64(void* p, const uint64_t v) {
+#ifdef ZXC_BIG_ENDIAN
+    const uint64_t wire = ZXC_BSWAP64(v);  // swap host order into little-endian
+#else
+    const uint64_t wire = v;  // host order is already little-endian
+#endif
+    ZXC_MEMCPY(p, &wire, sizeof(wire));
+}
+
+/**
+ * @brief Computes the 1-byte checksum for block headers.
+ *
+ * Implementation based on Marsaglia's Xorshift (PRNG) principles.
+ *
+ * @param[in] p Pointer to the input data to be hashed (8 bytes)
+ * @return uint8_t The computed hash value.
+ */
+static ZXC_ALWAYS_INLINE uint8_t zxc_hash8(const uint8_t* p) {
+    uint64_t state = zxc_le64(p) ^ ZXC_HASH_PRIME1;
+    state ^= state << 13;
+    state ^= state >> 7;
+    state ^= state << 17;
+    const uint64_t folded = state ^ (state >> 32);  // mix the high half into the low half
+    return (uint8_t)folded;                         // keep the low 8 bits
+}
+
+/**
+ * @brief Computes the 2-byte checksum for file headers.
+ *
+ * This function generates a hash value by reading data from the given pointer.
+ * The result is a 16-bit hash.
+ * Implementation based on Marsaglia's Xorshift (PRNG) principles.
+ *
+ * @param[in] p Pointer to the input data to be hashed (16 bytes)
+ * @return uint16_t The computed hash value.
+ */
+static ZXC_ALWAYS_INLINE uint16_t zxc_hash16(const uint8_t* p) {
+    const uint64_t lo = zxc_le64(p);
+    const uint64_t hi = zxc_le64(p + sizeof(uint64_t));
+    uint64_t state = lo ^ hi ^ ZXC_HASH_PRIME2;
+    state ^= state << 13;
+    state ^= state >> 7;
+    state ^= state << 17;
+    const uint32_t fold32 = (uint32_t)(state ^ (state >> 32));  // 64 -> 32 bits
+    return (uint16_t)(fold32 ^ (fold32 >> 16));                 // 32 -> 16 bits
+}
+
+/**
+ * @brief Copies 16 bytes from the source memory location to the destination memory location.
+ *
+ * This function is forced to be inlined and uses SIMD intrinsics when available:
+ * 128-bit SSE2 on x86 (also when AVX2/AVX-512 is enabled), NEON on ARM, or ZXC_MEMCPY as fallback.
+ *
+ * @param[out] dst Pointer to the destination memory block.
+ * @param[in] src Pointer to the source memory block.
+ */
+static ZXC_ALWAYS_INLINE void zxc_copy16(void* dst, const void* src) {
+#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512) || defined(ZXC_USE_SSE2)
+    // x86 SSE2/AVX2/AVX512: Single 128-bit unaligned load/store
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
+#elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));  // NEON: one 128-bit load/store
+#else
+    ZXC_MEMCPY(dst, src, 16);
+#endif
+}
+
+/**
+ * @brief Copies 32 bytes from source to destination using SIMD when available.
+ *
+ * Uses one 256-bit AVX2 copy, two 128-bit SSE2 or NEON copies, or a 32-byte ZXC_MEMCPY fallback.
+ *
+ * @param[out] dst Pointer to the destination memory block.
+ * @param[in] src Pointer to the source memory block.
+ */
+static ZXC_ALWAYS_INLINE void zxc_copy32(void* dst, const void* src) {
+#if defined(ZXC_USE_AVX2) || defined(ZXC_USE_AVX512)
+    // AVX2/AVX512: Single 256-bit (32 byte) unaligned load/store
+    _mm256_storeu_si256((__m256i*)dst, _mm256_loadu_si256((const __m256i*)src));
+#elif defined(ZXC_USE_SSE2)
+    // SSE2: Two 128-bit (16 byte) unaligned load/stores (no 256-bit regs)
+    _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src));
+    _mm_storeu_si128((__m128i*)((uint8_t*)dst + 16),
+                     _mm_loadu_si128((const __m128i*)((const uint8_t*)src + 16)));
+#elif defined(ZXC_USE_NEON64) || defined(ZXC_USE_NEON32)
+    // NEON: Two 128-bit (16 byte) unaligned load/stores
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+    vst1q_u8((uint8_t*)dst + 16, vld1q_u8((const uint8_t*)src + 16));
+#else
+    ZXC_MEMCPY(dst, src, 32);
+#endif
+}
+
+/**
+ * @brief Counts trailing zeros in a 32-bit unsigned integer.
+ *
+ * This function returns the number of contiguous zero bits starting from the
+ * least significant bit (LSB). If the input is 0, it returns 32.
+ *
+ * It utilizes compiler-specific built-ins for GCC/Clang (`__builtin_ctz`) and
+ * MSVC (`_BitScanForward`) for optimal performance. If no supported compiler
+ * is detected, it falls back to a portable De Bruijn sequence implementation.
+ *
+ * @param[in] x The 32-bit unsigned integer to scan.
+ * @return The number of trailing zeros (0-32).
+ */
+static ZXC_ALWAYS_INLINE int zxc_ctz32(const uint32_t x) {
+    if (x == 0) return 32;  // handled up front; the builtin/intrinsic paths expect x != 0
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_ctz(x);
+#elif defined(_MSC_VER)
+    unsigned long r;
+    _BitScanForward(&r, x);
+    return (int)r;
+#else
+    // De Bruijn fallback: isolate lowest set bit (x & -x); top 5 bits of the product pick the entry
+    static const int DeBruijn32[32] = {0,  1,  28, 2,  29, 14, 24, 3,  30, 22, 20,
+                                       15, 25, 17, 4,  8,  31, 27, 13, 23, 21, 19,
+                                       16, 7,  26, 12, 18, 6,  11, 5,  10, 9};
+    return DeBruijn32[((uint32_t)((x & (0U - x)) * 0x077CB531U)) >> 27];
+#endif
+}
+
+/**
+ * @brief Counts the number of trailing zeros in a 64-bit unsigned integer.
+ *
+ * This function determines the number of zero bits following the least significant
+ * one bit in the binary representation of `x`.
+ *
+ * @param[in] x The 64-bit unsigned integer to scan.
+ * @return The number of trailing zeros. Returns 64 if `x` is 0.
+ *
+ * @note Uses `__builtin_ctzll` on GCC/Clang and `_BitScanForward64` on 64-bit MSVC;
+ *       32-bit MSVC combines two `_BitScanForward` scans, and any other compiler
+ *       falls back to a De Bruijn sequence multiplication.
+ */
+static ZXC_ALWAYS_INLINE int zxc_ctz64(const uint64_t x) {
+    if (x == 0) return 64;
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_ctzll(x);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
+    unsigned long r;
+    _BitScanForward64(&r, x);
+    return (int)r;
+#elif defined(_MSC_VER)
+    // Use two 32-bit scans to avoid fragile 64-bit De Bruijn multiplication.
+    unsigned long r;
+    const uint32_t lo = (uint32_t)x;
+    if (_BitScanForward(&r, lo)) return (int)r;
+    _BitScanForward(&r, (uint32_t)(x >> 32));
+    return 32 + (int)r;
+#else
+    // Fallback De Bruijn for non-GCC/non-MSVC compilers
+    static const int Debruijn64[64] = {
+        0,  1,  48, 2,  57, 49, 28, 3,  61, 58, 50, 42, 38, 29, 17, 4,  62, 55, 59, 36, 53, 51,
+        43, 22, 45, 39, 33, 30, 24, 18, 12, 5,  63, 47, 56, 27, 60, 41, 37, 16, 54, 35, 52, 21,
+        44, 32, 23, 11, 46, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9,  13, 8,  7,  6};
+    return Debruijn64[((x & (0ULL - x)) * 0x03F79D71B4CA8B09ULL) >> 58];
+#endif
+}
+
+/**
+ * @brief Allocates aligned memory in a cross-platform manner.
+ *
+ * This function provides a unified interface for allocating memory with a specific
+ * alignment requirement. It wraps `_aligned_malloc` for Windows
+ * environments and `posix_memalign` for POSIX-compliant systems.
+ *
+ * @param[in] size The size of the memory block to allocate, in bytes.
+ * @param[in] alignment The alignment value, which must be a power of two and a multiple
+ *                  of `sizeof(void *)`.
+ * @return A pointer to the allocated memory block, or NULL if the allocation fails.
+ *         The returned pointer must be freed using the corresponding aligned free function.
+ */
+void* zxc_aligned_malloc(const size_t size, const size_t alignment);
+
+/**
+ * @brief Frees memory previously allocated with an aligned allocation function.
+ *
+ * This function provides a cross-platform wrapper for freeing aligned memory.
+ * On Windows, it calls `_aligned_free`.
+ * On other platforms, it falls back to the standard `free` function.
+ *
+ * @param[in] ptr A pointer to the memory block to be freed. If ptr is NULL, no operation is
+ * performed.
+ */
+void zxc_aligned_free(void* ptr);
+
+/*
+ * ============================================================================
+ * COMPRESSION CONTEXT & STRUCTS
+ * ============================================================================
+ */
+
+/*
+ * INTERNAL API
+ * ------------
+ */
+
+/**
+ * @brief Calculates a 32-bit hash for a given input buffer.
+ * @param[in] input Pointer to the data buffer.
+ * @param[in] len Length of the data in bytes.
+ * @param[in] hash_method Checksum algorithm identifier (e.g., ZXC_CHECKSUM_RAPIDHASH).
+ * @return The calculated 32-bit hash value.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_checksum(const void* RESTRICT input, const size_t len,
+                                               const uint8_t hash_method) {
+    const uint64_t digest = rapidhash(input, len);
+    const uint32_t folded = (uint32_t)(digest ^ (digest >> (sizeof(uint32_t) * CHAR_BIT)));
+    (void)hash_method; /* only one algorithm is implemented; dispatch on this when more exist */
+    return folded;
+}
+
+/**
+ * @brief Seeded variant of @ref zxc_checksum, for chaining a hash over
+ *        non-contiguous buffers: `zxc_checksum_seed(b, bn, zxc_checksum(a, an, m), m)`
+ *        hashes each byte once without a concat copy.
+ * @param[in] input Pointer to the data buffer.
+ * @param[in] len Length of the data in bytes.
+ * @param[in] seed Previous 32-bit checksum to chain from.
+ * @param[in] hash_method Checksum algorithm identifier (e.g., ZXC_CHECKSUM_RAPIDHASH).
+ * @return The calculated 32-bit hash value.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_checksum_seed(const void* RESTRICT input, const size_t len,
+                                                    const uint32_t seed,
+                                                    const uint8_t hash_method) {
+    const uint64_t digest = rapidhash_withSeed(input, len, seed);
+    const uint32_t folded = (uint32_t)(digest ^ (digest >> (sizeof(uint32_t) * CHAR_BIT)));
+    (void)hash_method; /* only one algorithm is implemented; dispatch on this when more exist */
+    return folded;
+}
+
+/**
+ * @brief Folds one block hash into a running, order-sensitive 32-bit checksum.
+ *
+ * The running value is rotated left by one bit (bit 31 wraps to bit 0) and
+ * then XORed with the block hash, so the result depends on the order in
+ * which block hashes are fed in.
+ *
+ * Formula: result = ((hash << 1) | (hash >> 31)) ^ block_hash
+ * @param[in] hash The current running hash value.
+ * @param[in] block_hash The hash of the new block to combine.
+ * @return The updated combined hash value.
+ */
+static ZXC_ALWAYS_INLINE uint32_t zxc_hash_combine_rotate(const uint32_t hash,
+                                                          const uint32_t block_hash) {
+    const uint32_t rotated = (hash << 1) | (hash >> 31);  // rotate left by one bit
+    return rotated ^ block_hash;
+}
+
+/**
+ * @brief Loads up to 8 bytes from memory in little-endian order into a uint64_t.
+ *
+ * Used for partial reads at stream boundaries where fewer than 8 bytes remain.
+ * Byte 0 always lands in the least-significant bits, whatever the host
+ * endianness; bytes that are not read are left as zero.
+ *
+ * @param[in] p Pointer to the source bytes.
+ * @param[in] n Number of bytes to read; values above 8 are clamped to 8 on every target.
+ * @return The loaded value in native host order, with bytes arranged as if
+ *         read from a little-endian stream.
+ */
+static ZXC_ALWAYS_INLINE uint64_t zxc_le_partial(const uint8_t* p, size_t n) {
+    uint64_t v = 0;
+    // Clamp on every target: without it the big-endian loop would shift by >= 64
+    // bits (undefined behaviour) for n > 8, unlike the little-endian path.
+    if (n > sizeof(v)) n = sizeof(v);
+#ifdef ZXC_BIG_ENDIAN
+    for (size_t i = 0; i < n; i++) v |= (uint64_t)p[i] << (i * CHAR_BIT);
+#else
+    ZXC_MEMCPY(&v, p, n);
+#endif
+    return v;
+}
+
+/**
+ * @brief Initializes a bit reader structure.
+ *
+ * Sets up the internal state of the bit reader to read from the specified
+ * source buffer.
+ *
+ * @param[out] br Pointer to the bit reader structure to initialize.
+ * @param[in] src Pointer to the source buffer containing the data to read.
+ * @param[in] size The size of the source buffer in bytes.
+ */
+static ZXC_ALWAYS_INLINE void zxc_br_init(zxc_bit_reader_t* RESTRICT br,
+                                          const uint8_t* RESTRICT src, const size_t size) {
+    br->end = src + size;
+    if (UNLIKELY(size < sizeof(uint64_t))) {
+        // Fewer than 8 bytes: load only what exists; the read pointer ends up at `end`.
+        br->accum = zxc_le_partial(src, size);
+        br->bits = (int)(size * CHAR_BIT);
+        br->ptr = src + size;
+        return;
+    }
+    // Common case: prime the accumulator with one full little-endian 64-bit word.
+    br->accum = zxc_le64(src);
+    br->bits = sizeof(uint64_t) * CHAR_BIT;
+    br->ptr = src + sizeof(uint64_t);
+}
+
+/**
+ * @brief Writes a generic header and section descriptors to a destination
+ * buffer.
+ *
+ * Serializes the `zxc_gnr_header_t` and an array of 4 section descriptors.
+ *
+ * @param[out] dst Pointer to the destination buffer.
+ * @param[in] rem The remaining space in the destination buffer.
+ * @param[in] gh Pointer to the generic header structure to write.
+ * @param[in] desc Array of 4 section descriptors to write.
+ * @return int The number of bytes written, or a negative error code if the buffer
+ * is too small.
+ */
+int zxc_write_glo_header_and_desc(uint8_t* RESTRICT dst, const size_t rem,
+                                  const zxc_gnr_header_t* RESTRICT gh,
+                                  const zxc_section_desc_t desc[ZXC_GLO_SECTIONS]);
+
+/**
+ * @brief Reads a generic header and section descriptors from a source buffer.
+ *
+ * Deserializes data into a `zxc_gnr_header_t` and an array of 4 section
+ * descriptors.
+ *
+ * @param[in] src Pointer to the source buffer.
+ * @param[in] len The length of the source buffer available for reading.
+ * @param[out] gh Pointer to the generic header structure to populate.
+ * @param[out] desc Array of 4 section descriptors to populate.
+ *
+ * @return int Returns ZXC_OK on success, or a negative zxc_error_t code on failure.
+ */
+int zxc_read_glo_header_and_desc(const uint8_t* RESTRICT src, const size_t len,
+                                 zxc_gnr_header_t* RESTRICT gh,
+                                 zxc_section_desc_t desc[ZXC_GLO_SECTIONS]);
+
+/**
+ * @brief Writes a GHI block header and its section descriptors to the destination buffer.
+ *
+ * @param[out] dst Pointer to the destination buffer where the header and descriptors are written.
+ * @param[in] rem Remaining size available in the destination buffer.
+ * @param[in] gh Pointer to the GNR header structure containing header information.
+ * @param[in] desc Array of 3 section descriptors to be written along with the header.
+ *
+ * @return int Returns the number of bytes written on success, or a negative error code on failure.
+ */
+int zxc_write_ghi_header_and_desc(uint8_t* RESTRICT dst, const size_t rem,
+                                  const zxc_gnr_header_t* RESTRICT gh,
+                                  const zxc_section_desc_t desc[ZXC_GHI_SECTIONS]);
+
+/**
+ * @brief Reads a record header and section descriptors from a buffer.
+ *
+ * This function parses the source buffer to extract a general header and
+ * up to three section descriptors from a ZXC record.
+ *
+ * @param[in] src Pointer to the source buffer containing the record data.
+ * @param[in] len Length of the source buffer in bytes.
+ * @param[out] gh Pointer to a zxc_gnr_header_t structure to store the parsed header.
+ * @param[out] desc Array of 3 zxc_section_desc_t structures to store the parsed section
+ * descriptors.
+ *
+ * @return int Returns ZXC_OK on success, or a negative zxc_error_t code on failure.
+ */
+int zxc_read_ghi_header_and_desc(const uint8_t* RESTRICT src, const size_t len,
+                                 zxc_gnr_header_t* RESTRICT gh,
+                                 zxc_section_desc_t desc[ZXC_GHI_SECTIONS]);
+
+/* ============================================================================
+ * Huffman codec for the GLO literal stream (level >= 6).
+ *
+ * On-disk layout, decoder geometry and tunables: see
+ * @ref ZXC_HUF_MAX_CODE_LEN and the surrounding "Huffman Codec Constants"
+ * group above.
+ * ============================================================================
+ */
+
+/**
+ * @brief Build length-limited canonical Huffman code lengths from a frequency table.
+ *
+ * Uses the boundary package-merge algorithm capped at `ZXC_HUF_MAX_CODE_LEN`.
+ * Symbols with `freq[i] == 0` get `code_len[i] == 0`; others receive a value
+ * in `[1, ZXC_HUF_MAX_CODE_LEN]`.
+ *
+ * @param[in]  freq     Frequency table of length `ZXC_HUF_NUM_SYMBOLS`.
+ * @param[out] code_len Output code-length array of length `ZXC_HUF_NUM_SYMBOLS`.
+ * @param[in]  scratch  Optional caller-owned scratch buffer of at least
+ *                      ::ZXC_HUF_BUILD_SCRATCH_SIZE bytes. If `NULL`, the
+ *                      function allocates its own working memory and frees
+ *                      it before returning.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` code on failure.
+ */
+int zxc_huf_build_code_lengths(const uint32_t* RESTRICT freq, uint8_t* RESTRICT code_len,
+                               void* RESTRICT scratch);
+
+/**
+ * @brief Encode the literal stream into a Huffman section payload.
+ *
+ * Writes the 128-byte length header, the 6-byte sub-stream size table and
+ * the 4 concatenated LSB-first bit-streams.
+ *
+ * @param[in]  literals   Source literal bytes (must not alias `dst`).
+ * @param[in]  n_literals Number of source bytes.
+ * @param[in]  code_len   Per-symbol code lengths produced by
+ *                        ::zxc_huf_build_code_lengths.
+ * @param[out] dst        Destination buffer for the section payload.
+ * @param[in]  dst_cap    Capacity of @p dst in bytes.
+ * @return Total bytes written on success, negative `zxc_error_t` code on failure.
+ */
+int zxc_huf_encode_section(const uint8_t* RESTRICT literals, const size_t n_literals,
+                           const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                           const size_t dst_cap);
+
+/**
+ * @brief Decode a Huffman literal section payload of `payload_size` bytes.
+ *
+ * Writes exactly `n_literals` decoded bytes into @p dst.
+ *
+ * @param[in]  payload      Section payload (header + 4 sub-streams).
+ * @param[in]  payload_size Total payload length in bytes.
+ * @param[out] dst          Destination buffer (must not alias @p payload).
+ * @param[in]  n_literals   Expected number of decoded bytes.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` code on failure.
+ */
+int zxc_huf_decode_section(const uint8_t* RESTRICT payload, const size_t payload_size,
+                           uint8_t* RESTRICT dst, const size_t n_literals);
+
+/**
+ * @brief Encode a Huffman literal section using externally supplied code
+ *        lengths, WITHOUT the 128-byte lengths header (shared dictionary
+ *        table). Output: 6-byte sub-stream sizes header + 4 sub-streams.
+ *
+ * @return Bytes written on success, negative `zxc_error_t` code on failure
+ *         (including `ZXC_ERROR_CORRUPT_DATA` if a literal has no code).
+ */
+int zxc_huf_encode_section_dict(const uint8_t* RESTRICT literals, const size_t n_literals,
+                                const uint8_t* RESTRICT code_len, uint8_t* RESTRICT dst,
+                                const size_t dst_cap);
+
+/**
+ * @brief Decode a Huffman literal section that carries no lengths header,
+ *        using a prebuilt decode table (shared dictionary table).
+ *
+ * @param[in]  payload      Section payload (6-byte sizes header + 4 sub-streams).
+ * @param[in]  payload_size Total payload length in bytes.
+ * @param[out] dst          Destination buffer (must not alias @p payload).
+ * @param[in]  n_literals   Expected number of decoded bytes.
+ * @param[in]  table        Prebuilt @ref ZXC_HUF_DEC_TABLE_SIZE-entry decode table.
+ * @return `ZXC_OK` on success, negative `zxc_error_t` code on failure.
+ */
+int zxc_huf_decode_section_dict(const uint8_t* RESTRICT payload, const size_t payload_size,
+                                uint8_t* RESTRICT dst, const size_t n_literals,
+                                const zxc_huf_dec_entry_t* RESTRICT table);
+
+/**
+ * @brief Build the @ref ZXC_HUF_DEC_TABLE_SIZE-entry decode table from per-symbol
+ *        code lengths. Validates Kraft equality.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on invalid lengths.
+ */
+int zxc_huf_build_dec_table(const uint8_t* RESTRICT code_len, zxc_huf_dec_entry_t* RESTRICT table);
+
+/**
+ * @brief Pack per-symbol code lengths into the 128-byte (4-bit nibble) header.
+ */
+void zxc_huf_pack_lengths(const uint8_t* RESTRICT code_len, uint8_t* RESTRICT out);
+
+/**
+ * @brief Unpack and structurally validate a 128-byte packed lengths header.
+ * @return `ZXC_OK` on success, `ZXC_ERROR_CORRUPT_DATA` on invalid lengths.
+ */
+int zxc_huf_unpack_lengths(const uint8_t* RESTRICT in, uint8_t* RESTRICT code_len);
+
+/* ---------------------------------------------------------------------------
+ * Compression / decompression context.
+ *
+ * The context owns the working buffers (hash table, sequence buffers, scratch
+ * memory) that the encoder and decoder reuse across blocks. It used to be
+ * exposed via zxc_sans_io.h, but no consumer outside of the library itself
+ * needs to drive it directly - the public buffer / streaming / seekable APIs
+ * already provide opaque wrappers (`zxc_create_cctx` / `zxc_create_dctx`).
+ * Keeping the layout private lets us evolve the buffer layout (cache-line
+ * placement, additional scratch arenas) without breaking the ABI.
+ * --------------------------------------------------------------------------- */
+
+/**
+ * @struct zxc_cctx_t
+ * @brief Compression / decompression context.
+ *
+ * Holds the buffers reused across blocks to avoid repeated allocations.
+ * The same structure is used for both directions; zxc_cctx_init() sizes the
+ * buffers for the requested mode (1 = compression, 0 = decompression).
+ *
+ * **Key fields:**
+ * - @c hash_table: epoch-tagged positions (`ZXC_LZ_HASH_SIZE` * 4 bytes).
+ * - @c hash_tags:  8-bit tags for fast match rejection
+ *   (`ZXC_LZ_HASH_SIZE` * 1 byte).
+ * - @c chain_table: collision chain storing the *previous* occurrence of a
+ *   hash, forming a linked list per bucket and enabling history traversal.
+ * - @c memory_block: owner of the single backing allocation; e.g.
+ *   @c dict_buffer and @c dict_huf_table are carved out of it.
+ * - @c epoch: drives "lazy hash table invalidation". Instead of memset-ing
+ *   the hash table for every block, we store `(epoch << offset_bits) | offset`; an
+ *   entry whose stored epoch differs from `ctx->epoch` is treated as empty.
+ */
+typedef struct {
+    /* Hot zone: random access / high frequency.
+     * Kept at the start to ensure they reside in the first cache line (64 bytes). */
+    uint32_t* hash_table;  /**< Hash table for LZ77 match positions (epoch|pos). */
+    uint8_t* hash_tags;    /**< Split tag table for fast match rejection (8-bit tags). */
+    uint16_t* chain_table; /**< Chain table for collision resolution. */
+    void* memory_block;    /**< Single allocation block owner. */
+    uint32_t epoch;        /**< Current epoch for lazy hash table invalidation. */
+
+    /* Warm zone: sequential access per sequence. */
+    uint32_t* buf_sequences; /**< Buffer for sequence records (packed: LL(8)|ML(8)|Offset(16)). */
+    uint8_t* buf_tokens;     /**< Buffer for token sequences. */
+    uint16_t* buf_offsets;   /**< Buffer for offsets. */
+    uint8_t* buf_extras;     /**< Buffer for extra lengths (vbytes for LL/ML). */
+    uint8_t* literals;       /**< Buffer for literal bytes. */
+
+    /* Cold zone: configuration / scratch / resizeable. */
+    uint8_t* lit_buffer;                 /**< Scratch buffer for literals (RLE / Huffman). */
+    size_t lit_buffer_cap;               /**< Current capacity of the scratch buffer. */
+    uint8_t* work_buf;                   /**< Padded scratch buffer for buffer-API decompression. */
+    size_t work_buf_cap;                 /**< Capacity of the work buffer. */
+    uint8_t* opt_scratch;                /**< Optimal-parser DP scratch (level >= 6 only,
+                                              lazy-allocated, packs dp/parent_len/parent_off/actions).
+                                              Also reused as transient scratch for the
+                                              length-limited Huffman code-length builder. */
+    size_t opt_scratch_cap;              /**< Current capacity of opt_scratch in bytes. */
+    int checksum_enabled;                /**< 1 if checksum calculation/verification is enabled. */
+    int compression_level;               /**< Compression level. */
+    size_t dict_size;                    /**< Dictionary prefill size (0 = no dictionary). */
+    uint8_t* dict_buffer;                /**< [dict | data] concat scratch carved from memory_block
+                                              when dict_size > 0 (NULL otherwise). */
+    size_t dict_buffer_cap;              /**< Capacity of dict_buffer in bytes (0 = none). */
+    const uint8_t* dict_huf_lengths;     /**< Shared dictionary literal table: 128-byte
+                                              packed code-lengths header (NULL = none). Set via
+                                              zxc_cctx_attach_dict_huf; caller-owned memory. */
+    zxc_huf_dec_entry_t* dict_huf_table; /**< Decode table built once from
+                                              dict_huf_lengths; carved from memory_block when
+                                              mode == 0 (decompression) and dict_size > 0
+                                              (NULL otherwise). */
+    uint32_t* lit_freq_acc;              /**< Trainer hook: when non-NULL, the GLO encoder
+                                              accumulates post-LZ literal byte frequencies here
+                                              (256 entries). NULL outside dictionary training. */
+
+    /* Block-size derived parameters (computed once at init). */
+    size_t chunk_size;    /**< Effective block size in bytes. */
+    uint32_t offset_bits; /**< log2(chunk_size) - governs epoch_mark shift. */
+    uint32_t offset_mask; /**< (1U << offset_bits) - 1 */
+    uint32_t max_epoch;   /**< 1U << (32 - offset_bits) */
+} zxc_cctx_t;
+
+/**
+ * @brief Initialises a ZXC compression / decompression context in place.
+ *
+ * Allocates the internal buffers (hash table, sequence buffers, scratch) sized
+ * for @p chunk_size and the requested @p mode.
+ *
+ * Buffers acquired here are released with @ref zxc_cctx_free. To place the
+ * context inside caller-provided memory instead, see
+ * @ref zxc_cctx_init_in_workspace.
+ *
+ * @param[out] ctx               Context to initialise.
+ * @param[in]  chunk_size        Block size driving buffer sizing.
+ * @param[in]  mode              1 for compression, 0 for decompression.
+ * @param[in]  level             Compression level (ignored when @p mode == 0).
+ * @param[in]  checksum_enabled  Non-zero to enable checksum computation.
+ * @param[in]  dict_size         Dictionary prefill size; when > 0 an extra
+ *                               [dict | data] concat buffer is carved into the
+ *                               workspace and @c ctx->dict_buffer is set.
+ *
+ * @return @c ZXC_OK on success, or a negative @ref zxc_error_t code (notably
+ *         @c ZXC_ERROR_MEMORY on allocation failure).
+ */
+int zxc_cctx_init(zxc_cctx_t* ctx, const size_t chunk_size, const int mode, const int level,
+                  const int checksum_enabled, const size_t dict_size);
+
+/**
+ * @brief Attach the shared dictionary literal table to an initialised context.
+ *
+ * Stores @p lengths (128-byte packed code-lengths header, caller-owned, must
+ * outlive the context's use) and, on decompression contexts created with
+ * @c dict_size > 0, builds the decode table once into the workspace-carved
+ * @c dict_huf_table. A NULL @p lengths is a no-op.
+ *
+ * @param[in,out] ctx      Initialised context that receives the table.
+ * @param[in]     lengths  128-byte packed code-lengths header, or NULL for
+ *                         no-op. The memory stays owned by the caller.
+ *
+ * @return @ref ZXC_OK on success, @ref ZXC_ERROR_CORRUPT_DATA if the lengths
+ *         header is structurally invalid (bad nibble, Kraft inequality).
+ */
+int zxc_cctx_attach_dict_huf(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT lengths);
+
+/**
+ * @brief Returns the byte count that @ref zxc_cctx_init would allocate for
+ *        the given parameters.
+ *
+ * Used by the static-cctx public API to size a caller-supplied workspace
+ * before calling @ref zxc_cctx_init_in_workspace.
+ *
+ * @param[in] chunk_size  Block size in bytes (must satisfy
+ *                        @ref zxc_validate_block_size).
+ * @param[in] mode        1 = compression, 0 = decompression.
+ * @param[in] level       Compression level (only consulted when @p mode == 1).
+ * @param[in] dict_size   Dictionary prefill size; when > 0 the figure includes
+ *                        the [dict | data] concat buffer.
+ * @return Size in bytes, or 0 if the parameters are invalid.
+ */
+size_t zxc_cctx_compute_workspace_size(const size_t chunk_size, const int mode, const int level,
+                                       const size_t dict_size);
+
+/**
+ * @brief Initialises a compression / decompression context inside a
+ *        caller-supplied workspace.
+ *
+ * Identical to @ref zxc_cctx_init except that the persistent buffer is
+ * carved out of @p workspace instead of being @c ZXC_ALIGNED_MALLOC'd
+ * internally.  @p workspace must be cache-line aligned and at least as
+ * large as @ref zxc_cctx_compute_workspace_size for the same parameters.
+ *
+ * The caller owns @p workspace and must keep it alive for the lifetime of
+ * @p ctx.  @ref zxc_cctx_free becomes a no-op for contexts initialised
+ * this way (the workspace is not freed by the library).
+ *
+ * @param[out] ctx               Context to initialise (zeroed on entry).
+ * @param[in]  workspace         Caller-allocated, cache-line-aligned buffer.
+ * @param[in]  workspace_size    Capacity of @p workspace in bytes.
+ * @param[in]  chunk_size        Block size in bytes.
+ * @param[in]  mode              1 = compression, 0 = decompression.
+ * @param[in]  level             Compression level (ignored when @p mode == 0).
+ * @param[in]  checksum_enabled  Non-zero to enable checksum computation.
+ * @param[in]  dict_size         Dictionary prefill size; when > 0 the workspace
+ *                               must include the [dict | data] concat buffer and
+ *                               @c ctx->dict_buffer is set into it.
+ * @return @c ZXC_OK on success, @c ZXC_ERROR_DST_TOO_SMALL if the workspace
+ *         is too small, or another negative @ref zxc_error_t.
+ */
+int zxc_cctx_init_in_workspace(zxc_cctx_t* RESTRICT ctx, void* RESTRICT workspace,
+                               const size_t workspace_size, const size_t chunk_size, const int mode,
+                               const int level, const int checksum_enabled, const size_t dict_size);
+
+/**
+ * @brief Releases the internal buffers owned by a context.
+ *
+ * Does NOT free @p ctx itself - the caller owns the struct storage. The
+ * context may safely be re-initialised with zxc_cctx_init() afterwards.
+ *
+ * For a context set up with zxc_cctx_init_in_workspace() the workspace is
+ * caller-owned and is not freed by this call (see that function's contract).
+ *
+ * @param[in,out] ctx Context whose buffers should be released.
+ */
+void zxc_cctx_free(zxc_cctx_t* ctx);
+
+/**
+ * @brief Internal wrapper function to decompress a single chunk of data.
+ *
+ * This function handles the decompression of a specific chunk from the source
+ * buffer into the destination buffer using the provided context. It serves as
+ * an abstraction layer over the core decompression logic.
+ *
+ * @param[in]  ctx     Pointer to the ZXC context structure containing internal
+ *                     state and configuration (taken as pointer-to-const).
+ * @param[in]  src     Pointer to the source buffer containing compressed data.
+ * @param[in]  src_sz  Size of the compressed data in the source buffer (in bytes).
+ * @param[out] dst     Pointer to the destination buffer where decompressed data
+ *                     will be written.
+ * @param[in]  dst_cap Capacity of the destination buffer (maximum bytes that can
+ *                     be written).
+ *
+ * @return ZXC_OK on success, or a negative zxc_error_t code on failure.
+ *         Specific error codes depend on the underlying ZXC implementation.
+ *         NOTE(review): the signature has no out-parameter for the decompressed
+ *         size - confirm whether a non-negative return value carries it.
+ */
+int zxc_decompress_chunk_wrapper(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                 const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap);
+
+/**
+ * @brief Dictionary variant of zxc_decompress_chunk_wrapper(); same parameter
+ *        list and return type.
+ *
+ * NOTE(review): presumably intended for contexts initialised with
+ * @c dict_size > 0 - confirm against the implementation.
+ */
+int zxc_decompress_chunk_wrapper_dict(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src,
+                                      const size_t src_sz, uint8_t* RESTRICT dst,
+                                      const size_t dst_cap);
+
+/**
+ * @brief Wraps the internal chunk compression logic.
+ *
+ * This function acts as a wrapper to compress a single chunk of data using the
+ * provided compression context. It handles the interaction with the underlying
+ * compression algorithm for a specific block of memory.
+ *
+ * @param[in,out] ctx     Pointer to the ZXC compression context containing
+ *                        configuration and state.
+ * @param[in]     chunk   Pointer to the source buffer containing the raw data
+ *                        to compress.
+ * @param[in]     src_sz  The size of the source chunk in bytes.
+ * @param[out]    dst     Pointer to the destination buffer where compressed
+ *                        data will be written.
+ * @param[in]     dst_cap The capacity of the destination buffer (maximum bytes
+ *                        to write).
+ *
+ * @return The number of bytes written to the destination buffer on success,
+ *         or a negative error code on failure.
+ */
+int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT chunk,
+                               const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap);
+
+/* ---------------------------------------------------------------------------
+ * Internal frame primitives.
+ *
+ * Read/write the ZXC file header, block header, and file footer. These were
+ * previously exposed via zxc_sans_io.h but no in-tree consumer outside of the
+ * library implementation needs them, and exposing them freezes on-disk layout
+ * details (block_flags layout, footer composition) that we want to keep free
+ * to evolve until the format is declared stable.
+ * --------------------------------------------------------------------------- */
+
+/**
+ * @brief On-disk header structure for a ZXC block (8 bytes, little-endian).
+ *
+ * @c raw_size is not stored in the header; decoders derive it from Section
+ * Descriptors within the compressed payload.
+ *
+ * This is the in-memory form; zxc_write_block_header() serialises it to bytes
+ * and zxc_read_block_header() parses it back.
+ */
+typedef struct {
+    uint8_t block_type;  /**< Block type (see @ref zxc_block_type_t). */
+    uint8_t block_flags; /**< Flags (e.g., checksum presence). */
+    uint8_t reserved;    /**< Reserved for future protocol extensions. */
+    uint8_t header_crc;  /**< Header integrity checksum (1 byte). */
+    uint32_t comp_size;  /**< Compressed size excluding this header. */
+} zxc_block_header_t;
+
+/**
+ * @brief Writes the standard ZXC file header into @p dst.
+ *
+ * Stores the magic word (little-endian) and the version number into the
+ * provided buffer, after checking that it has sufficient capacity. The
+ * block size, checksum flag and dictionary ID passed in are encoded into the
+ * header as well (see the parameters below).
+ *
+ * @param[out] dst           Destination buffer.
+ * @param[in]  dst_capacity  Total capacity of @p dst in bytes.
+ * @param[in]  chunk_size    Block size to encode in the header.
+ * @param[in]  has_checksum  Non-zero if the checksum bit must be set.
+ * @param[in]  dict_id       Dictionary ID (0 = no dictionary).
+ *
+ * @return Number of bytes written (@c ZXC_FILE_HEADER_SIZE) on success,
+ *         or @c ZXC_ERROR_DST_TOO_SMALL if @p dst_capacity is insufficient.
+ */
+int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size,
+                          const int has_checksum, const uint32_t dict_id);
+
+/**
+ * @brief Validates and reads the ZXC file header from @p src.
+ *
+ * Checks that the source buffer is large enough to contain a ZXC file header
+ * and that the magic word and version number match the expected format.
+ *
+ * @param[in]  src               Pointer to the source buffer.
+ * @param[in]  src_size          Size of the source buffer in bytes.
+ * @param[out] out_block_size    Optional pointer that receives the recommended
+ *                               block size. May be @c NULL.
+ * @param[out] out_has_checksum  Optional pointer that receives the checksum
+ *                               flag. May be @c NULL.
+ * @param[out] out_dict_id       Optional pointer that receives the dictionary
+ *                               ID (0 if none). May be @c NULL.
+ *
+ * @return @c ZXC_OK on success, or a negative error code (e.g.
+ *         @c ZXC_ERROR_SRC_TOO_SMALL, @c ZXC_ERROR_BAD_MAGIC,
+ *         @c ZXC_ERROR_BAD_VERSION).
+ */
+int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, size_t* out_block_size,
+                         int* out_has_checksum, uint32_t* out_dict_id);
+
+/**
+ * @brief Encodes a block header into @p dst.
+ *
+ * Serialises the contents of a @ref zxc_block_header_t structure into a byte
+ * array in little-endian format, after checking that @p dst has sufficient
+ * capacity.
+ *
+ * @param[out] dst           Destination buffer.
+ * @param[in]  dst_capacity  Total capacity of @p dst in bytes.
+ * @param[in]  bh            Source block header structure to serialise.
+ *
+ * @return Number of bytes written (@c ZXC_BLOCK_HEADER_SIZE) on success,
+ *         or @c ZXC_ERROR_DST_TOO_SMALL if @p dst_capacity is insufficient.
+ */
+int zxc_write_block_header(uint8_t* RESTRICT dst, const size_t dst_capacity,
+                           const zxc_block_header_t* bh);
+
+/**
+ * @brief Reads and parses a ZXC block header from @p src.
+ *
+ * Extracts the block type, flags, reserved fields, and compressed size from
+ * the first @c ZXC_BLOCK_HEADER_SIZE bytes of @p src. Multi-byte fields are
+ * decoded as little-endian.
+ *
+ * @param[in]  src       Source buffer holding the encoded block header.
+ * @param[in]  src_size  Size of @p src in bytes.
+ * @param[out] bh        Block header structure populated with the parsed data.
+ *
+ * @return @c ZXC_OK on success, or @c ZXC_ERROR_SRC_TOO_SMALL if @p src is
+ *         smaller than @c ZXC_BLOCK_HEADER_SIZE.
+ */
+int zxc_read_block_header(const uint8_t* RESTRICT src, const size_t src_size,
+                          zxc_block_header_t* bh);
+
+/**
+ * @brief Writes the ZXC file footer into @p dst.
+ *
+ * The footer stores the original uncompressed size and an optional global
+ * checksum. It is always @c ZXC_FILE_FOOTER_SIZE (12) bytes long.
+ *
+ * @param[out] dst               Destination buffer.
+ * @param[in]  dst_capacity      Total capacity of @p dst in bytes.
+ * @param[in]  src_size          Original uncompressed size of the data.
+ * @param[in]  global_hash       Global checksum hash (used only when
+ *                               @p checksum_enabled is non-zero).
+ * @param[in]  checksum_enabled  Non-zero if the checksum should be emitted.
+ *
+ * @return Number of bytes written (@c ZXC_FILE_FOOTER_SIZE) on success,
+ *         or @c ZXC_ERROR_DST_TOO_SMALL on failure.
+ */
+int zxc_write_file_footer(uint8_t* RESTRICT dst, const size_t dst_capacity, const uint64_t src_size,
+                          const uint32_t global_hash, const int checksum_enabled);
+
+/* ---------------------------------------------------------------------------
+ * Seekable cross-TU hooks (defined in zxc_seekable.c, consumed by the
+ * FILE*-flavored open helper in zxc_driver.c).
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @brief Hands ownership of a heap-allocated reader context to a seekable
+ *        handle.  The context will be released via @c ZXC_FREE when
+ *        @ref zxc_seekable_free is called on @p s.
+ *
+ * Safe to call exactly once per handle.  Intended for thin wrappers that
+ * build a @ref zxc_reader_t over their own allocated state
+ * (@ref zxc_seekable_open_file) and need that state to outlive the call
+ * site.
+ *
+ * @param[in,out] s    Seekable handle returned by @ref zxc_seekable_open_reader.
+ * @param[in]     ctx  Pointer previously returned by @c ZXC_MALLOC / @c ZXC_CALLOC.
+ */
+void zxc_seekable_attach_owned_ctx(zxc_seekable* s, void* ctx);
+
+/** @} */ /* end of internal */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // ZXC_INTERNAL_H
\ No newline at end of file
diff --git a/thirdparty/zxc/src/lib/zxc_pstream.c b/thirdparty/zxc/src/lib/zxc_pstream.c
new file mode 100644
index 000000000000..eb97bdb00f37
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_pstream.c
@@ -0,0 +1,1126 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_pstream.c
+ * @brief Push-based, single-threaded streaming driver implementation.
+ *
+ * See zxc_pstream.h for the public contract.  The implementation composes
+ * the public block API (@ref zxc_compress_block / @ref zxc_decompress_block)
+ * with the internal frame primitives (@ref zxc_write_file_header,
+ * @ref zxc_write_block_header, @ref zxc_write_file_footer, @c zxc_read_*);
+ * those primitives, the shared constants and the global-hash combine inline
+ * are all pulled from zxc_internal.h.
+ *
+ * Both compression and decompression are structured as resumable state
+ * machines driven by caller-provided input/output buffers
+ * (@ref zxc_inbuf_t / @ref zxc_outbuf_t).  Each call advances as much as
+ * possible without blocking and returns a status indicating whether the
+ * caller should drain @p out, supply more @p in, or finalise the stream.
+ */
+
+#include "../../include/zxc_pstream.h"
+
+#include "../../include/zxc_buffer.h"
+#include "../../include/zxc_constants.h"
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/* ===================================================================== */
+/*  Compression                                                          */
+/* ===================================================================== */
+
+/**
+ * @enum cstream_state_t
+ * @brief Lifecycle states of the push compression stream.
+ *
+ * The compression state machine alternates between *staging* a chunk of
+ * output bytes into the @c pending buffer and *draining* those bytes into
+ * the caller's @ref zxc_outbuf_t.  Forward progress is therefore always
+ * either consuming from @c in or producing into @c out.
+ *
+ * @var cstream_state_t::CS_INIT
+ *      Initial state; nothing has been emitted yet.
+ * @var cstream_state_t::CS_DRAIN_HEADER
+ *      File header staged in @c pending; draining to @p out, then transitions
+ *      to @c CS_ACCUMULATE.
+ * @var cstream_state_t::CS_ACCUMULATE
+ *      Copying input bytes into the internal block accumulator until it is
+ *      full (or the stream is finalised).
+ * @var cstream_state_t::CS_DRAIN_BLOCK
+ *      A full data block was just compressed; draining it to @p out, then
+ *      back to @c CS_ACCUMULATE.
+ * @var cstream_state_t::CS_DRAIN_LAST
+ *      Draining the final partial block produced inside @ref zxc_cstream_end;
+ *      transitions to @c CS_DRAIN_EOF.
+ * @var cstream_state_t::CS_DRAIN_EOF
+ *      Draining the EOF block; transitions to @c CS_DRAIN_FOOTER.
+ * @var cstream_state_t::CS_DRAIN_FOOTER
+ *      Draining the file footer; transitions to @c CS_DONE.
+ * @var cstream_state_t::CS_DONE
+ *      Finalisation complete; further @c _compress / @c _end calls are
+ *      rejected.
+ * @var cstream_state_t::CS_ERRORED
+ *      Sticky error state; subsequent calls return the latched error code.
+ */
+typedef enum {
+    CS_INIT = 0,     /* Nothing has been emitted yet. */
+    CS_DRAIN_HEADER, /* File header staged in pending; next: CS_ACCUMULATE. */
+    CS_ACCUMULATE,   /* Filling the block accumulator from caller input. */
+    CS_DRAIN_BLOCK,  /* Full block compressed; draining, then CS_ACCUMULATE. */
+    CS_DRAIN_LAST,   /* Final partial block draining; next: CS_DRAIN_EOF. */
+    CS_DRAIN_EOF,    /* EOF block draining; next: CS_DRAIN_FOOTER. */
+    CS_DRAIN_FOOTER, /* File footer draining; next: CS_DONE. */
+    CS_DONE,         /* Finalisation complete. */
+    CS_ERRORED       /* Sticky error; error_code holds the latched value. */
+} cstream_state_t;
+
+/**
+ * @struct zxc_cstream_s
+ * @brief Internal state of a push compression stream.
+ *
+ * Owns three buffers: a fixed-size input accumulator (@c in_block, sized
+ * to one block), a variable-size output staging area (@c pending, holding
+ * the file header, one compressed block, the EOF block, or the file
+ * footer), and the underlying compression context (@c cctx).
+ *
+ * @var zxc_cstream_s::opts
+ *      Compression options (copied from the caller at creation time).
+ * @var zxc_cstream_s::cctx
+ *      Underlying single-block compression context.
+ * @var zxc_cstream_s::block_size
+ *      Target block size in bytes; cached from @c opts.block_size.
+ * @var zxc_cstream_s::in_block
+ *      Heap buffer of capacity @c block_size used to accumulate one full
+ *      uncompressed block before invoking the block compressor.
+ * @var zxc_cstream_s::in_used
+ *      Number of valid bytes currently held in @c in_block (in
+ *      [0, @c block_size]).
+ * @var zxc_cstream_s::pending
+ *      Heap buffer holding the next chunk of output bytes to emit
+ *      (header, compressed block, EOF marker or footer).
+ * @var zxc_cstream_s::pending_cap
+ *      Allocated capacity of @c pending.
+ * @var zxc_cstream_s::pending_len
+ *      Total valid bytes currently staged in @c pending.
+ * @var zxc_cstream_s::pending_pos
+ *      Bytes already copied from @c pending to the caller's output buffer.
+ *      Drain is complete when @c pending_pos == @c pending_len.
+ * @var zxc_cstream_s::total_in
+ *      Running count of uncompressed bytes consumed; written into the file
+ *      footer.
+ * @var zxc_cstream_s::global_hash
+ *      Rolling per-block-trailer hash; written into the file footer when
+ *      checksums are enabled.
+ * @var zxc_cstream_s::state
+ *      Current state machine position (see @ref cstream_state_t).
+ * @var zxc_cstream_s::error_code
+ *      Sticky error code; valid only when @c state is @c CS_ERRORED.
+ */
+struct zxc_cstream_s {
+    zxc_compress_opts_t opts; /* Caller options, copied at creation time. */
+    zxc_cctx* cctx;           /* Underlying single-block compression context. */
+    size_t block_size;        /* Target block size, cached from opts.block_size. */
+
+    /* Input accumulator: holds one uncompressed block. */
+    uint8_t* in_block; /* Heap buffer of block_size bytes. */
+    size_t in_used;    /* Valid bytes currently held in in_block. */
+
+    /* Output staging area (file header, compressed block, EOF marker or footer). */
+    uint8_t* pending;
+    size_t pending_cap; /* Allocated capacity of pending. */
+    size_t pending_len; /* Bytes currently staged in pending. */
+    size_t pending_pos; /* Bytes already copied out; drained when == pending_len. */
+
+    /* Values written into the file footer. */
+    uint64_t total_in;    /* Uncompressed bytes consumed so far. */
+    uint32_t global_hash; /* Rolling hash over block trailers (checksums enabled). */
+
+    cstream_state_t state; /* Current state-machine position. */
+    int error_code;        /* Latched error; meaningful only in CS_ERRORED. */
+};
+
+/**
+ * @brief Puts the compression stream into its sticky error state.
+ *
+ * Records @p code as the latched error and switches the state machine to
+ * @c CS_ERRORED, so later calls on the stream can report the same failure.
+ *
+ * @param[in,out] cs   Compression stream to mark as failed.
+ * @param[in]     code Error code to latch (a negative @ref zxc_error_t).
+ * @return @p code, unchanged, so call sites can `return cs_set_error(...)`.
+ */
+static int cs_set_error(zxc_cstream* cs, const int code) {
+    cs->state = CS_ERRORED;
+    cs->error_code = code;
+    return cs->error_code;
+}
+
+/**
+ * @brief Compresses whatever is currently held in the input accumulator.
+ *
+ * Ensures @c cs->pending can hold the worst-case output reported by
+ * @ref zxc_compress_block_bound (growing it when required), runs the block
+ * compressor on @c cs->in_block, and then updates the stream bookkeeping:
+ * @c pending_len / @c pending_pos describe the fresh output, @c total_in
+ * grows by the consumed byte count and @c in_used is reset to 0.  With
+ * checksums enabled, the trailing @c ZXC_BLOCK_CHECKSUM_SIZE bytes of the
+ * compressed block are folded into @c global_hash.
+ *
+ * On failure the bookkeeping fields are left untouched; the error is only
+ * returned, not latched on the stream.
+ *
+ * @pre @c cs->in_used > 0.
+ *
+ * @param[in,out] cs Compression stream.
+ * @return @ref ZXC_OK on success, negative @ref zxc_error_t on failure.
+ */
+static int cs_compress_one_block(zxc_cstream* cs) {
+    const size_t raw_len = cs->in_used;
+    const uint64_t worst_case = zxc_compress_block_bound(raw_len);
+    // LCOV_EXCL_START
+    if (UNLIKELY(worst_case == 0)) return ZXC_ERROR_OVERFLOW;
+    if (UNLIKELY(worst_case > SIZE_MAX)) return ZXC_ERROR_OVERFLOW;
+    if (UNLIKELY(cs->pending_cap < worst_case)) {
+        uint8_t* const grown = (uint8_t*)ZXC_REALLOC(cs->pending, (size_t)worst_case);
+        if (UNLIKELY(!grown)) return ZXC_ERROR_MEMORY;
+        cs->pending_cap = (size_t)worst_case;
+        cs->pending = grown;
+    }
+    // LCOV_EXCL_STOP
+    const int64_t written = zxc_compress_block(cs->cctx, cs->in_block, raw_len, cs->pending,
+                                               cs->pending_cap, &cs->opts);
+    if (UNLIKELY(written < 0)) return (int)written;  // LCOV_EXCL_LINE
+
+    /* The accumulated input is now fully represented by the staged output. */
+    cs->total_in += raw_len;
+    cs->in_used = 0;
+    cs->pending_pos = 0;
+    cs->pending_len = (size_t)written;
+
+    /* With checksums on, the block trailer occupies the final
+     * ZXC_BLOCK_CHECKSUM_SIZE bytes of the staged output. */
+    if (cs->opts.checksum_enabled && cs->pending_len >= ZXC_BLOCK_CHECKSUM_SIZE) {
+        const uint32_t trailer_hash =
+            zxc_le32(cs->pending + (cs->pending_len - ZXC_BLOCK_CHECKSUM_SIZE));
+        cs->global_hash = zxc_hash_combine_rotate(cs->global_hash, trailer_hash);
+    }
+    return ZXC_OK;
+}
+
+/**
+ * @brief Moves staged output into the caller's buffer.
+ *
+ * Transfers the largest possible run of bytes from the unread part of
+ * @c cs->pending (starting at @c pending_pos) to the free part of
+ * @c out->dst (starting at @c out->pos), then advances both cursors by the
+ * number of bytes copied.  Nothing is copied when either side has no room.
+ *
+ * @param[in,out] cs  Compression stream.
+ * @param[in,out] out Caller output buffer.
+ * @return Non-zero when nothing remains staged
+ *         (@c pending_pos == @c pending_len), zero otherwise.
+ */
+static int cs_drain_pending(zxc_cstream* cs, zxc_outbuf_t* out) {
+    const size_t room = out->size - out->pos;
+    size_t chunk = cs->pending_len - cs->pending_pos;
+    if (room < chunk) chunk = room;
+
+    if (chunk != 0) {
+        uint8_t* const sink = (uint8_t*)out->dst + out->pos;
+        ZXC_MEMCPY(sink, cs->pending + cs->pending_pos, chunk);
+        cs->pending_pos += chunk;
+        out->pos += chunk;
+    }
+    return cs->pending_len == cs->pending_pos;
+}
+
+/**
+ * @brief Creates a push compression stream.
+ *
+ * The stream starts from a zeroed structure.  When @p opts is given it is
+ * copied in; zero-valued @c level and @c block_size are then replaced by
+ * @ref ZXC_LEVEL_DEFAULT and @ref ZXC_BLOCK_SIZE_DEFAULT.  Settings that
+ * belong to the @c FILE*-based pipeline are overridden: @c n_threads is set
+ * to 0, the progress callback and its user data are cleared, and seekable
+ * framing is switched off.
+ *
+ * Three resources are then acquired in order - the block compression
+ * context, the one-block input accumulator, and the @c pending staging
+ * buffer (pre-sized to the larger of the file header and file footer so
+ * those paths never need to grow it).  If any step fails, everything
+ * acquired so far is released and @c NULL is returned.
+ *
+ * @param[in] opts Compression options, or @c NULL for full defaults.
+ * @return New stream owned by the caller (release with
+ *         @ref zxc_cstream_free), or @c NULL on allocation failure /
+ *         invalid option values.
+ */
+zxc_cstream* zxc_cstream_create(const zxc_compress_opts_t* opts) {
+    zxc_cstream* cs = (zxc_cstream*)ZXC_CALLOC(1, sizeof(*cs));
+    if (UNLIKELY(!cs)) return NULL;  // LCOV_EXCL_LINE
+
+    /* Caller options first (all-zero when none were supplied), then defaults
+     * for the zero-valued knobs. */
+    if (opts) cs->opts = *opts;
+    if (cs->opts.level == 0) cs->opts.level = ZXC_LEVEL_DEFAULT;
+    if (cs->opts.block_size == 0) cs->opts.block_size = ZXC_BLOCK_SIZE_DEFAULT;
+
+    /* This path is single-threaded and has no progress / seekable support. */
+    cs->opts.seekable = 0;
+    cs->opts.user_data = NULL;
+    cs->opts.progress_cb = NULL;
+    cs->opts.n_threads = 0;
+    cs->block_size = cs->opts.block_size;
+
+    cs->cctx = zxc_create_cctx(&cs->opts);
+    // LCOV_EXCL_START
+    if (UNLIKELY(!cs->cctx)) goto fail_stream;
+    cs->in_block = (uint8_t*)ZXC_MALLOC(cs->block_size);
+    if (UNLIKELY(!cs->in_block)) goto fail_cctx;
+    // LCOV_EXCL_STOP
+
+    /* Large enough for both the file header and the file footer. */
+    cs->pending_cap =
+        ZXC_FILE_HEADER_SIZE > ZXC_FILE_FOOTER_SIZE ? ZXC_FILE_HEADER_SIZE : ZXC_FILE_FOOTER_SIZE;
+    cs->pending = (uint8_t*)ZXC_MALLOC(cs->pending_cap);
+    if (UNLIKELY(!cs->pending)) goto fail_in_block;  // LCOV_EXCL_LINE
+
+    cs->state = CS_INIT;
+    return cs;
+
+    /* Unwind in reverse acquisition order; each label releases one resource
+     * and falls through to the next. */
+    // LCOV_EXCL_START
+fail_in_block:
+    ZXC_FREE(cs->in_block);
+fail_cctx:
+    zxc_free_cctx(cs->cctx);
+fail_stream:
+    ZXC_FREE(cs);
+    return NULL;
+    // LCOV_EXCL_STOP
+}
+
+/**
+ * @brief Writes the file header into the @c pending staging buffer.
+ *
+ * The header is produced by @ref zxc_write_file_header from the stream's
+ * block size and checksum setting, with a dictionary ID of 0.  On success
+ * the staged length is set to the number of bytes written and the drain
+ * cursor is rewound to the start.
+ *
+ * @param[in,out] cs Compression stream.
+ * @return @ref ZXC_OK on success, negative @ref zxc_error_t on failure.
+ */
+static int cs_stage_file_header(zxc_cstream* cs) {
+    const int written = zxc_write_file_header(cs->pending, cs->pending_cap, cs->block_size,
+                                              cs->opts.checksum_enabled, 0);
+    if (UNLIKELY(written < 0)) return written;  // LCOV_EXCL_LINE
+    cs->pending_pos = 0;
+    cs->pending_len = (size_t)written;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Writes the EOF block into the @c pending staging buffer.
+ *
+ * The EOF marker is an ordinary block header whose @c block_type is
+ * @ref ZXC_BLOCK_EOF and whose remaining fields (flags, reserved, CRC,
+ * @c comp_size) are all zero on input to @ref zxc_write_block_header; no
+ * payload follows it.  @c pending is grown first if it cannot hold
+ * @c ZXC_BLOCK_HEADER_SIZE bytes.
+ *
+ * @param[in,out] cs Compression stream.
+ * @return @ref ZXC_OK on success, negative @ref zxc_error_t on failure.
+ */
+static int cs_stage_eof(zxc_cstream* cs) {
+    // LCOV_EXCL_START
+    if (UNLIKELY(cs->pending_cap < ZXC_BLOCK_HEADER_SIZE)) {
+        uint8_t* const grown = (uint8_t*)ZXC_REALLOC(cs->pending, ZXC_BLOCK_HEADER_SIZE);
+        if (UNLIKELY(!grown)) return ZXC_ERROR_MEMORY;
+        cs->pending_cap = ZXC_BLOCK_HEADER_SIZE;
+        cs->pending = grown;
+    }
+    // LCOV_EXCL_STOP
+    /* Start from an all-zero header and set only the block type. */
+    zxc_block_header_t marker = {0};
+    marker.block_type = (uint8_t)ZXC_BLOCK_EOF;
+
+    const int written = zxc_write_block_header(cs->pending, cs->pending_cap, &marker);
+    if (UNLIKELY(written < 0)) return written;  // LCOV_EXCL_LINE
+    cs->pending_pos = 0;
+    cs->pending_len = (size_t)written;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Writes the file footer into the @c pending staging buffer.
+ *
+ * The footer is produced by @ref zxc_write_file_footer from the running
+ * uncompressed byte count (@c total_in), the rolling @c global_hash and the
+ * stream's checksum setting.  @c pending is grown first if it cannot hold
+ * @c ZXC_FILE_FOOTER_SIZE bytes.
+ *
+ * @param[in,out] cs Compression stream.
+ * @return @ref ZXC_OK on success, negative @ref zxc_error_t on failure.
+ */
+static int cs_stage_footer(zxc_cstream* cs) {
+    // LCOV_EXCL_START
+    if (UNLIKELY(cs->pending_cap < ZXC_FILE_FOOTER_SIZE)) {
+        uint8_t* const grown = (uint8_t*)ZXC_REALLOC(cs->pending, ZXC_FILE_FOOTER_SIZE);
+        if (UNLIKELY(!grown)) return ZXC_ERROR_MEMORY;
+        cs->pending_cap = ZXC_FILE_FOOTER_SIZE;
+        cs->pending = grown;
+    }
+    // LCOV_EXCL_STOP
+    const int written = zxc_write_file_footer(cs->pending, cs->pending_cap, cs->total_in,
+                                              cs->global_hash, cs->opts.checksum_enabled);
+    if (UNLIKELY(written < 0)) return written;  // LCOV_EXCL_LINE
+    cs->pending_pos = 0;
+    cs->pending_len = (size_t)written;
+    return ZXC_OK;
+}
+
+/**
+ * @brief Destroys a compression stream.
+ *
+ * Frees the staging buffer, the input accumulator, the underlying
+ * compression context and finally the stream object itself.  Passing
+ * @c NULL does nothing.
+ *
+ * @param[in,out] cs Stream returned by @ref zxc_cstream_create, or @c NULL.
+ */
+void zxc_cstream_free(zxc_cstream* cs) {
+    if (cs) {
+        ZXC_FREE(cs->pending);
+        ZXC_FREE(cs->in_block);
+        zxc_free_cctx(cs->cctx);
+        ZXC_FREE(cs);
+    }
+}
+
+/**
+ * @brief Reports the suggested input chunk size, i.e. the configured block size.
+ *
+ * @param[in] cs Compression stream.
+ * @return Block size in bytes, or 0 if @p cs is @c NULL.
+ */
+size_t zxc_cstream_in_size(const zxc_cstream* cs) { return (cs == NULL) ? 0 : cs->block_size; }
+
+/**
+ * @brief Reports a suggested output buffer capacity for the compressor.
+ *
+ * The value is @ref zxc_compress_block_bound of the configured block size;
+ * if that bound is 0 or does not fit in @c size_t, @c block_size is used.
+ *
+ * @param[in] cs Compression stream.
+ * @return Suggested output buffer capacity in bytes, or 0 if @p cs is @c NULL.
+ */
+size_t zxc_cstream_out_size(const zxc_cstream* cs) {
+    if (cs == NULL) return 0;
+    const uint64_t bound = zxc_compress_block_bound(cs->block_size);
+    if (bound != 0 && bound <= SIZE_MAX) return (size_t)bound;
+    return cs->block_size;
+}
+
+/**
+ * @brief Feeds caller input to the compressor and drains staged output.
+ *
+ * Walks the @ref cstream_state_t machine: the first call stages the file
+ * header; afterwards input is copied into the block buffer until it holds
+ * @c block_size bytes, the block is compressed, and the staged result is
+ * drained into @p out.  The loop keeps going until @p in is exhausted or
+ * @p out cannot take the staged bytes, so calls can be resumed at any point.
+ *
+ * Finalisation states (@c CS_DRAIN_LAST, @c CS_DRAIN_EOF, @c CS_DRAIN_FOOTER,
+ * @c CS_DONE) belong to @ref zxc_cstream_end and are rejected here with
+ * @ref ZXC_ERROR_NULL_INPUT; an errored stream returns its latched code.
+ *
+ * @param[in,out] cs  Compression stream.
+ * @param[in,out] out Caller output buffer.
+ * @param[in,out] in  Caller input buffer; @c pos is advanced.
+ * @return @c 0 if @p in was fully consumed and nothing is pending,
+ *         @c >0 number of bytes still pending (drain @p out then call again),
+ *         @c <0 a @ref zxc_error_t code.
+ */
+int64_t zxc_cstream_compress(zxc_cstream* cs, zxc_outbuf_t* out, zxc_inbuf_t* in) {
+    if (UNLIKELY(!cs || !out || !in)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(in->pos > in->size || (in->pos < in->size && !in->src)))
+        return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(out->pos > out->size || (out->pos < out->size && !out->dst)))
+        return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(cs->state == CS_DONE)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(cs->state == CS_ERRORED)) return cs->error_code;
+
+    for (;;) {
+        switch (cs->state) {
+            case CS_INIT: {
+                const int hdr_rc = cs_stage_file_header(cs);
+                if (UNLIKELY(hdr_rc < 0)) return cs_set_error(cs, hdr_rc);  // LCOV_EXCL_LINE
+                cs->state = CS_DRAIN_HEADER;
+                continue;
+            }
+
+            case CS_DRAIN_HEADER:
+            case CS_DRAIN_BLOCK: {
+                /* Flush staged bytes first; report what is left if out is full. */
+                if (!cs_drain_pending(cs, out)) return (int64_t)(cs->pending_len - cs->pending_pos);
+                cs->state = CS_ACCUMULATE;
+                continue;
+            }
+
+            case CS_ACCUMULATE: {
+                const size_t space = cs->block_size - cs->in_used;
+                size_t take = in->size - in->pos;
+                if (take > space) take = space;
+                if (take != 0) {
+                    ZXC_MEMCPY(cs->in_block + cs->in_used, (const uint8_t*)in->src + in->pos, take);
+                    cs->in_used += take;
+                    in->pos += take;
+                }
+
+                /* Partial block: all available input was absorbed, wait for more. */
+                if (cs->in_used != cs->block_size) return 0;
+                const int blk_rc = cs_compress_one_block(cs);
+                if (UNLIKELY(blk_rc < 0)) return cs_set_error(cs, blk_rc);  // LCOV_EXCL_LINE
+                cs->state = CS_DRAIN_BLOCK;
+                continue;
+            }
+
+            case CS_DRAIN_LAST:
+            case CS_DRAIN_EOF:
+            case CS_DRAIN_FOOTER:
+            case CS_DONE:
+            case CS_ERRORED:
+                /* Finalisation states belong to zxc_cstream_end(). */
+                return ZXC_ERROR_NULL_INPUT;
+        }
+    }
+}
+
+/**
+ * @brief Finalises the stream: residual block (if any), EOF, and footer.
+ *
+ * Continues the state machine of @ref zxc_cstream_compress through
+ * @c CS_DRAIN_LAST -> @c CS_DRAIN_EOF -> @c CS_DRAIN_FOOTER -> @c CS_DONE.
+ * When @p out fills mid-drain the call returns and the next one resumes.
+ *
+ * @param[in,out] cs  Compression stream.
+ * @param[in,out] out Caller output buffer; requires @c pos <= @c size and a
+ *                    non-NULL @c dst whenever @c pos < @c size.
+ * @return @c 0 once finalisation is complete (stream is now in DONE state),
+ *         @c >0 number of bytes still pending (drain and call again),
+ *         @c <0 a @ref zxc_error_t code (@ref ZXC_ERROR_NULL_INPUT for bad
+ *         arguments or a stream that is already DONE).
+ */
+int64_t zxc_cstream_end(zxc_cstream* cs, zxc_outbuf_t* out) {
+    /* Same argument contract as zxc_cstream_compress(): the write cursor must lie
+     * inside the buffer, and a non-empty window needs a destination pointer. */
+    if (UNLIKELY(!cs || !out || out->pos > out->size || (out->size > out->pos && !out->dst) ||
+                 cs->state == CS_DONE)) {
+        return ZXC_ERROR_NULL_INPUT;
+    }
+    if (UNLIKELY(cs->state == CS_ERRORED)) return cs->error_code;
+
+    for (;;) {
+        switch (cs->state) {
+            case CS_INIT: {
+                /* _end before any input, still need to emit file header. */
+                const int rc = cs_stage_file_header(cs);
+                if (UNLIKELY(rc < 0)) return cs_set_error(cs, rc);  // LCOV_EXCL_LINE
+                cs->state = CS_DRAIN_HEADER;
+                break;
+            }
+
+            case CS_DRAIN_HEADER:
+            case CS_DRAIN_BLOCK: {
+                /* File header, or a full block staged by _compress, still pending. */
+                if (!cs_drain_pending(cs, out)) return (int64_t)(cs->pending_len - cs->pending_pos);
+                cs->state = CS_ACCUMULATE;
+                break;
+            }
+
+            case CS_ACCUMULATE: {
+                /* Compress the residual partial block (if any), then EOF + footer. */
+                if (cs->in_used > 0) {
+                    const int rc = cs_compress_one_block(cs);
+                    if (UNLIKELY(rc < 0)) return cs_set_error(cs, rc);  // LCOV_EXCL_LINE
+                    cs->state = CS_DRAIN_LAST;
+                    break;
+                }
+                /* No residual data: go straight to EOF. */
+                {
+                    const int rc = cs_stage_eof(cs);
+                    if (UNLIKELY(rc < 0)) return cs_set_error(cs, rc);  // LCOV_EXCL_LINE
+                    cs->state = CS_DRAIN_EOF;
+                    break;
+                }
+            }
+
+            case CS_DRAIN_LAST: {
+                if (!cs_drain_pending(cs, out)) return (int64_t)(cs->pending_len - cs->pending_pos);
+                /* After last data block -> EOF. */
+                const int rc = cs_stage_eof(cs);
+                if (UNLIKELY(rc < 0)) return cs_set_error(cs, rc);  // LCOV_EXCL_LINE
+                cs->state = CS_DRAIN_EOF;
+                break;
+            }
+
+            case CS_DRAIN_EOF: {
+                if (!cs_drain_pending(cs, out)) return (int64_t)(cs->pending_len - cs->pending_pos);
+                const int rc = cs_stage_footer(cs);
+                if (UNLIKELY(rc < 0)) return cs_set_error(cs, rc);  // LCOV_EXCL_LINE
+                cs->state = CS_DRAIN_FOOTER;
+                break;
+            }
+
+            case CS_DRAIN_FOOTER: {
+                if (!cs_drain_pending(cs, out)) return (int64_t)(cs->pending_len - cs->pending_pos);
+                cs->state = CS_DONE;
+                return 0;
+            }
+
+            case CS_DONE:
+            case CS_ERRORED:
+                return cs->state == CS_ERRORED ? cs->error_code : 0;
+        }
+    }
+}
+
+/* ===================================================================== */
+/*  Decompression                                                        */
+/* ===================================================================== */
+
+/**
+ * @enum dstream_state_t
+ * @brief Lifecycle states of the push decompression stream.
+ *
+ * The decompressor implements a frame-aware parser: file header -> N
+ * (data block header + payload [+ optional checksum]) -> EOF block ->
+ * optional SEK index block -> file footer.  The states alternate between
+ * *pulling* fixed- or variable-sized chunks from the caller's input, and
+ * *emitting* the corresponding decoded output.
+ *
+ * @var dstream_state_t::DS_NEED_FILE_HEADER
+ *      Pulling the 16-byte file header into @c scratch.
+ * @var dstream_state_t::DS_NEED_BLOCK_HEADER
+ *      Pulling an 8-byte block header into @c scratch.
+ * @var dstream_state_t::DS_NEED_BLOCK_PAYLOAD
+ *      Pulling a data block payload (and optional checksum) into @c payload.
+ * @var dstream_state_t::DS_DECODE_BLOCK
+ *      Calling the underlying block decoder on the accumulated payload.
+ * @var dstream_state_t::DS_EMIT_DECODED
+ *      Draining decoded bytes from @c decoded into @p out.
+ * @var dstream_state_t::DS_PEEK_TAIL
+ *      Just past the EOF block: read 8 bytes and peek to disambiguate
+ *      between an optional SEK index block and the file footer.
+ * @var dstream_state_t::DS_DRAIN_SEK_PAYLOAD
+ *      The peeked 8 bytes were a SEK header; skipping its payload bytes.
+ * @var dstream_state_t::DS_NEED_FOOTER_FULL
+ *      Pulling the full 12-byte file footer (used after a SEK block).
+ * @var dstream_state_t::DS_NEED_FOOTER_REST
+ *      The 8 peeked bytes were the head of the footer; pulling the
+ *      remaining 4 bytes.
+ * @var dstream_state_t::DS_VALIDATE_FOOTER
+ *      Validating @c total_out and the optional global hash.
+ * @var dstream_state_t::DS_DONE
+ *      Stream fully consumed and validated.
+ * @var dstream_state_t::DS_ERRORED
+ *      Sticky error state; subsequent calls return the latched code.
+ */
+typedef enum {
+    DS_NEED_FILE_HEADER = 0, /* next: DS_NEED_BLOCK_HEADER */
+    DS_NEED_BLOCK_HEADER, /* next: DS_NEED_BLOCK_PAYLOAD, or DS_PEEK_TAIL on an EOF block */
+    DS_NEED_BLOCK_PAYLOAD, /* next: DS_DECODE_BLOCK */
+    DS_DECODE_BLOCK, /* next: DS_EMIT_DECODED */
+    DS_EMIT_DECODED, /* next: DS_NEED_BLOCK_HEADER */
+    DS_PEEK_TAIL, /* next: DS_DRAIN_SEK_PAYLOAD (SEK header) or DS_NEED_FOOTER_REST */
+    DS_DRAIN_SEK_PAYLOAD, /* next: DS_NEED_FOOTER_FULL */
+    DS_NEED_FOOTER_FULL, /* next: DS_VALIDATE_FOOTER */
+    DS_NEED_FOOTER_REST, /* next: DS_VALIDATE_FOOTER */
+    DS_VALIDATE_FOOTER, /* next: DS_DONE */
+    DS_DONE, /* terminal */
+    DS_ERRORED /* terminal; set by ds_set_error() */
+} dstream_state_t;
+
+/**
+ * @struct zxc_dstream_s
+ * @brief Internal state of a push decompression stream.
+ *
+ * Owns three accumulator regions: a fixed-size @c scratch array embedded in
+ * the struct for headers/footers/peeks, a heap @c payload buffer for variable
+ * block payloads, and a heap @c decoded buffer that holds one decompressed
+ * block before it is emitted to the caller.
+ *
+ * @var zxc_dstream_s::opts
+ *      Decompression options (copied at creation time).
+ * @var zxc_dstream_s::inner
+ *      Underlying single-block decompression context.
+ * @var zxc_dstream_s::inner_initialized
+ *      Non-zero once @c inner has been initialised; gates the matching
+ *      @ref zxc_cctx_free call at teardown.
+ * @var zxc_dstream_s::block_size
+ *      Block size declared by the file header; 0 until the header is parsed.
+ * @var zxc_dstream_s::file_has_checksum
+ *      File-level checksum flag declared by the file header.
+ * @var zxc_dstream_s::scratch
+ *      Generic 32-byte accumulator for fixed-size frames (file header, block
+ *      header, footer); comfortably holds the largest (16-byte file header).
+ * @var zxc_dstream_s::scratch_used
+ *      Number of bytes currently held in @c scratch.
+ * @var zxc_dstream_s::scratch_need
+ *      Target number of bytes for the current accumulation phase.
+ * @var zxc_dstream_s::payload
+ *      Heap buffer holding one block: header + compressed payload + optional
+ *      checksum.
+ * @var zxc_dstream_s::payload_cap
+ *      Allocated capacity of @c payload.
+ * @var zxc_dstream_s::payload_used
+ *      Number of valid bytes currently in @c payload.
+ * @var zxc_dstream_s::payload_need
+ *      Target byte count for the current payload phase
+ *      (= header size + comp_size + checksum size).
+ * @var zxc_dstream_s::decoded
+ *      Heap buffer holding the decoded output of one block (sized for the
+ *      wild-copy fast path: @c block_size + @ref ZXC_DECOMPRESS_TAIL_PAD).
+ * @var zxc_dstream_s::decoded_cap
+ *      Allocated capacity of @c decoded.
+ * @var zxc_dstream_s::decoded_size
+ *      Real number of decoded bytes in @c decoded after the last decode.
+ * @var zxc_dstream_s::decoded_pos
+ *      Bytes already emitted from @c decoded; drain complete when
+ *      @c decoded_pos == @c decoded_size.
+ * @var zxc_dstream_s::cur_bh
+ *      Parsed block header for the block currently being processed.
+ * @var zxc_dstream_s::sek_remaining
+ *      Bytes left to skip from a SEK block payload (only used in
+ *      @c DS_DRAIN_SEK_PAYLOAD).
+ * @var zxc_dstream_s::total_out
+ *      Cumulative decompressed output size; cross-checked against the
+ *      file footer.
+ * @var zxc_dstream_s::global_hash
+ *      Rolling per-block-trailer hash; cross-checked against the file
+ *      footer when checksums are enabled.
+ * @var zxc_dstream_s::state
+ *      Current state machine position (see @ref dstream_state_t).
+ * @var zxc_dstream_s::error_code
+ *      Sticky error code; valid only when @c state is @c DS_ERRORED.
+ */
+struct zxc_dstream_s {
+    zxc_decompress_opts_t opts;
+    zxc_cctx_t inner; /* set up by zxc_cctx_init() once the file header is parsed */
+    int inner_initialized; /* becomes 1 only after zxc_cctx_init() succeeded */
+    size_t block_size;
+    int file_has_checksum;
+
+    uint8_t scratch[32];
+    size_t scratch_used;
+    size_t scratch_need; /* cumulative target; a peeked tail is extended in place */
+
+    uint8_t* payload;
+    size_t payload_cap;
+    size_t payload_used; /* starts at ZXC_BLOCK_HEADER_SIZE: the header is copied in first */
+    size_t payload_need;
+
+    uint8_t* decoded;
+    size_t decoded_cap;
+    size_t decoded_size;
+    size_t decoded_pos;
+
+    zxc_block_header_t cur_bh;
+    size_t sek_remaining;
+
+    uint64_t total_out; /* grows as bytes are drained to the caller, not at decode time */
+    uint32_t global_hash; /* updated only when opts.checksum_enabled && file_has_checksum */
+
+    dstream_state_t state;
+    int error_code;
+};
+
+/**
+ * @brief Puts the decompression stream into its sticky error state.
+ *
+ * Records @p code in @c ds->error_code and switches @c ds->state to
+ * @c DS_ERRORED.  The value is handed back unchanged so call sites can
+ * return it directly; later @ref zxc_dstream_decompress calls see the
+ * errored state and return the latched code without doing any work.
+ *
+ * @param[in,out] ds   Decompression stream.
+ * @param[in]     code Negative @ref zxc_error_t value to latch.
+ * @return @p code, unchanged.
+ */
+static int ds_set_error(zxc_dstream* ds, const int code) {
+    ds->state = DS_ERRORED;
+    ds->error_code = code;
+    return code;
+}
+
+/**
+ * @brief Tops up the inline @c scratch buffer from the caller's input.
+ *
+ * Copies at most @c (scratch_need - scratch_used) bytes, limited by what
+ * @p in still holds.  Used for fixed-size frames and for the tail peek.
+ *
+ * @param[in,out] ds Decompression stream.
+ * @param[in,out] in Caller input buffer; @c pos is advanced.
+ * @return @c 1 once @c scratch holds exactly @c scratch_need bytes,
+ *         @c 0 otherwise (need more input).
+ */
+static int ds_pull_scratch(zxc_dstream* ds, zxc_inbuf_t* in) {
+    const size_t missing = ds->scratch_need - ds->scratch_used;
+    size_t take = in->size - in->pos;
+    if (take > missing) take = missing;
+    if (take != 0) {
+        ZXC_MEMCPY(ds->scratch + ds->scratch_used, (const uint8_t*)in->src + in->pos, take);
+        ds->scratch_used += take;
+        in->pos += take;
+    }
+    return ds->scratch_used == ds->scratch_need;
+}
+
+/**
+ * @brief Tops up the heap @c payload buffer from the caller's input.
+ *
+ * Counterpart of @ref ds_pull_scratch for the variable-size block frame
+ * (header + comp_size [+ checksum]) tracked by @c payload_need.
+ *
+ * @param[in,out] ds Decompression stream.
+ * @param[in,out] in Caller input buffer; @c pos is advanced.
+ * @return @c 1 once @c payload holds exactly @c payload_need bytes,
+ *         @c 0 otherwise (need more input).
+ */
+static int ds_pull_payload(zxc_dstream* ds, zxc_inbuf_t* in) {
+    const size_t missing = ds->payload_need - ds->payload_used;
+    size_t take = in->size - in->pos;
+    if (take > missing) take = missing;
+    if (take != 0) {
+        ZXC_MEMCPY(ds->payload + ds->payload_used, (const uint8_t*)in->src + in->pos, take);
+        ds->payload_used += take;
+        in->pos += take;
+    }
+    return ds->payload_used == ds->payload_need;
+}
+
+/**
+ * @brief Creates a push decompression stream in its initial state.
+ *
+ * The stream is obtained from @c ZXC_CALLOC, then @p opts (if given) is
+ * copied in.  The @c n_threads, @c progress_cb and @c user_data options are
+ * always cleared.  Parsing starts at the file header; @c block_size,
+ * @c file_has_checksum and the @c payload / @c decoded buffers are filled
+ * in only once that header has been read.
+ *
+ * @param[in] opts Decompression options, or @c NULL for full defaults.
+ * @return New stream owned by the caller, or @c NULL on allocation failure.
+ */
+zxc_dstream* zxc_dstream_create(const zxc_decompress_opts_t* opts) {
+    zxc_dstream* const stream = (zxc_dstream*)ZXC_CALLOC(1, sizeof(*stream));
+    if (UNLIKELY(stream == NULL)) return NULL;  // LCOV_EXCL_LINE
+    if (opts != NULL) stream->opts = *opts;
+    stream->opts.user_data = NULL;
+    stream->opts.progress_cb = NULL;
+    stream->opts.n_threads = 0;
+    stream->scratch_need = ZXC_FILE_HEADER_SIZE;
+    stream->state = DS_NEED_FILE_HEADER;
+    return stream;
+}
+
+/**
+ * @brief Destroys a decompression stream together with the buffers it owns.
+ *        Passing @c NULL is allowed and does nothing.
+ *
+ * @param[in,out] ds Stream returned by @ref zxc_dstream_create.
+ */
+void zxc_dstream_free(zxc_dstream* ds) {
+    if (ds != NULL) {
+        ZXC_FREE(ds->payload);
+        ZXC_FREE(ds->decoded);
+        if (ds->inner_initialized) zxc_cctx_free(&ds->inner);
+        ZXC_FREE(ds);
+    }
+}
+
+/**
+ * @brief Tells whether the stream has reached @c DS_DONE.
+ *
+ * @param[in] ds Decompression stream (may be @c NULL).
+ * @return @c 1 if DONE, @c 0 otherwise (including errored or @c NULL).
+ */
+int zxc_dstream_finished(const zxc_dstream* ds) { return ds != NULL && ds->state == DS_DONE; }
+
+/**
+ * @brief Reports a suggested input buffer capacity for the decompressor.
+ *
+ * While @c block_size is still 0 this is @ref ZXC_BLOCK_SIZE_DEFAULT.
+ * Afterwards it is @ref zxc_compress_block_bound of @c block_size, or
+ * @c block_size itself when that bound is 0 or does not fit in @c size_t.
+ *
+ * @param[in] ds Decompression stream.
+ * @return Suggested input buffer capacity in bytes, or 0 if @p ds is @c NULL.
+ */
+size_t zxc_dstream_in_size(const zxc_dstream* ds) {
+    if (ds == NULL) return 0;
+    if (ds->block_size == 0) return ZXC_BLOCK_SIZE_DEFAULT;
+    const uint64_t bound = zxc_compress_block_bound(ds->block_size);
+    return (bound != 0 && bound <= SIZE_MAX) ? (size_t)bound : ds->block_size;
+}
+
+/**
+ * @brief Reports a suggested output buffer capacity for the decompressor.
+ *
+ * This is @c block_size, or @ref ZXC_BLOCK_SIZE_DEFAULT while it is still 0.
+ *
+ * @param[in] ds Decompression stream.
+ * @return Suggested output buffer capacity in bytes, or 0 if @p ds is @c NULL.
+ */
+size_t zxc_dstream_out_size(const zxc_dstream* ds) {
+    if (ds == NULL) return 0;
+    if (ds->block_size != 0) return ds->block_size;
+    return ZXC_BLOCK_SIZE_DEFAULT;
+}
+
+/**
+ * @brief Copies not-yet-emitted bytes of @c ds->decoded into @p out.
+ *
+ * Moves as many bytes of @c decoded[decoded_pos..decoded_size) as @p out has
+ * room for.  The copied count is added to @c ds->total_out and, if
+ * @p produced is non-NULL, to @c *produced as well.
+ *
+ * @param[in,out] ds       Decompression stream.
+ * @param[in,out] out      Caller output buffer; @c pos is advanced.
+ * @param[in,out] produced Optional running count of bytes produced this call.
+ * @return @c 1 once @c decoded is fully drained, @c 0 otherwise.
+ */
+static int ds_drain_decoded(zxc_dstream* ds, zxc_outbuf_t* out, size_t* produced) {
+    const size_t left = ds->decoded_size - ds->decoded_pos;
+    size_t chunk = out->size - out->pos;
+    if (chunk > left) chunk = left;
+    if (chunk != 0) {
+        ZXC_MEMCPY((uint8_t*)out->dst + out->pos, ds->decoded + ds->decoded_pos, chunk);
+        ds->decoded_pos += chunk;
+        ds->total_out += chunk;
+        out->pos += chunk;
+        if (produced != NULL) *produced += chunk;
+    }
+    return ds->decoded_pos == ds->decoded_size;
+}
+
+/**
+ * @brief Handles the @c DS_NEED_FILE_HEADER state.
+ *
+ * Accumulates the file header in @c scratch and parses it with
+ * @ref zxc_read_file_header to learn @c block_size and the file-level
+ * checksum flag.  It then allocates the two working buffers: @c payload,
+ * sized by @ref zxc_compress_block_bound, and @c decoded, sized
+ * @c block_size + @ref ZXC_DECOMPRESS_TAIL_PAD.  Finally it initialises the
+ * inner context via @ref zxc_cctx_init and arms @c scratch for the first
+ * block header (@c DS_NEED_BLOCK_HEADER).
+ *
+ * @param[in,out] ds Decompression stream.
+ * @param[in,out] in Caller input buffer.
+ * @return @c 1 if more input is needed, @c 0 to continue the outer loop,
+ *         negative @ref zxc_error_t on validation/allocation failure.
+ */
+static int ds_handle_need_file_header(zxc_dstream* ds, zxc_inbuf_t* in) {
+    if (!ds_pull_scratch(ds, in)) return 1;
+
+    size_t block_size = 0;
+    int has_checksum = 0;
+    const int hdr_rc =
+        zxc_read_file_header(ds->scratch, ds->scratch_used, &block_size, &has_checksum, NULL);
+    if (UNLIKELY(hdr_rc != ZXC_OK)) return ds_set_error(ds, hdr_rc);  // LCOV_EXCL_LINE
+    ds->file_has_checksum = has_checksum;
+    ds->block_size = block_size;
+
+    /* block_size is known from here on, so both working buffers can be sized. */
+    const uint64_t bound = zxc_compress_block_bound(block_size);
+    // LCOV_EXCL_START
+    if (UNLIKELY(bound == 0 || bound > SIZE_MAX)) return ds_set_error(ds, ZXC_ERROR_OVERFLOW);
+    // LCOV_EXCL_STOP
+    ds->payload_cap = (size_t)bound;
+    ds->decoded_cap = block_size + ZXC_DECOMPRESS_TAIL_PAD;
+    ds->payload = (uint8_t*)ZXC_MALLOC(ds->payload_cap);
+    ds->decoded = (uint8_t*)ZXC_MALLOC(ds->decoded_cap);
+    // LCOV_EXCL_START
+    if (UNLIKELY(!ds->decoded || !ds->payload)) return ds_set_error(ds, ZXC_ERROR_MEMORY);
+
+    const int want_checksum = ds->file_has_checksum && ds->opts.checksum_enabled;
+    const int init_rc = zxc_cctx_init(&ds->inner, block_size, 0, 0, want_checksum, 0);
+    if (UNLIKELY(init_rc != ZXC_OK)) return ds_set_error(ds, ZXC_ERROR_MEMORY);
+    // LCOV_EXCL_STOP
+    ds->inner_initialized = 1;
+
+    /* Next frame to collect: the first block header. */
+    ds->scratch_need = ZXC_BLOCK_HEADER_SIZE;
+    ds->scratch_used = 0;
+    ds->state = DS_NEED_BLOCK_HEADER;
+    return 0;
+}
+
+/**
+ * @brief Handles the @c DS_NEED_BLOCK_HEADER state.
+ *
+ * Accumulates one block header in @c scratch and parses it into @c cur_bh.
+ * An EOF block (which must announce @c comp_size 0) moves the parser to
+ * @c DS_PEEK_TAIL with @c scratch re-armed for another header-sized read.
+ * For any other block the body size (@c comp_size plus the checksum trailer
+ * when the file carries checksums) is checked against @c payload_cap, the raw
+ * header bytes are copied to the front of @c payload, and the parser moves
+ * to @c DS_NEED_BLOCK_PAYLOAD.
+ *
+ * @param[in,out] ds Decompression stream.
+ * @param[in,out] in Caller input buffer.
+ * @return @c 1 if more input is needed, @c 0 to continue the outer loop,
+ *         negative @ref zxc_error_t on validation/allocation failure.
+ */
+static int ds_handle_need_block_header(zxc_dstream* ds, zxc_inbuf_t* in) {
+    if (!ds_pull_scratch(ds, in)) return 1;
+
+    const int hdr_rc = zxc_read_block_header(ds->scratch, ds->scratch_used, &ds->cur_bh);
+    if (UNLIKELY(hdr_rc != ZXC_OK)) return ds_set_error(ds, hdr_rc);  // LCOV_EXCL_LINE
+
+    const uint64_t comp_size = (uint64_t)ds->cur_bh.comp_size;
+    if (ds->cur_bh.block_type == (uint8_t)ZXC_BLOCK_EOF) {
+        /* An EOF block is header-only: any announced payload is invalid. */
+        if (UNLIKELY(comp_size != 0)) return ds_set_error(ds, ZXC_ERROR_BAD_BLOCK_SIZE);
+        ds->scratch_need = ZXC_BLOCK_HEADER_SIZE; /* sniff */
+        ds->scratch_used = 0;
+        ds->state = DS_PEEK_TAIL;
+        return 0;
+    }
+
+    /* Data block body: comp_size bytes, plus a trailer when the file has checksums. */
+    uint64_t body = comp_size;
+    if (ds->file_has_checksum) body += (uint64_t)ZXC_BLOCK_CHECKSUM_SIZE;
+    if (UNLIKELY(body > ds->payload_cap)) return ds_set_error(ds, ZXC_ERROR_BAD_BLOCK_SIZE);
+
+    /* The decoder is fed header + body as one frame, so the raw header goes first. */
+    ZXC_MEMCPY(ds->payload, ds->scratch, ZXC_BLOCK_HEADER_SIZE);
+    ds->payload_need = (size_t)body + ZXC_BLOCK_HEADER_SIZE;
+    ds->payload_used = ZXC_BLOCK_HEADER_SIZE;
+    // LCOV_EXCL_START
+    if (UNLIKELY(ds->payload_cap < ds->payload_need)) {
+        /* grow */
+        uint8_t* const grown = (uint8_t*)ZXC_REALLOC(ds->payload, ds->payload_need);
+        if (UNLIKELY(grown == NULL)) return ds_set_error(ds, ZXC_ERROR_MEMORY);
+        ds->payload_cap = ds->payload_need;
+        ds->payload = grown;
+    }
+    // LCOV_EXCL_STOP
+    ds->state = DS_NEED_BLOCK_PAYLOAD;
+    return 0;
+}
+
+/**
+ * @brief Push-side entry point: feeds compressed input and drains decoded output.
+ *
+ * Drives the @ref dstream_state_t machine: file header, then per-block
+ * (header + payload + optional checksum) -> decode -> emit, repeated until
+ * the EOF block, optional SEK index block, and file footer have been parsed
+ * and validated.  Each call runs until it needs more input, @p out is full,
+ * the stream is DONE, or an error occurs, and can be resumed afterwards.
+ *
+ * @par End of stream
+ * Once @c DS_VALIDATE_FOOTER succeeds, the stream is in @c DS_DONE; further
+ * calls return @c 0 without consuming input.
+ *
+ * @par Errors
+ * Stream errors are latched (sticky): later calls return the same code until
+ * @ref zxc_dstream_free.  Argument validation failures return
+ * @ref ZXC_ERROR_NULL_INPUT without latching.
+ *
+ * @param[in,out] ds  Decompression stream.
+ * @param[in,out] out Caller output buffer.
+ * @param[in,out] in  Caller input buffer.
+ * @return @c >0 number of decompressed bytes written into @p out this call,
+ *         @c 0 nothing was written (input may still have been consumed, or
+ *         the stream is DONE), @c <0 a @ref zxc_error_t code.
+ */
+int64_t zxc_dstream_decompress(zxc_dstream* ds, zxc_outbuf_t* out, zxc_inbuf_t* in) {
+    if (UNLIKELY(!ds || !out || !in || in->pos > in->size || out->pos > out->size ||
+                 (in->size > in->pos && !in->src) || (out->size > out->pos && !out->dst))) {
+        return ZXC_ERROR_NULL_INPUT; /* not latched: the stream state is untouched */
+    }
+    if (UNLIKELY(ds->state == DS_ERRORED)) return ds->error_code;
+    if (UNLIKELY(ds->state == DS_DONE)) return 0;
+
+    size_t produced = 0; /* bytes written to out during this call */
+
+    for (;;) {
+        switch (ds->state) {
+            case DS_NEED_FILE_HEADER: {
+                const int rc = ds_handle_need_file_header(ds, in);
+                if (rc == 1) return (int64_t)produced; /* header incomplete: need input */
+                if (rc < 0) return rc; /* already latched by the handler */
+                break;
+            }
+
+            case DS_NEED_BLOCK_HEADER: {
+                const int rc = ds_handle_need_block_header(ds, in);
+                if (rc == 1) return (int64_t)produced; /* header incomplete: need input */
+                if (rc < 0) return rc; /* already latched by the handler */
+                break;
+            }
+
+            case DS_NEED_BLOCK_PAYLOAD: {
+                if (!ds_pull_payload(ds, in)) return (int64_t)produced;
+                ds->state = DS_DECODE_BLOCK;
+                break;
+            }
+
+            case DS_DECODE_BLOCK: {
+                const int dsz = zxc_decompress_chunk_wrapper(
+                    &ds->inner, ds->payload, ds->payload_used, ds->decoded, ds->decoded_cap);
+                if (UNLIKELY(dsz < 0)) return ds_set_error(ds, dsz);
+                ds->decoded_size = (size_t)dsz;
+                ds->decoded_pos = 0;
+
+                /* If file-level checksum verification is enabled, fold this
+                 * block's trailer into the rolling global hash (last
+                 * ZXC_BLOCK_CHECKSUM_SIZE bytes of the *raw* block). */
+                if (ds->opts.checksum_enabled && ds->file_has_checksum &&
+                    ds->payload_used >= ZXC_BLOCK_CHECKSUM_SIZE) {
+                    const uint32_t bh =
+                        zxc_le32(ds->payload + ds->payload_used - ZXC_BLOCK_CHECKSUM_SIZE);
+                    ds->global_hash = zxc_hash_combine_rotate(ds->global_hash, bh);
+                }
+                ds->state = DS_EMIT_DECODED;
+                break;
+            }
+
+            case DS_EMIT_DECODED: {
+                const int done = ds_drain_decoded(ds, out, &produced);
+                if (!done) return (int64_t)produced; /* out is full; resume here next call */
+                ds->state = DS_NEED_BLOCK_HEADER;
+                ds->scratch_used = 0;
+                ds->scratch_need = ZXC_BLOCK_HEADER_SIZE;
+                break;
+            }
+
+            case DS_PEEK_TAIL: {
+                if (!ds_pull_scratch(ds, in)) return (int64_t)produced;
+                /* Try to interpret as a block header (SEK). */
+                zxc_block_header_t peek;
+                const int sek_rc = zxc_read_block_header(ds->scratch, ds->scratch_used, &peek);
+                if (sek_rc == ZXC_OK && peek.block_type == (uint8_t)ZXC_BLOCK_SEK) {
+                    /* SEK block: skip its payload (peek.comp_size bytes). */
+                    ds->sek_remaining = (size_t)peek.comp_size;
+                    ds->state = DS_DRAIN_SEK_PAYLOAD;
+                    break;
+                }
+                /* Not SEK -> these 8 bytes are the first 8 of the 12-byte footer. */
+                ds->state = DS_NEED_FOOTER_REST;
+                ds->scratch_need = ZXC_FILE_FOOTER_SIZE; /* keep first 8, want 4 more */
+                break;
+            }
+
+            case DS_DRAIN_SEK_PAYLOAD: {
+                const size_t avail = in->size - in->pos;
+                const size_t n = avail < ds->sek_remaining ? avail : ds->sek_remaining;
+                in->pos += n; /* index bytes are skipped, never stored */
+                ds->sek_remaining -= n;
+                if (ds->sek_remaining > 0) return (int64_t)produced;
+                ds->state = DS_NEED_FOOTER_FULL;
+                ds->scratch_used = 0;
+                ds->scratch_need = ZXC_FILE_FOOTER_SIZE;
+                break;
+            }
+
+            case DS_NEED_FOOTER_REST: /* scratch already holds the peeked bytes */
+            case DS_NEED_FOOTER_FULL: {
+                if (!ds_pull_scratch(ds, in)) return (int64_t)produced;
+                ds->state = DS_VALIDATE_FOOTER;
+                break;
+            }
+
+            case DS_VALIDATE_FOOTER: {
+                const uint64_t declared = zxc_le64(ds->scratch); /* footer starts with the size */
+                if (UNLIKELY(declared != ds->total_out))
+                    return ds_set_error(ds, ZXC_ERROR_CORRUPT_DATA);
+                if (ds->opts.checksum_enabled && ds->file_has_checksum) {
+                    const uint32_t fh = zxc_le32(ds->scratch + sizeof(uint64_t)); /* after size */
+                    if (UNLIKELY(fh != ds->global_hash))
+                        return ds_set_error(ds, ZXC_ERROR_BAD_CHECKSUM);
+                }
+                ds->state = DS_DONE;
+                return (int64_t)produced;
+            }
+
+            case DS_DONE:
+            case DS_ERRORED:
+                return ds->state == DS_ERRORED ? ds->error_code : (int64_t)produced;
+        }
+    }
+}
diff --git a/thirdparty/zxc/src/lib/zxc_seekable.c b/thirdparty/zxc/src/lib/zxc_seekable.c
new file mode 100644
index 000000000000..5c7e7b0c8bd4
--- /dev/null
+++ b/thirdparty/zxc/src/lib/zxc_seekable.c
@@ -0,0 +1,1143 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_seekable.c
+ * @brief Seekable archive reader (random-access decompression) and seek table writer.
+ *
+ * The seek table is a standard ZXC block (type = ZXC_BLOCK_SEK) appended
+ * between the EOF block and the file footer.  It records the compressed size
+ * of every block (decompressed sizes are derived from the header's block_size),
+ * enabling O(1) lookup + O(block_size) decompression for any byte range.
+ *
+ * On-disk layout of a SEK block:
+ *
+ *   [Block Header (8B)]   block_type=SEK, block_flags=0, comp_size=N*4
+ *   [N x Entry (4B)]      comp_size(u32 LE) per block
+ *
+ * Detection from end of file:
+ *   1. Read file header (first 16 bytes) => block_size
+ *   2. Read file footer (last 12 bytes) => total_decompressed_size
+ *   3. Derive num_blocks = ceil(total_decomp / block_size)
+ *   4. Compute seek block size, read backward to the block header
+ *   5. Validate block_type == ZXC_BLOCK_SEK
+ */
+
+#include "../../include/zxc_seekable.h"
+
+#include "../../include/zxc_dict.h"
+#include "../../include/zxc_error.h"
+#include "zxc_internal.h"
+
+/* ========================================================================= */
+/*  Platform Threading Layer                                                 */
+/* ========================================================================= */
+
+// LCOV_EXCL_START - Windows platform layer, not reachable on POSIX CI
+#if defined(_WIN32)
+#include <process.h> /* _beginthreadex */
+#include <windows.h>
+
+/* Map POSIX threading primitives to Windows equivalents */
+typedef HANDLE zxc_thread_t;
+
+/**
+ * @brief Trampoline payload bridging the POSIX-style @c void*(*)(void*) worker
+ *        signature to the Win32 @c _beginthreadex entry point.
+ *
+ * Heap-allocated by @ref zxc_seek_thread_create and freed by
+ * @ref zxc_seek_thread_entry right before it invokes the captured callback.
+ */
+typedef struct {
+    void* (*func)(void*); /* worker to invoke */
+    void* arg;            /* argument forwarded to @c func */
+} zxc_seek_thread_arg_t;
+
+/**
+ * @brief @c _beginthreadex entry point: unpacks the trampoline payload, frees
+ *        it, then runs the captured POSIX-style worker.
+ *
+ * @param[in] p  Heap @ref zxc_seek_thread_arg_t handed over by the creator;
+ *               ownership transfers to this function.
+ * @return Always 0 (the worker's @c void* result is discarded, matching the
+ *         POSIX path which also ignores it).
+ */
+static unsigned __stdcall zxc_seek_thread_entry(void* p) {
+    /* Copy the payload out first: the heap block is released before the
+     * worker runs, so nothing may touch it afterwards. */
+    const zxc_seek_thread_arg_t payload = *(const zxc_seek_thread_arg_t*)p;
+    ZXC_FREE(p);
+    (void)payload.func(payload.arg);
+    return 0;
+}
+
+/**
+ * @brief Spawns a thread running @p fn(@p arg), abstracting @c _beginthreadex.
+ *
+ * Allocates a @ref zxc_seek_thread_arg_t trampoline so the Win32 entry-point
+ * signature can carry a POSIX-style worker; the trampoline is freed by the
+ * thread itself (or here, on a launch failure).
+ *
+ * @param[out] t    Receives the created thread handle on success.
+ * @param[in]  fn   Worker to run on the new thread.
+ * @param[in]  arg  Opaque argument forwarded to @p fn.
+ * @return 0 on success, @ref ZXC_ERROR_MEMORY on allocation or spawn failure.
+ */
+static int zxc_seek_thread_create(zxc_thread_t* t, void* (*fn)(void*), void* arg) {
+    zxc_seek_thread_arg_t* const payload = ZXC_MALLOC(sizeof(*payload));
+    if (UNLIKELY(payload == NULL)) return ZXC_ERROR_MEMORY;
+    payload->func = fn;
+    payload->arg = arg;
+    const uintptr_t raw = _beginthreadex(NULL, 0, zxc_seek_thread_entry, payload, 0, NULL);
+    if (raw != 0) {
+        *t = (HANDLE)raw; /* the new thread now owns (and frees) the payload */
+        return 0;
+    }
+    ZXC_FREE(payload); /* launch failed: the payload never left this function */
+    return ZXC_ERROR_MEMORY;
+}
+
+/**
+ * @brief Blocks until thread @p t finishes, then releases its handle.
+ * @param[in] t  Handle from a successful @ref zxc_seek_thread_create.
+ */
+static void zxc_seek_thread_join(zxc_thread_t t) {
+    (void)WaitForSingleObject(t, INFINITE); /* block until the worker exits */
+    (void)CloseHandle(t);                   /* then release the thread handle */
+}
+
+/**
+ * @brief Returns the number of logical processors reported by the OS.
+ * @return Online processor count (always >= 1 in practice).
+ */
+static int zxc_seek_get_num_procs(void) {
+    SYSTEM_INFO sys_info;
+    GetSystemInfo(&sys_info);
+    return (int)sys_info.dwNumberOfProcessors; /* logical processor count */
+}
+// LCOV_EXCL_STOP
+
+#else /* POSIX */
+#include <pthread.h>
+#include <unistd.h>
+
+typedef pthread_t zxc_thread_t;
+
+/**
+ * @brief Spawns a thread running @p fn(@p arg) via @c pthread_create.
+ * @param[out] t    Receives the created thread handle on success.
+ * @param[in]  fn   Worker to run on the new thread.
+ * @param[in]  arg  Opaque argument forwarded to @p fn.
+ * @return 0 on success, @ref ZXC_ERROR_MEMORY if the thread cannot be created.
+ */
+static int zxc_seek_thread_create(zxc_thread_t* t, void* (*fn)(void*), void* arg) {
+    return (pthread_create(t, NULL, fn, arg) != 0) ? ZXC_ERROR_MEMORY : 0;
+}
+
+/**
+ * @brief Blocks until thread @p t finishes (its @c void* result is discarded).
+ * @param[in] t  Handle from a successful @ref zxc_seek_thread_create.
+ */
+static void zxc_seek_thread_join(zxc_thread_t t) { (void)pthread_join(t, NULL); }
+
+/**
+ * @brief Returns the number of online logical processors.
+ * @return @c _SC_NPROCESSORS_ONLN, clamped to a minimum of 1 if the query fails.
+ */
+static int zxc_seek_get_num_procs(void) {
+    const long online = sysconf(_SC_NPROCESSORS_ONLN);
+    return (online < 1) ? 1 : (int)online; /* failed query => assume one core */
+}
+
+#endif /* _WIN32 */
+
+/* ========================================================================= */
+/*  Seek Table Writer                                                        */
+/* ========================================================================= */
+
+/**
+ * @brief Byte size of a seek table holding @p num_blocks entries.
+ *
+ * Public API (declared in @c zxc_seekable.h): one block header plus
+ * @p num_blocks fixed-size entries. Use it to size the destination buffer
+ * before @ref zxc_write_seek_table.
+ *
+ * @param[in] num_blocks  Number of block entries the table will hold.
+ * @return Total table size in bytes (block header + entries).
+ */
+size_t zxc_seek_table_size(const uint32_t num_blocks) {
+    return (size_t)ZXC_SEEK_ENTRY_SIZE * num_blocks + ZXC_BLOCK_HEADER_SIZE;
+}
+
+/**
+ * @brief Serialises a seek table (a @c ZXC_BLOCK_SEK block) into @p dst.
+ *
+ * Public API; full contract in @c zxc_seekable.h. Emits the standard ZXC block
+ * header followed by one little-endian @c u32 compressed-size entry per block.
+ *
+ * @param[out] dst           Destination buffer.
+ * @param[in]  dst_capacity  Capacity of @p dst in bytes.
+ * @param[in]  comp_sizes    Array of @p num_blocks compressed block sizes.
+ * @param[in]  num_blocks    Number of blocks (and entries) to write.
+ * @return Bytes written on success, or a negative @ref zxc_error_t
+ *         (@ref ZXC_ERROR_OVERFLOW, @ref ZXC_ERROR_DST_TOO_SMALL,
+ *         @ref ZXC_ERROR_NULL_INPUT).
+ */
+int64_t zxc_write_seek_table(uint8_t* dst, const size_t dst_capacity, const uint32_t* comp_sizes,
+                             const uint32_t num_blocks) {
+    /* The payload length is carried as a u32; refuse tables that cannot fit. */
+    if (UNLIKELY(num_blocks > UINT32_MAX / ZXC_SEEK_ENTRY_SIZE)) return ZXC_ERROR_OVERFLOW;
+    if (UNLIKELY(dst_capacity < zxc_seek_table_size(num_blocks))) return ZXC_ERROR_DST_TOO_SMALL;
+    if (UNLIKELY(dst == NULL || comp_sizes == NULL)) return ZXC_ERROR_NULL_INPUT;
+
+    /* Standard ZXC block header announcing a SEK payload of num_blocks entries. */
+    const zxc_block_header_t sek_hdr = {
+        .block_type = ZXC_BLOCK_SEK,
+        .block_flags = 0,
+        .reserved = 0,
+        .comp_size = num_blocks * ZXC_SEEK_ENTRY_SIZE,
+    };
+    const int hdr_len = zxc_write_block_header(dst, dst_capacity, &sek_hdr);
+    if (UNLIKELY(hdr_len < 0)) return hdr_len;
+
+    /* Entries follow the header back to back: one little-endian u32 per block. */
+    uint8_t* const entries = dst + hdr_len;
+    for (uint32_t i = 0; i < num_blocks; i++) {
+        zxc_store_le32(entries + (size_t)i * sizeof(uint32_t), comp_sizes[i]);
+    }
+
+    return (int64_t)hdr_len + (int64_t)num_blocks * (int64_t)sizeof(uint32_t);
+}
+
+/* ========================================================================= */
+/*  Seekable Reader (Opaque Handle)                                          */
+/* ========================================================================= */
+
+struct zxc_seekable_s {
+    /* Source - exactly one of {src, reader.read_at} is set.  The FILE*
+     * variant (see zxc_seekable_file.c) routes through the reader callback
+     * by wrapping pread() in its own ctx; from this struct's perspective it
+     * is indistinguishable from any other caller-supplied reader. */
+    const uint8_t* src; /* whole archive in buffer mode; NULL in reader mode */
+    uint64_t src_size;  /* archive size in bytes (buffer length, or reader.size) */
+    zxc_reader_t reader; /* user-supplied callback reader; read_at == NULL when unused */
+
+    /* Heap-allocated reader context owned by the seekable handle, freed in
+     * zxc_seekable_free.  Set by thin wrappers (e.g. zxc_seekable_open_file)
+     * via zxc_seekable_attach_owned_ctx.  NULL when the caller manages
+     * reader.ctx lifetime themselves. */
+    void* owned_reader_ctx;
+
+    /* Parsed seek table */
+    uint32_t num_blocks;
+    uint32_t* comp_sizes;   /* array[num_blocks] */
+    uint64_t* comp_offsets; /* array[num_blocks + 1] of file offsets (prefix sums) */
+    uint64_t total_decomp;  /* total decompressed size (from footer) */
+
+    /* File header info - block_size is always a power of 2 in [4KB, 2MB],
+     * fits in 21 bits. */
+    uint32_t block_size;
+    int file_has_checksums; /* checksum flag reported by zxc_read_file_header */
+    uint32_t expected_dict_id; /* dict_id from the file header; 0 = no dictionary */
+
+    /* Reusable decompression context (single-threaded path only) */
+    zxc_cctx_t dctx;
+    int dctx_initialized; /* set to 1 once dctx has been lazily initialised */
+
+    /* Dictionary (owned copy, freed in zxc_seekable_free). */
+    uint8_t* dict;
+    size_t dict_size;
+    /* Shared literal Huffman table (owned copy; meaningful when has_dict_huf). */
+    uint8_t dict_huf[ZXC_HUF_TABLE_SIZE];
+    int has_dict_huf;
+};
+
+/**
+ * @brief Parses the seek table from raw bytes at the end of the archive.
+ *
+ * Detection (backward from end):
+ *   1. Read file header => block_size
+ *   2. Read file footer => total_decomp_size
+ *   3. Derive num_blocks = ceil(total_decomp_size / block_size)
+ *   4. Compute expected seek block position, validate block_type == SEK
+ *   5. Read comp_sizes; derive decomp_sizes from block_size
+ *
+ * @param[in] data       Pointer to the whole in-memory archive.
+ * @param[in] data_size  Archive size in bytes.
+ * @return A newly allocated handle (free via @ref zxc_seekable_free), or NULL
+ *         if the buffer is too small or the seek table is missing / malformed.
+ */
+static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_size) {
+    /* Minimum: file_header(16) + eof_block(8) + seek_block_header(8) + file_footer(12) = 44 */
+    const size_t MIN_SEEKABLE_SIZE =
+        ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE;
+    if (UNLIKELY(data_size < MIN_SEEKABLE_SIZE)) return NULL;
+
+    /* Step 1: validate file header => block_size */
+    size_t block_size_sz = 0;
+    int file_has_chk = 0;
+    uint32_t header_dict_id = 0;
+    if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk,
+                                      &header_dict_id) != ZXC_OK))
+        return NULL;  // LCOV_EXCL_LINE
+    const uint32_t block_size = (uint32_t)block_size_sz;
+    if (UNLIKELY(block_size == 0)) return NULL;  // LCOV_EXCL_LINE
+
+    /* Step 2: read total decompressed size from the file footer */
+    const uint8_t* const footer_ptr = data + data_size - ZXC_FILE_FOOTER_SIZE;
+    const uint64_t total_decomp = zxc_le64(footer_ptr);
+
+    /* A value of 0 means empty file - no seek table */
+    if (UNLIKELY(total_decomp == 0)) return NULL;
+
+    /* Step 3: derive num_blocks = ceil(total_decomp / block_size) via div + mod: the footer is
+     * untrusted and "total_decomp + block_size - 1" wraps for huge values (too few blocks). */
+    const uint64_t num_blocks_64 = total_decomp / block_size + (total_decomp % block_size != 0);
+    /* Same cap as zxc_write_seek_table: the entry bytes must fit the u32 comp_size compare. */
+    if (UNLIKELY(num_blocks_64 > UINT32_MAX / ZXC_SEEK_ENTRY_SIZE)) return NULL;
+    const uint32_t num_blocks = (uint32_t)num_blocks_64;
+
+    /* Step 4: compute seek block position and validate (subtraction form cannot wrap). */
+    const uint64_t entries_total_64 = num_blocks_64 * ZXC_SEEK_ENTRY_SIZE;
+    if (UNLIKELY(entries_total_64 > SIZE_MAX - ZXC_BLOCK_HEADER_SIZE)) return NULL;
+    const size_t entries_total = (size_t)entries_total_64;
+    const size_t seek_block_total = ZXC_BLOCK_HEADER_SIZE + entries_total;
+    if (UNLIKELY(seek_block_total > data_size - ZXC_FILE_FOOTER_SIZE)) return NULL;
+    const uint8_t* const seek_block_start =
+        data + data_size - ZXC_FILE_FOOTER_SIZE - seek_block_total;
+
+    /* Read and validate SEK block header */
+    zxc_block_header_t bh;
+    if (UNLIKELY(zxc_read_block_header(seek_block_start, seek_block_total, &bh) != ZXC_OK))
+        return NULL;
+    if (UNLIKELY(bh.block_type != ZXC_BLOCK_SEK)) return NULL;
+    if (UNLIKELY(bh.comp_size != (uint32_t)entries_total)) return NULL;
+
+    /* Step 5: allocate handle and parse entries */
+    zxc_seekable* const s = (zxc_seekable*)ZXC_CALLOC(1, sizeof(zxc_seekable));
+    // LCOV_EXCL_START
+    if (UNLIKELY(!s)) return NULL;
+    // LCOV_EXCL_STOP
+
+    s->num_blocks = num_blocks;
+    s->block_size = block_size;
+    s->file_has_checksums = file_has_chk;
+    s->expected_dict_id = header_dict_id;
+    s->src = data;
+    s->src_size = (uint64_t)data_size;
+
+    /* Allocate arrays */
+    s->comp_sizes = (uint32_t*)ZXC_CALLOC(num_blocks, sizeof(uint32_t));
+    s->comp_offsets = (uint64_t*)ZXC_CALLOC((size_t)num_blocks + 1, sizeof(uint64_t));
+    // LCOV_EXCL_START
+    if (UNLIKELY(!s->comp_sizes || !s->comp_offsets)) {
+        zxc_seekable_free(s);
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+    s->total_decomp = total_decomp;
+
+    /* Parse comp_sizes and build compressed prefix sums.
+     * Validate each comp_size against data_size to prevent prefix-sum overflow
+     * and out-of-bounds reads during decompression. */
+    const uint8_t* ep = seek_block_start + ZXC_BLOCK_HEADER_SIZE;
+    uint64_t comp_acc = ZXC_FILE_HEADER_SIZE; /* blocks start after file header */
+    for (uint32_t i = 0; i < num_blocks; i++) {
+        s->comp_sizes[i] = zxc_le32(ep);
+        ep += sizeof(uint32_t);
+
+        /* Reject entries below minimum (block header) or larger than the file */
+        if (UNLIKELY(s->comp_sizes[i] < ZXC_BLOCK_HEADER_SIZE ||
+                     s->comp_sizes[i] > (uint64_t)data_size)) {
+            zxc_seekable_free(s);
+            return NULL;
+        }
+        s->comp_offsets[i] = comp_acc;
+        comp_acc += s->comp_sizes[i];
+        /* Reject if cumulative offset exceeds file size (inconsistent table) */
+        if (UNLIKELY(comp_acc > (uint64_t)data_size)) {
+            zxc_seekable_free(s);
+            return NULL;
+        }
+    }
+    s->comp_offsets[num_blocks] = comp_acc;
+
+    /* Verify prefix-sum lands exactly at the EOF block position.
+     * Expected layout: [header 16][data blocks][EOF 8][SEK block][footer 12]
+     * So comp_acc (end of data blocks) + EOF(8) == seek_block_start. */
+    const uint64_t expected_eof_offset =
+        (uint64_t)(seek_block_start - data) - ZXC_BLOCK_HEADER_SIZE;
+    if (UNLIKELY(comp_acc != expected_eof_offset)) {
+        zxc_seekable_free(s);
+        return NULL;
+    }
+
+    /* Validate that an actual EOF block header exists at the computed offset */
+    if (UNLIKELY(comp_acc + ZXC_BLOCK_HEADER_SIZE > data_size)) {
+        zxc_seekable_free(s);
+        return NULL;
+    }
+    zxc_block_header_t eof_bh;
+    if (UNLIKELY(zxc_read_block_header(data + comp_acc, ZXC_BLOCK_HEADER_SIZE, &eof_bh) != ZXC_OK ||
+                 eof_bh.block_type != ZXC_BLOCK_EOF)) {
+        zxc_seekable_free(s);
+        return NULL;
+    }
+
+    return s;
+}
+
+/**
+ * @brief Opens a seekable archive held entirely in a memory buffer.
+ *
+ * Public API; see @c zxc_seekable.h. Thin guard around
+ * @ref zxc_seekable_parse, which detects and validates the trailing seek table.
+ *
+ * @param[in] src       Pointer to the whole compressed archive.
+ * @param[in] src_size  Archive size in bytes.
+ * @return A handle to release with @ref zxc_seekable_free, or NULL on bad input
+ *         or a missing / malformed seek table.
+ */
+zxc_seekable* zxc_seekable_open(const void* src, const size_t src_size) {
+    const int usable = (src != NULL) && (src_size != 0); /* the parser validates the rest */
+    return (UNLIKELY(!usable)) ? NULL : zxc_seekable_parse((const uint8_t*)src, src_size);
+}
+
+/* zxc_seekable_open_file (FILE* variant) lives in zxc_seekable_file.c.  It
+ * builds a zxc_reader_t over pread() and delegates to
+ * zxc_seekable_open_reader below, keeping this translation unit free of any
+ * <stdio.h> dependency. */
+
+/**
+ * @brief Opens a seekable archive over a caller-supplied random-access reader.
+ *
+ * Public API; see @c zxc_seekable.h. Reads the file header, footer and seek
+ * block through @p r->read_at (the FILE* variant wraps @c pread this way),
+ * validates the SEK block, and builds the per-block compressed-offset prefix
+ * sums. Unlike @ref zxc_seekable_open the archive is never mapped whole; only
+ * the metadata is read up front.
+ *
+ * @param[in] r  Reader descriptor (@c read_at and @c size must be set).
+ * @return A handle to release with @ref zxc_seekable_free, or NULL on bad input,
+ *         a short read, or a malformed seek table.
+ */
+zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) {
+    if (UNLIKELY(!r || !r->read_at || r->size == 0)) return NULL;
+
+    /* Minimum: file_header(16) + eof_block(8) + seek_block_header(8)
+     *          + file_footer(12) = 44 */
+    const uint64_t MIN_SEEKABLE_SIZE =
+        ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + ZXC_FILE_FOOTER_SIZE;
+    if (UNLIKELY(r->size < MIN_SEEKABLE_SIZE)) return NULL;
+
+    /* Read file header => block_size */
+    uint8_t header[ZXC_FILE_HEADER_SIZE];
+    if (UNLIKELY(r->read_at(r->ctx, header, ZXC_FILE_HEADER_SIZE, 0) !=
+                 (int64_t)ZXC_FILE_HEADER_SIZE))
+        return NULL;
+
+    size_t bs_sz = 0;
+    int fhc = 0;
+    uint32_t header_dict_id = 0;
+    if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc,
+                                      &header_dict_id) != ZXC_OK))
+        return NULL;  // LCOV_EXCL_LINE
+    const uint32_t bs = (uint32_t)bs_sz;
+    if (UNLIKELY(bs == 0)) return NULL;
+
+    /* Read footer => total_decomp_size */
+    uint8_t footer_buf[ZXC_FILE_FOOTER_SIZE];
+    if (UNLIKELY(r->read_at(r->ctx, footer_buf, ZXC_FILE_FOOTER_SIZE,
+                            r->size - ZXC_FILE_FOOTER_SIZE) != (int64_t)ZXC_FILE_FOOTER_SIZE))
+        return NULL;
+
+    const uint64_t total_decomp = zxc_le64(footer_buf);
+    if (UNLIKELY(total_decomp == 0)) return NULL;
+
+    /* Derive num_blocks = ceil(total_decomp / block_size) via div + mod: the footer is
+     * untrusted and "total_decomp + bs - 1" wraps for huge values (too few blocks). */
+    const uint64_t num_blocks_64 = total_decomp / bs + (total_decomp % bs != 0);
+    /* Same cap as zxc_write_seek_table: the entry bytes must fit the u32 comp_size compare. */
+    if (UNLIKELY(num_blocks_64 > UINT32_MAX / ZXC_SEEK_ENTRY_SIZE)) return NULL;
+    const uint32_t num_blocks = (uint32_t)num_blocks_64;
+
+    /* Guard against size_t multiplication overflow */
+    const uint64_t entries_total_64 = (uint64_t)num_blocks * ZXC_SEEK_ENTRY_SIZE;
+    if (UNLIKELY(entries_total_64 > SIZE_MAX - ZXC_BLOCK_HEADER_SIZE)) return NULL;
+
+    /* Read the full seek block (bound check in subtraction form so it cannot wrap) */
+    const size_t seek_block_total = ZXC_BLOCK_HEADER_SIZE + (size_t)entries_total_64;
+    if (UNLIKELY((uint64_t)seek_block_total > r->size - ZXC_FILE_FOOTER_SIZE)) return NULL;
+
+    uint8_t* const seek_buf = (uint8_t*)ZXC_MALLOC(seek_block_total);
+    if (UNLIKELY(!seek_buf)) return NULL;
+
+    const uint64_t seek_offset = r->size - ZXC_FILE_FOOTER_SIZE - (uint64_t)seek_block_total;
+    if (UNLIKELY(r->read_at(r->ctx, seek_buf, seek_block_total, seek_offset) !=
+                 (int64_t)seek_block_total)) {
+        // LCOV_EXCL_START
+        ZXC_FREE(seek_buf);
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+
+    /* Validate SEK block header */
+    zxc_block_header_t bh;
+    if (UNLIKELY(zxc_read_block_header(seek_buf, seek_block_total, &bh) != ZXC_OK) ||
+        bh.block_type != ZXC_BLOCK_SEK || bh.comp_size != (uint32_t)entries_total_64) {
+        // LCOV_EXCL_START
+        ZXC_FREE(seek_buf);
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+
+    /* Build seekable handle */
+    zxc_seekable* const s = (zxc_seekable*)ZXC_CALLOC(1, sizeof(zxc_seekable));
+    if (UNLIKELY(!s)) {
+        ZXC_FREE(seek_buf);
+        return NULL;
+    }
+
+    s->reader = *r;
+    s->src = NULL;
+    s->src_size = r->size;
+    s->num_blocks = num_blocks;
+    s->block_size = bs;
+    s->file_has_checksums = fhc;
+    s->expected_dict_id = header_dict_id;
+
+    s->comp_sizes = (uint32_t*)ZXC_CALLOC(num_blocks, sizeof(uint32_t));
+    s->comp_offsets = (uint64_t*)ZXC_CALLOC((size_t)num_blocks + 1, sizeof(uint64_t));
+    if (UNLIKELY(!s->comp_sizes || !s->comp_offsets)) {
+        // LCOV_EXCL_START
+        ZXC_FREE(seek_buf);
+        zxc_seekable_free(s);
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+    s->total_decomp = total_decomp;
+
+    /* Parse comp_sizes and build prefix sums.  Every entry must cover at least a block
+     * header, and neither it nor the running offset may exceed the archive size. */
+    const uint8_t* ep = seek_buf + ZXC_BLOCK_HEADER_SIZE;
+    uint64_t comp_acc = ZXC_FILE_HEADER_SIZE;
+    int table_ok = 1;
+    for (uint32_t i = 0; table_ok && i < num_blocks; i++) {
+        const uint32_t csz = zxc_le32(ep);
+        ep += sizeof(uint32_t);
+        s->comp_sizes[i] = csz;
+        s->comp_offsets[i] = comp_acc;
+        comp_acc += csz;
+        table_ok = csz >= ZXC_BLOCK_HEADER_SIZE && csz <= r->size && comp_acc <= r->size;
+    }
+    ZXC_FREE(seek_buf);
+
+    /* Same layout rule as zxc_seekable_parse: [header][data blocks][EOF 8][SEK][footer],
+     * so the data blocks must end exactly one (EOF) block header before the SEK block. */
+    if (UNLIKELY(!table_ok || comp_acc + ZXC_BLOCK_HEADER_SIZE != seek_offset)) {
+        // LCOV_EXCL_START
+        zxc_seekable_free(s);
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+    s->comp_offsets[num_blocks] = comp_acc;
+
+    return s;
+}
+
+/**
+ * @brief Number of blocks in the archive.
+ * @param[in] s  Seekable handle (may be NULL).
+ * @return Block count, or 0 if @p s is NULL.
+ */
+uint32_t zxc_seekable_get_num_blocks(const zxc_seekable* s) { return !s ? 0 : s->num_blocks; }
+
+/**
+ * @brief Total decompressed size of the archive.
+ * @param[in] s  Seekable handle (may be NULL).
+ * @return Decompressed size in bytes, or 0 if @p s is NULL.
+ */
+uint64_t zxc_seekable_get_decompressed_size(const zxc_seekable* s) {
+    return (s == NULL) ? 0 : s->total_decomp; /* a NULL handle reports size 0 */
+}
+
+/**
+ * @brief Compressed byte size of a given block.
+ * @param[in] s          Seekable handle (may be NULL).
+ * @param[in] block_idx  Zero-based block index.
+ * @return Compressed size in bytes, or 0 if @p s is NULL or @p block_idx is
+ *         out of range.
+ */
+uint32_t zxc_seekable_get_block_comp_size(const zxc_seekable* s, const uint32_t block_idx) {
+    if (UNLIKELY(s == NULL)) return 0;
+    return (block_idx < s->num_blocks) ? s->comp_sizes[block_idx] : 0;
+}
+
+/**
+ * @brief Decompressed byte size of a given block.
+ *
+ * Every block decompresses to @c block_size except the last, which holds the
+ * remainder of @c total_decomp.
+ *
+ * @param[in] s          Seekable handle (may be NULL).
+ * @param[in] block_idx  Zero-based block index.
+ * @return Decompressed size in bytes, or 0 if @p s is NULL or @p block_idx is
+ *         out of range.
+ */
+uint32_t zxc_seekable_get_block_decomp_size(const zxc_seekable* s, const uint32_t block_idx) {
+    if (UNLIKELY(s == NULL || block_idx >= s->num_blocks)) return 0;
+    /* Bytes left in the archive from this block's start; only the last block is short. */
+    const uint64_t tail = s->total_decomp - (uint64_t)s->block_size * (uint64_t)block_idx;
+    return (tail < (uint64_t)s->block_size) ? (uint32_t)tail : s->block_size;
+}
+
+/* ========================================================================= */
+/*  Random-Access Decompression                                              */
+/* ========================================================================= */
+
+/**
+ * @brief Maps a decompressed @p offset to its containing block index (O(1)).
+ * @param[in] block_size  Fixed decompressed block size (a power of two).
+ * @param[in] offset      Absolute decompressed byte offset.
+ * @return Zero-based index of the block that holds @p offset.
+ */
+static uint32_t zxc_seek_find_block(const uint32_t block_size, const uint64_t offset) {
+    return (uint32_t)(offset / block_size); /* block_size is promoted to 64 bits here */
+}
+
+/**
+ * @brief Decompressed start offset of block @p idx (O(1)).
+ * @param[in] block_size  Fixed decompressed block size.
+ * @param[in] idx         Zero-based block index.
+ * @return Absolute decompressed byte offset where block @p idx begins.
+ */
+static uint64_t zxc_seek_decomp_offset(const uint32_t block_size, const uint32_t idx) {
+    return (uint64_t)block_size * idx; /* widen first so the product cannot wrap at 32 bits */
+}
+
+/**
+ * @brief Decompressed size of block @p idx (O(1)).
+ *
+ * Returns @p block_size for every block except the last, which holds the
+ * remainder of @p total_decomp.
+ *
+ * @param[in] block_size    Fixed decompressed block size.
+ * @param[in] total_decomp  Total decompressed archive size.
+ * @param[in] idx           Zero-based block index.
+ * @return Decompressed byte size of block @p idx.
+ */
+static uint32_t zxc_seek_decomp_size(const uint32_t block_size, const uint64_t total_decomp,
+                                     const uint32_t idx) {
+    /* Bytes from the start of block idx to the end of the stream. */
+    const uint64_t tail = total_decomp - (uint64_t)block_size * (uint64_t)idx;
+    return (tail < (uint64_t)block_size) ? (uint32_t)tail : block_size;
+}
+
+/**
+ * @brief Reads a compressed block into @p buf from the memory buffer or reader.
+ *
+ * Single-threaded path: copies from @c s->src in buffer mode, otherwise calls
+ * @c s->reader.read_at (which also backs the FILE* variant).
+ *
+ * @param[in]  s          Seekable handle.
+ * @param[in]  block_idx  Zero-based block index to read.
+ * @param[out] buf        Destination buffer.
+ * @param[in]  buf_cap    Capacity of @p buf in bytes.
+ * @return The block's compressed byte count on success, or a negative
+ *         @ref zxc_error_t (@ref ZXC_ERROR_DST_TOO_SMALL,
+ *         @ref ZXC_ERROR_SRC_TOO_SMALL, @ref ZXC_ERROR_IO).
+ */
+static int zxc_seek_read_block(const zxc_seekable* s, const uint32_t block_idx, uint8_t* buf,
+                               const size_t buf_cap) {
+    const uint32_t comp_len = s->comp_sizes[block_idx];
+    const uint64_t file_pos = s->comp_offsets[block_idx];
+    if (UNLIKELY(comp_len > buf_cap)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    if (s->src != NULL) {
+        /* In-memory archive: bounds-check, then copy straight out of the buffer. */
+        if (UNLIKELY(file_pos + comp_len > s->src_size)) return ZXC_ERROR_SRC_TOO_SMALL;
+        ZXC_MEMCPY(buf, s->src + file_pos, comp_len);
+        return (int)comp_len;
+    }
+    if (UNLIKELY(s->reader.read_at == NULL)) return ZXC_ERROR_NULL_INPUT;  // LCOV_EXCL_LINE
+
+    /* Positioned read through the caller's callback (the FILE* variant plugs in
+     * here too); a short read is an I/O error, a negative value is passed through. */
+    const int64_t got = s->reader.read_at(s->reader.ctx, buf, comp_len, file_pos);
+    if (got == (int64_t)comp_len) return (int)comp_len;
+    return (got < 0) ? (int)got : ZXC_ERROR_IO;
+}
+
+/**
+ * @brief Decompresses the byte range [@p offset, @p offset + @p len) into @p dst.
+ *
+ * Public API; full contract in @c zxc_seekable.h. Maps the range to its block
+ * span via O(1) division, decodes each covered block through a reusable,
+ * lazily-initialised, dictionary-aware context, and copies out only the
+ * requested sub-range. Single-threaded; see @ref zxc_seekable_decompress_range_mt
+ * for the parallel variant.
+ *
+ * @param[in,out] s             Seekable handle (carries the reusable context).
+ * @param[out]    dst           Destination buffer.
+ * @param[in]     dst_capacity  Capacity of @p dst (must be >= @p len).
+ * @param[in]     offset        Absolute decompressed start offset.
+ * @param[in]     len           Number of decompressed bytes to produce.
+ * @return @p len on success, or a negative @ref zxc_error_t.
+ */
+int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t dst_capacity,
+                                      const uint64_t offset, const size_t len) {
+    if (UNLIKELY(len == 0)) return 0;
+    if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL;
+    /* Overflow-safe "offset + len > total_decomp": the plain sum can wrap past the check. */
+    if (UNLIKELY(offset > s->total_decomp || len > s->total_decomp - offset))
+        return ZXC_ERROR_SRC_TOO_SMALL;
+    if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0)))
+        return ZXC_ERROR_DICT_REQUIRED;
+
+    /* Initialize decompression context on first use */
+    if (!s->dctx_initialized) {
+        // LCOV_EXCL_START
+        if (UNLIKELY(zxc_cctx_init(&s->dctx, (size_t)s->block_size, 0, 0, 0, s->dict_size) !=
+                     ZXC_OK))
+            return ZXC_ERROR_MEMORY;
+        // LCOV_EXCL_STOP
+        if (UNLIKELY(zxc_cctx_attach_dict_huf(&s->dctx, s->has_dict_huf ? s->dict_huf : NULL) !=
+                     ZXC_OK)) {
+            // LCOV_EXCL_START
+            zxc_cctx_free(&s->dctx);
+            return ZXC_ERROR_CORRUPT_DATA;
+            // LCOV_EXCL_STOP
+        }
+        s->dctx_initialized = 1;
+        if (s->dict_size > 0) ZXC_MEMCPY(s->dctx.dict_buffer, s->dict, s->dict_size);
+    }
+    s->dctx.dict_size = s->dict_size;
+
+    /* work_buf is pre-sized to block_size + ZXC_DECOMPRESS_TAIL_PAD by zxc_cctx_init above. */
+    const size_t work_sz = (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD;
+
+    /* Find block range - O(1) division; the range must stay inside comp_sizes[num_blocks]. */
+    const uint32_t blk_start = zxc_seek_find_block(s->block_size, offset);
+    const uint32_t blk_end = zxc_seek_find_block(s->block_size, offset + len - 1);
+    if (UNLIKELY(blk_end >= s->num_blocks)) return ZXC_ERROR_CORRUPT_DATA;
+
+    uint8_t* out = (uint8_t*)dst;
+    size_t remaining = len;
+
+    /* Allocate read buffer for compressed blocks */
+    size_t max_comp = 0;
+    for (uint32_t bi = blk_start; bi <= blk_end; bi++) {
+        if (s->comp_sizes[bi] > max_comp) max_comp = s->comp_sizes[bi];
+    }
+    uint8_t* const read_buf = (uint8_t*)ZXC_MALLOC(max_comp + ZXC_PAD_SIZE);
+    if (UNLIKELY(!read_buf)) return ZXC_ERROR_MEMORY;  // LCOV_EXCL_LINE
+
+    for (uint32_t bi = blk_start; bi <= blk_end; bi++) {
+        /* Read compressed block data */
+        const int read_res = zxc_seek_read_block(s, bi, read_buf, max_comp + ZXC_PAD_SIZE);
+        if (UNLIKELY(read_res < 0)) {
+            // LCOV_EXCL_START
+            ZXC_FREE(read_buf);
+            return read_res;
+            // LCOV_EXCL_STOP
+        }
+
+        /* Decompress the block: with a dictionary, decode into the cctx-owned dict_buffer
+         * (dict content prepended) so matches referencing dictionary bytes resolve naturally. */
+        uint8_t* dec_dst =
+            s->dctx.dict_buffer ? s->dctx.dict_buffer + s->dict_size : s->dctx.work_buf;
+        const int dec_res =
+            zxc_decompress_chunk_wrapper(&s->dctx, read_buf, (size_t)read_res, dec_dst, work_sz);
+        if (UNLIKELY(dec_res < 0)) {
+            // LCOV_EXCL_START
+            ZXC_FREE(read_buf);
+            return dec_res;
+            // LCOV_EXCL_STOP
+        }
+
+        /* Calculate which portion of this block's decompressed data we need */
+        const uint64_t blk_decomp_start = zxc_seek_decomp_offset(s->block_size, bi);
+        const size_t skip = (offset > blk_decomp_start) ? (size_t)(offset - blk_decomp_start) : 0;
+        if (UNLIKELY((size_t)dec_res < skip)) {
+            // LCOV_EXCL_START
+            ZXC_FREE(read_buf);
+            return ZXC_ERROR_CORRUPT_DATA;
+            // LCOV_EXCL_STOP
+        }
+        const size_t avail = (size_t)dec_res - skip;
+        const size_t copy = (avail < remaining) ? avail : remaining;
+        ZXC_MEMCPY(out, dec_dst + skip, copy);
+        out += copy;
+        remaining -= copy;
+    }
+    ZXC_FREE(read_buf);
+    /* A block that decoded short leaves dst partly unwritten: report it, do not claim len. */
+    return (remaining == 0) ? (int64_t)len : (int64_t)ZXC_ERROR_CORRUPT_DATA;
+}
+
+/* ========================================================================= */
+/*  Multi-Threaded Random-Access Decompression (Fork-Join)                   */
+/* ========================================================================= */
+
+/**
+ * @brief Per-block job descriptor for multi-threaded decompression.
+ *
+ * One descriptor is planned per covered block. The worker thread that receives
+ * a pointer to it reads and decompresses that block, copies @c copy_len bytes
+ * to @c dst and stores its status in @c result, which is inspected after join.
+ */
+typedef struct {
+    const zxc_seekable* s; /* shared handle; workers only read through it */
+    uint32_t block_idx;    /* index of the block this job decompresses */
+    uint8_t* dst;          /* where this block's slice lands in the caller's buffer */
+    size_t skip;           /* decompressed bytes to drop before the slice starts */
+    size_t copy_len;       /* number of bytes to copy into dst */
+    int result;            /* 0 = OK, < 0 = zxc_error_t code */
+} zxc_seek_mt_job_t;
+
+/**
+ * @brief Thread-safe block read backing the multi-threaded path.
+ *
+ * Like @ref zxc_seek_read_block, but takes the handle as const and is safe to
+ * call concurrently: buffer mode copies out of the const source span, reader
+ * mode issues a positioned (pread-style) @c read_at call with its own offset.
+ *
+ * @param[in]  s          Seekable handle (read-only).
+ * @param[in]  block_idx  Zero-based block index to read.
+ * @param[out] buf        Destination buffer.
+ * @param[in]  buf_cap    Capacity of @p buf in bytes.
+ * @return The block's compressed byte count on success, or a negative
+ *         @ref zxc_error_t.
+ */
+static int zxc_seek_read_block_mt(const zxc_seekable* s, const uint32_t block_idx, uint8_t* buf,
+                                  const size_t buf_cap) {
+    const uint64_t off = s->comp_offsets[block_idx];
+    const uint32_t csz = s->comp_sizes[block_idx];
+    if (UNLIKELY(csz > buf_cap)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    if (s->src) {
+        /* Buffer mode. Bound-check by subtraction: a huge table offset must not
+         * wrap the 64-bit sum and slip past the limit. */
+        if (UNLIKELY(off > s->src_size || csz > s->src_size - off)) return ZXC_ERROR_SRC_TOO_SMALL;
+        ZXC_MEMCPY(buf, s->src + off, csz);
+    } else if (s->reader.read_at) {
+        /* Reader mode - the caller-supplied read_at must itself be thread-safe
+         * (the FILE* variant installs a pread-backed callback). */
+        const int64_t r = s->reader.read_at(s->reader.ctx, buf, csz, off);
+        if (UNLIKELY(r != (int64_t)csz)) return (r < 0) ? (int)r : ZXC_ERROR_IO;
+    } else {
+        return ZXC_ERROR_NULL_INPUT;  // LCOV_EXCL_LINE
+    }
+    return (int)csz;
+}
+
+/**
+ * @brief Thread entry point that decodes a single planned block.
+ *
+ * Steps performed for the job passed in @p arg:
+ *   1. Build a private decompression context (plus dictionary, if any).
+ *   2. Fetch the compressed block with @ref zxc_seek_read_block_mt.
+ *   3. Decompress it into the context-owned scratch area.
+ *   4. Copy the planned sub-range to @c job->dst.
+ *
+ * The status lands in @c job->result (0 on success, a negative
+ * @ref zxc_error_t otherwise); the spawning thread reads it after join.
+ *
+ * @param[in,out] arg  Pointer to this worker's @ref zxc_seek_mt_job_t.
+ * @return Always NULL (the result code is reported via @c job->result).
+ */
+static void* zxc_seek_mt_worker(void* arg) {
+    zxc_seek_mt_job_t* const job = (zxc_seek_mt_job_t*)arg;
+    const zxc_seekable* const s = job->s;
+    const uint32_t blk = job->block_idx;
+
+    /* Worker-private decompression context (mode=0 for decompress-only) */
+    zxc_cctx_t ctx;
+    // LCOV_EXCL_START
+    if (UNLIKELY(zxc_cctx_init(&ctx, (size_t)s->block_size, 0, 0, 0, s->dict_size) != ZXC_OK)) {
+        job->result = ZXC_ERROR_MEMORY;
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+    if (UNLIKELY(zxc_cctx_attach_dict_huf(&ctx, s->has_dict_huf ? s->dict_huf : NULL) != ZXC_OK)) {
+        // LCOV_EXCL_START
+        zxc_cctx_free(&ctx);
+        job->result = ZXC_ERROR_CORRUPT_DATA;
+        return NULL;
+        // LCOV_EXCL_STOP
+    }
+    /* Dictionary mode decodes right behind a private copy of the dict bytes */
+    uint8_t* const bounce = ctx.dict_buffer;
+    if (bounce != NULL) ZXC_MEMCPY(bounce, s->dict, s->dict_size);
+    uint8_t* const dec_out = (bounce != NULL) ? bounce + s->dict_size : ctx.work_buf;
+    const size_t dec_cap = (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD;
+
+    /* Stage the compressed bytes in a buffer sized for this block only */
+    const uint32_t csz = s->comp_sizes[blk];
+    const size_t comp_cap = csz + ZXC_PAD_SIZE;
+    uint8_t* const comp = (uint8_t*)ZXC_MALLOC(comp_cap);
+    // LCOV_EXCL_START
+    if (UNLIKELY(comp == NULL)) {
+        zxc_cctx_free(&ctx);
+        job->result = ZXC_ERROR_MEMORY;
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+
+    const int n_read = zxc_seek_read_block_mt(s, blk, comp, comp_cap);
+    // LCOV_EXCL_START
+    if (UNLIKELY(n_read < 0)) {
+        ZXC_FREE(comp);
+        zxc_cctx_free(&ctx);
+        job->result = n_read;
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+
+    /* The compressed copy is no longer needed once the block has been decoded */
+    const int n_dec = zxc_decompress_chunk_wrapper(&ctx, comp, (size_t)n_read, dec_out, dec_cap);
+    ZXC_FREE(comp);
+
+    // LCOV_EXCL_START
+    if (UNLIKELY(n_dec < 0)) {
+        zxc_cctx_free(&ctx);
+        job->result = n_dec;
+        return NULL;
+    }
+    if (UNLIKELY((size_t)n_dec < job->skip + job->copy_len)) {
+        zxc_cctx_free(&ctx);
+        job->result = ZXC_ERROR_CORRUPT_DATA;
+        return NULL;
+    }
+    // LCOV_EXCL_STOP
+
+    /* Hand the planned slice straight to the caller's output buffer */
+    ZXC_MEMCPY(job->dst, dec_out + job->skip, job->copy_len);
+
+    zxc_cctx_free(&ctx);
+    job->result = 0;
+    return NULL;
+}
+
+/**
+ * @brief Multi-threaded variant of @ref zxc_seekable_decompress_range.
+ *
+ * Public API; full contract in @c zxc_seekable.h. Rejects ranges that do not
+ * fit inside the archive, plans one job per covered block and runs the jobs
+ * fork-join in waves of up to @p n_threads workers. Trivial spans take the
+ * single-threaded path. @p n_threads == 0 auto-detects the core count.
+ *
+ * @param[in,out] s             Seekable handle (read-only during the parallel phase).
+ * @param[out]    dst           Destination buffer.
+ * @param[in]     dst_capacity  Capacity of @p dst (must be >= @p len).
+ * @param[in]     offset        Absolute decompressed start offset.
+ * @param[in]     len           Number of decompressed bytes to produce.
+ * @param[in]     n_threads     Worker thread count; 0 = auto-detect.
+ * @return @p len on success, or the first negative @ref zxc_error_t in block order.
+ */
+int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, const size_t dst_capacity,
+                                         const uint64_t offset, const size_t len, int n_threads) {
+    if (UNLIKELY(len == 0)) return 0;
+    if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL;
+    /* Compare by subtraction: offset + len could wrap and pass a summed check */
+    if (UNLIKELY(offset > s->total_decomp || len > s->total_decomp - offset))
+        return ZXC_ERROR_SRC_TOO_SMALL;
+    if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0)))
+        return ZXC_ERROR_DICT_REQUIRED;
+
+    /* Find block range - O(1) division */
+    const uint32_t blk_start = zxc_seek_find_block(s->block_size, offset);
+    const uint32_t blk_end = zxc_seek_find_block(s->block_size, offset + len - 1);
+    const uint32_t num_jobs = blk_end - blk_start + 1;
+
+    /* Auto-detect thread count (0 = use all available cores) */
+    if (n_threads == 0) n_threads = zxc_seek_get_num_procs();
+
+    /* Fallback to single-threaded path for trivial cases */
+    if (n_threads <= 1 || num_jobs <= 1) {
+        return zxc_seekable_decompress_range(s, dst, dst_capacity, offset, len);
+    }
+
+    /* Cap threads to number of blocks and max limit */
+    if ((uint32_t)n_threads > num_jobs) n_threads = (int)num_jobs;
+    if (n_threads > ZXC_MAX_THREADS) n_threads = ZXC_MAX_THREADS;
+
+    /* Allocate job descriptors */
+    zxc_seek_mt_job_t* const jobs =
+        (zxc_seek_mt_job_t*)ZXC_CALLOC(num_jobs, sizeof(zxc_seek_mt_job_t));
+    if (UNLIKELY(!jobs)) return ZXC_ERROR_MEMORY;  // LCOV_EXCL_LINE
+
+    /* Plan jobs: compute skip, copy_len, and dst pointer for each block */
+    uint8_t* out = (uint8_t*)dst;
+    size_t remaining = len;
+    for (uint32_t i = 0; i < num_jobs; i++) {
+        const uint32_t bi = blk_start + i;
+        const uint64_t blk_decomp_start = zxc_seek_decomp_offset(s->block_size, bi);
+        const size_t skip = (offset > blk_decomp_start) ? (size_t)(offset - blk_decomp_start) : 0;
+        const size_t blk_decomp_sz = zxc_seek_decomp_size(s->block_size, s->total_decomp, bi);
+        if (UNLIKELY(blk_decomp_sz < skip)) {
+            // LCOV_EXCL_START
+            ZXC_FREE(jobs);
+            return ZXC_ERROR_CORRUPT_DATA;
+            // LCOV_EXCL_STOP
+        }
+        const size_t avail = blk_decomp_sz - skip;
+        const size_t copy = (avail < remaining) ? avail : remaining;
+
+        jobs[i].s = s;
+        jobs[i].block_idx = bi;
+        jobs[i].dst = out;
+        jobs[i].skip = skip;
+        jobs[i].copy_len = copy;
+        jobs[i].result = 0;
+
+        out += copy;
+        remaining -= copy;
+    }
+
+    /* Thread handles for one wave (fork phase reuses them every wave) */
+    zxc_thread_t* const threads =
+        (zxc_thread_t*)ZXC_MALLOC((size_t)n_threads * sizeof(zxc_thread_t));
+    // LCOV_EXCL_START
+    if (UNLIKELY(!threads)) {
+        ZXC_FREE(jobs);
+        return ZXC_ERROR_MEMORY;
+    }
+    // LCOV_EXCL_STOP
+
+    /*
+     * Waves: spawn up to n_threads workers (one job each), join all, repeat.
+     */
+    int error = 0;
+    uint32_t job_idx = 0;
+
+    while (job_idx < num_jobs && !error) {
+        const uint32_t pending = num_jobs - job_idx; /* kept unsigned: no int-cast wrap */
+        const int wave_size = (pending < (uint32_t)n_threads) ? (int)pending : n_threads;
+
+        int launched = 0;
+        for (int t = 0; t < wave_size; t++) {
+            // LCOV_EXCL_START
+            if (zxc_seek_thread_create(&threads[t], zxc_seek_mt_worker, &jobs[job_idx + t]) != 0) {
+                /* Failed to create thread - mark remaining jobs as errors */
+                for (uint32_t j = job_idx + (uint32_t)t; j < num_jobs; j++)
+                    jobs[j].result = ZXC_ERROR_MEMORY;
+                error = 1;
+                break;
+            }
+            // LCOV_EXCL_STOP
+            launched++;
+        }
+
+        /* Join phase: always reap every worker that was actually started */
+        for (int t = 0; t < launched; t++) {
+            zxc_seek_thread_join(threads[t]);
+            if (jobs[job_idx + t].result < 0) error = 1;
+        }
+
+        job_idx += (uint32_t)launched;
+    }
+
+    ZXC_FREE(threads);
+
+    /* On failure report the error of the lowest-indexed failing job */
+    int64_t result = (int64_t)len;
+    if (error) {
+        for (uint32_t i = 0; i < num_jobs; i++) {
+            if (jobs[i].result < 0) {
+                result = (int64_t)jobs[i].result;
+                break;
+            }
+        }
+    }
+
+    ZXC_FREE(jobs);
+    return result;
+}
+
+/**
+ * @brief Destroys a seekable handle together with everything it owns.
+ *
+ * Public API; see @c zxc_seekable.h. Frees the context (if one was built), the
+ * dictionary copy, both seek arrays, any owned reader context and the handle.
+ *
+ * @param[in] s  Seekable handle to release (may be NULL, then nothing happens).
+ */
+void zxc_seekable_free(zxc_seekable* s) {
+    if (s != NULL) {
+        if (s->dctx_initialized) zxc_cctx_free(&s->dctx);
+        ZXC_FREE(s->dict);
+        ZXC_FREE(s->comp_sizes);
+        ZXC_FREE(s->comp_offsets);
+        ZXC_FREE(s->owned_reader_ctx);
+        ZXC_FREE(s);
+    }
+}
+
+/**
+ * @brief Installs the dictionary needed to decode a dict-compressed archive.
+ *
+ * Public API; full contract in @c zxc_seekable.h. Validates the dict_id against
+ * the file header, then installs an owned copy of @p dict (and the optional
+ * shared literal Huffman table @p dict_huf) and drops any context already
+ * built. On any error the previously installed dictionary is left untouched.
+ *
+ * @param[in,out] s          Seekable handle.
+ * @param[in]     dict       Dictionary bytes.
+ * @param[in]     dict_size  Dictionary length (<= @c ZXC_DICT_SIZE_MAX).
+ * @param[in]     dict_huf   Optional literal Huffman table (@c ZXC_HUF_TABLE_SIZE bytes), or NULL.
+ * @return @ref ZXC_OK, or a negative @ref zxc_error_t
+ *         (@ref ZXC_ERROR_DICT_TOO_LARGE, @ref ZXC_ERROR_DICT_MISMATCH, ...).
+ */
+int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size,
+                          const void* dict_huf) {
+    if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE;
+    if (UNLIKELY(s->expected_dict_id != 0 &&
+                 zxc_dict_id(dict, dict_size, (const uint8_t*)dict_huf) != s->expected_dict_id))
+        return ZXC_ERROR_DICT_MISMATCH;
+
+    /* Copy first, commit after: if the allocation fails the handle keeps its
+     * previous dictionary; it also stays safe if dict aliases the old copy. */
+    uint8_t* const copy = (uint8_t*)ZXC_MALLOC(dict_size);
+    if (UNLIKELY(!copy)) return ZXC_ERROR_MEMORY;
+    ZXC_MEMCPY(copy, dict, dict_size);
+
+    ZXC_FREE(s->dict);
+    s->dict = copy;
+    s->dict_size = dict_size;
+    s->has_dict_huf = 0;
+    if (dict_huf) {
+        ZXC_MEMCPY(s->dict_huf, dict_huf, ZXC_HUF_TABLE_SIZE);
+        s->has_dict_huf = 1;
+    }
+
+    /* The [dict | decode] bounce buffer lives in the dctx workspace: drop the
+     * old context so the next decompress re-carves it for the new dictionary. */
+    if (s->dctx_initialized) {
+        zxc_cctx_free(&s->dctx);
+        s->dctx_initialized = 0;
+    }
+    return ZXC_OK;
+}
+
+/**
+ * @brief Hands a heap-allocated reader context over to the handle.
+ *
+ * Cross-TU hook (declared in @c zxc_internal.h) used by
+ * @ref zxc_seekable_open_file so its reader state outlives the open call;
+ * @ref zxc_seekable_free later releases @p ctx via @c ZXC_FREE. No-op if @p s is NULL.
+ *
+ * @param[in,out] s    Seekable handle (may be NULL).
+ * @param[in]     ctx  Heap pointer to hand over; freed by @ref zxc_seekable_free.
+ */
+void zxc_seekable_attach_owned_ctx(zxc_seekable* s, void* ctx) {
+    if (s == NULL) return;
+    s->owned_reader_ctx = ctx;
+}