From 61b410c5bc3ecd1157493e36dcff1ed3e34b540d Mon Sep 17 00:00:00 2001 From: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com> Date: Tue, 16 Jun 2026 06:45:40 +0200 Subject: [PATCH 1/2] feat: add byte tracking This commit introduces a new feature flag that enables tracking byte offsets. When the source-positions feature is enabled, the tokenizer will track the number of UTF-8 bytes consumed from the input so far. This is done by giving BufferQueue a `bytes_consumed` field, which is incremented every time a character is consumed. The changes in Cargo.toml were needed to make this project load as a git submodule in the Cadmus project. The xhtml-self-closing feature was needed due to EPUBs using XHTML-compatible self-closing syntax on RCDATA/RAWTEXT elements. Change-Id: 566446e2bca101b7fefdca639c1b4d26 Change-Id-Short: uttvvtlxonpy --- Cargo.toml | 8 + html5ever/Cargo.toml | 30 +- html5ever/src/tokenizer/char_ref/mod.rs | 20 +- html5ever/src/tokenizer/interface.rs | 8 + html5ever/src/tokenizer/mod.rs | 361 +++++++++++++++++- html5ever/src/tree_builder/mod.rs | 21 +- markup5ever/Cargo.toml | 23 +- markup5ever/interface/tree_builder.rs | 12 + markup5ever/util/buffer_queue.rs | 178 +++++++++ rcdom/Cargo.toml | 12 + rcdom/tests/source-positions-integration.rs | 214 +++++++++++ rcdom/tests/xhtml-self-closing-integration.rs | 123 ++++++ 12 files changed, 984 insertions(+), 26 deletions(-) create mode 100644 rcdom/tests/source-positions-integration.rs create mode 100644 rcdom/tests/xhtml-self-closing-integration.rs diff --git a/Cargo.toml b/Cargo.toml index 4714cdbc..f2a87026 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,3 +42,11 @@ libtest-mimic = "0.8.1" rand = "0.9" serde_json = "1.0" typed-arena = "2.0.2" + +# Redirect crates.io tendril/web_atoms to the local path crates so that +# markup5ever (which cannot use workspace.dependencies when loaded as a +# Cargo [patch] from outside this workspace) gets the same crate instance +# as rcdom and xml5ever. 
+[patch.crates-io] +tendril = { path = "tendril" } +web_atoms = { path = "web_atoms" } diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 3584e456..6db09321 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -5,25 +5,33 @@ documentation = "https://docs.rs/html5ever" categories = [ "parser-implementations", "web-programming" ] keywords = ["html", "html5", "parser", "parsing"] readme = "../README.md" -version.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -edition.workspace = true -rust-version.workspace = true +version = "0.39.0" +license = "MIT OR Apache-2.0" +authors = [ "The html5ever Project Developers" ] +repository = "https://github.com/servo/html5ever" +edition = "2021" +rust-version = "1.71.0" [features] trace_tokenizer = [] serde = ["markup5ever/serde"] +# Surfaces byte-accurate source positions; see markup5ever for full description. +source-positions = ["markup5ever/source-positions"] +# Honour the XML/XHTML self-closing syntax (`