From 61b410c5bc3ecd1157493e36dcff1ed3e34b540d Mon Sep 17 00:00:00 2001 From: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com> Date: Tue, 16 Jun 2026 06:45:40 +0200 Subject: [PATCH 1/2] feat: add byte tracking This commit introduced a new feature flag that enables the ability to track byte offsets. When the source-positions feature is enabled, the tokenizer will track the number of UTF-8 bytes consumed from the input so far. This is done by giving BufferQueue a `bytes_consumed` field, which is incremented every time a character is consumed. The changes in cargo.toml were needed to make this project load as a git submodule in the Cadmus project. The xhtml-self-closing feature was needed due to EPUBs using XHTML-compatible self-closing on RCDATA/RAWTEXT elements. Change-Id: 566446e2bca101b7fefdca639c1b4d26 Change-Id-Short: uttvvtlxonpy --- Cargo.toml | 8 + html5ever/Cargo.toml | 30 +- html5ever/src/tokenizer/char_ref/mod.rs | 20 +- html5ever/src/tokenizer/interface.rs | 8 + html5ever/src/tokenizer/mod.rs | 361 +++++++++++++++++- html5ever/src/tree_builder/mod.rs | 21 +- markup5ever/Cargo.toml | 23 +- markup5ever/interface/tree_builder.rs | 12 + markup5ever/util/buffer_queue.rs | 178 +++++++++ rcdom/Cargo.toml | 12 + rcdom/tests/source-positions-integration.rs | 214 +++++++++++ rcdom/tests/xhtml-self-closing-integration.rs | 123 ++++++ 12 files changed, 984 insertions(+), 26 deletions(-) create mode 100644 rcdom/tests/source-positions-integration.rs create mode 100644 rcdom/tests/xhtml-self-closing-integration.rs diff --git a/Cargo.toml b/Cargo.toml index 4714cdbc..f2a87026 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,3 +42,11 @@ libtest-mimic = "0.8.1" rand = "0.9" serde_json = "1.0" typed-arena = "2.0.2" + +# Redirect crates.io tendril/web_atoms to the local path crates so that +# markup5ever (which cannot use workspace.dependencies when loaded as a +# Cargo [patch] from outside this workspace) gets the same crate instance +# as rcdom and xml5ever. 
+[patch.crates-io] +tendril = { path = "tendril" } +web_atoms = { path = "web_atoms" } diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 3584e456..6db09321 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -5,25 +5,33 @@ documentation = "https://docs.rs/html5ever" categories = [ "parser-implementations", "web-programming" ] keywords = ["html", "html5", "parser", "parsing"] readme = "../README.md" -version.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -edition.workspace = true -rust-version.workspace = true +version = "0.39.0" +license = "MIT OR Apache-2.0" +authors = [ "The html5ever Project Developers" ] +repository = "https://github.com/servo/html5ever" +edition = "2021" +rust-version = "1.71.0" [features] trace_tokenizer = [] serde = ["markup5ever/serde"] +# Surfaces byte-accurate source positions; see markup5ever for full description. +source-positions = ["markup5ever/source-positions"] +# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on +# RCDATA and RAWTEXT elements. Without this, html5ever follows the HTML5 +# spec and treats `<title/>` as opening a RCDATA region that swallows the +# rest of the document. EPUB content is XHTML and relies on self-closing +# being honoured. 
+xhtml-self-closing = [] [dependencies] -markup5ever = { workspace = true } -memchr = { workspace = true } -log = { workspace = true } +markup5ever = { version = "0.39", path = "../markup5ever" } +memchr = "2.8.0" +log = "0.4" [dev-dependencies] -criterion = { workspace = true } -typed-arena = { workspace = true } +criterion = "0.8" +typed-arena = "2.0.2" [[bench]] name = "html5ever" diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs index e119477d..c8da81c8 100644 --- a/html5ever/src/tokenizer/char_ref/mod.rs +++ b/html5ever/src/tokenizer/char_ref/mod.rs @@ -212,7 +212,11 @@ impl CharRefTokenizer { unconsume.push_char(c) } + #[cfg(feature = "source-positions")] + let unconsume_len = unconsume.len() as u64; input.push_front(unconsume); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(unconsume_len); tokenizer.emit_error(Borrowed("Numeric character reference without digits")); Status::Done(CharRef::EMPTY) } @@ -292,7 +296,12 @@ impl CharRefTokenizer { } fn unconsume_name(&mut self, input: &BufferQueue) { - input.push_front(self.name_buf_opt.take().unwrap()); + let name_buf = self.name_buf_opt.take().unwrap(); + #[cfg(feature = "source-positions")] + let name_buf_len = name_buf.len() as u64; + input.push_front(name_buf); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(name_buf_len); } fn finish_named<Sink: TokenSink>( @@ -367,7 +376,12 @@ impl CharRefTokenizer { self.unconsume_name(input); Status::Done(CharRef::EMPTY) } else { - input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); + let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]); + #[cfg(feature = "source-positions")] + let unconsumed_len = unconsumed.len() as u64; + input.push_front(unconsumed); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(unconsumed_len); tokenizer.ignore_lf.set(false); Status::Done(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], 
@@ -419,6 +433,8 @@ impl CharRefTokenizer { }, State::Octothorpe => { input.push_front(StrTendril::from_slice("#")); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(1); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); Status::Done(CharRef::EMPTY) }, diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index b1436a71..c9ee28c9 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -130,6 +130,14 @@ pub trait TokenSink { /// Signal that tokenization reached the end of the document. fn end(&self) {} + /// Called just before each token is dispatched to [`process_token`], with the byte + /// offset where a tag, comment, doctype or character token starts (else bytes consumed so far). + /// + /// Only called when the `source-positions` feature is enabled. The + /// default implementation is a no-op. + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, _byte_offset: u64) {} + /// Used in the [markup declaration open state]. By default, this always /// returns false and thus all CDATA sections are tokenized as bogus /// comments. diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index ba9a095c..971ac0ba 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -181,6 +181,33 @@ pub struct Tokenizer<Sink> { /// Track current line current_line: Cell<u64>, + + /// Number of UTF-8 bytes consumed from the input so far. + /// + /// Kept in sync with `BufferQueue::bytes_consumed` after every character + /// is consumed. Only present when the `source-positions` feature is + /// enabled. + #[cfg(feature = "source-positions")] + current_byte: Cell<u64>, + + /// Byte offset of the first character of the current token. + /// + /// For tag, comment, and doctype tokens this is the byte of the `<` that + /// opened them — captured whenever `<` is consumed in + /// `get_preprocessed_char`. 
For character tokens it is the byte right + /// after the end of the previous token, which equals the first byte of + /// the text content — tracked via `last_token_end_byte`. + /// Only present when the `source-positions` feature is enabled. + #[cfg(feature = "source-positions")] + token_start_byte: Cell<u64>, + + /// Byte offset one past the end of the most recently emitted token. + /// + /// Updated at the end of each `process_token` call. Used as the start + /// byte for the next character token. Only present when the + /// `source-positions` feature is enabled. + #[cfg(feature = "source-positions")] + last_token_end_byte: Cell<u64>, } impl<Sink: TokenSink> Tokenizer<Sink> { @@ -216,6 +243,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> { state_profile: RefCell::new(BTreeMap::new()), time_in_sink: Cell::new(0), current_line: Cell::new(1), + #[cfg(feature = "source-positions")] + current_byte: Cell::new(0), + #[cfg(feature = "source-positions")] + token_start_byte: Cell::new(0), + #[cfg(feature = "source-positions")] + last_token_end_byte: Cell::new(0), } } @@ -243,13 +276,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> { - if self.opts.profile { + #[cfg(feature = "source-positions")] + { + let byte = match &token { + Token::TagToken(_) | Token::CommentToken(_) | Token::DoctypeToken(_) => { + self.token_start_byte.get() + }, + Token::CharacterTokens(_) => self.last_token_end_byte.get(), + _ => self.current_byte.get(), + }; + self.sink.set_current_byte(byte); + } + let result = if self.opts.profile { let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get())); self.time_in_sink.set(self.time_in_sink.get() + dt); ret } else { self.sink.process_token(token, self.current_line.get()) - } + }; + #[cfg(feature = "source-positions")] + self.last_token_end_byte.set(self.current_byte.get()); + result } fn process_token_and_continue(&self, token: Token) { @@ -292,6 +339,17 @@ impl<Sink: 
TokenSink> Tokenizer<Sink> { trace!("got character {c}"); self.current_char.set(c); + #[cfg(feature = "source-positions")] + { + let pos = input.bytes_consumed(); + if pos > 0 { + self.current_byte.set(pos); + if c == '<' { + self.token_start_byte + .set(pos.saturating_sub(c.len_utf8() as u64)); + } + } + } Some(c) } @@ -325,7 +383,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // NB: We don't set self.current_char for a run of characters not // in the set. It shouldn't matter for the codepaths that use // this. - _ => d, + other => { + #[cfg(feature = "source-positions")] + if other.is_some() { + self.current_byte.set(input.bytes_consumed()); + } + other + }, } } @@ -621,7 +685,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { if self.reconsume.get() { self.reconsume.set(false); } else { + #[cfg(not(feature = "source-positions"))] input.next(); + #[cfg(feature = "source-positions")] + { + let c = input.next(); + if let Some(c) = c { + let pos = input.bytes_consumed(); + self.current_byte.set(pos); + if c == '<' { + self.token_start_byte + .set(pos.saturating_sub(c.len_utf8() as u64)); + } + } + } } } @@ -757,6 +834,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) }; + #[cfg(feature = "source-positions")] + if let Some(ref r) = result { + let n = match r { + SetResult::NotFromSet(ref t) => t.len() as u64, + SetResult::FromSet(c) => c.len_utf8() as u64, + }; + input.advance_bytes_consumed(n); + self.current_byte.set(input.bytes_consumed()); + if let SetResult::FromSet('<') = r { + self.token_start_byte + .set(input.bytes_consumed() - '<'.len_utf8() as u64); + } + } + if front_buffer.is_empty() { drop(front_buffer); input.pop_front(); @@ -1752,6 +1843,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut(); let progress = match 
char_ref_tokenizer.as_mut().unwrap().step(self, input) { char_ref::Status::Done(char_ref) => { + #[cfg(feature = "source-positions")] + self.current_byte.set(input.bytes_consumed()); self.process_char_ref(char_ref); *char_ref_tokenizer = None; return ProcessResult::Continue; @@ -2379,3 +2472,265 @@ mod test { assert_eq!(results, expected); } } + +#[cfg(all(test, feature = "source-positions"))] +mod test_source_positions { + use crate::tendril::StrTendril; + + use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, TagToken}; + use super::interface::{EndTag, StartTag, Tag, Token}; + use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; + + use crate::LocalName; + use markup5ever::buffer_queue::BufferQueue; + use std::cell::RefCell; + + /// Records (token, byte_offset) pairs via `set_current_byte`. + struct BytesMatch { + /// Byte offset delivered by the most recent `set_current_byte` call. + current_byte: std::cell::Cell<u64>, + /// Byte offset at the start of the current character run. + /// Captured on the first `CharacterTokens` chunk; cleared after flush. + text_start_byte: std::cell::Cell<Option<u64>>, + current_str: RefCell<StrTendril>, + entries: RefCell<Vec<(Token, u64)>>, + } + + impl BytesMatch { + fn new() -> Self { + BytesMatch { + current_byte: std::cell::Cell::new(0), + text_start_byte: std::cell::Cell::new(None), + current_str: RefCell::new(StrTendril::new()), + entries: RefCell::new(vec![]), + } + } + + /// Emit the accumulated character run using the byte of its first chunk. + fn flush_chars(&self) { + let s = self.current_str.take(); + if !s.is_empty() { + let byte = self.text_start_byte.get().unwrap_or(0); + self.text_start_byte.set(None); + self.entries.borrow_mut().push((CharacterTokens(s), byte)); + } + } + } + + /// Records every token without coalescing adjacent character chunks. 
+ struct RawBytesMatch { + current_byte: std::cell::Cell<u64>, + entries: RefCell<Vec<(Token, u64)>>, + } + + impl RawBytesMatch { + fn new() -> Self { + RawBytesMatch { + current_byte: std::cell::Cell::new(0), + entries: RefCell::new(vec![]), + } + } + } + + impl TokenSink for RawBytesMatch { + type Handle = (); + + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> { + if !matches!(token, EOFToken) { + self.entries + .borrow_mut() + .push((token, self.current_byte.get())); + } + TokenSinkResult::Continue + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + impl TokenSink for BytesMatch { + type Handle = (); + + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> { + let byte = self.current_byte.get(); + match token { + CharacterTokens(b) => { + if self.text_start_byte.get().is_none() { + self.text_start_byte.set(Some(byte)); + } + self.current_str.borrow_mut().push_slice(&b); + }, + NullCharacterToken => { + self.current_str.borrow_mut().push_char('\0'); + }, + EOFToken => { + self.flush_chars(); + }, + TagToken(mut t) => { + self.flush_chars(); + if let EndTag = t.kind { + t.attrs = vec![]; + } else { + t.attrs.sort_by(|a, b| a.name.cmp(&b.name)); + } + self.entries.borrow_mut().push((TagToken(t), byte)); + }, + other => { + self.flush_chars(); + self.entries.borrow_mut().push((other, byte)); + }, + } + TokenSinkResult::Continue + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + fn tokenize_bytes(input: &str) -> Vec<(Token, u64)> { + let sink = BytesMatch::new(); + let tok = Tokenizer::new( + sink, + TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }, + ); + let buf = BufferQueue::default(); + buf.push_back(StrTendril::from(input)); + let _ = tok.feed(&buf); + tok.end(); + tok.sink.entries.take() + } 
+ + fn tokenize_raw_bytes(input: &str) -> Vec<(Token, u64)> { + let sink = RawBytesMatch::new(); + let tok = Tokenizer::new( + sink, + TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }, + ); + let buf = BufferQueue::default(); + buf.push_back(StrTendril::from(input)); + let _ = tok.feed(&buf); + tok.end(); + tok.sink.entries.take() + } + + fn start(name: &str) -> Token { + TagToken(Tag { + kind: StartTag, + name: LocalName::from(name), + self_closing: false, + attrs: vec![], + had_duplicate_attributes: false, + }) + } + + fn end(name: &str) -> Token { + TagToken(Tag { + kind: EndTag, + name: LocalName::from(name), + self_closing: false, + attrs: vec![], + had_duplicate_attributes: false, + }) + } + + fn chars(s: &str) -> Token { + CharacterTokens(StrTendril::from(s)) + } + + #[test] + fn check_byte_offsets_simple_tags() { + // <a> = bytes 0-2 → offset 0 + // <b> = bytes 3-5 → offset 3 + // </b> = bytes 6-9 → offset 6 + // </a> = bytes 10-13 → offset 10 + let entries = tokenize_bytes("<a><b></b></a>"); + assert_eq!( + entries, + vec![ + (start("a"), 0), + (start("b"), 3), + (end("b"), 6), + (end("a"), 10), + ] + ); + } + + #[test] + fn check_byte_offsets_text_content() { + // <p> = bytes 0-2 → offset 0 + // "hello" = bytes 3-7 → offset 3 (right after '>') + // </p> = bytes 8-11 → offset 8 + let entries = tokenize_bytes("<p>hello</p>"); + assert_eq!( + entries, + vec![(start("p"), 0), (chars("hello"), 3), (end("p"), 8),] + ); + } + + #[test] + fn check_byte_offsets_multibyte_text() { + // <p> = bytes 0-2 → offset 0 + // "é" = bytes 3-4 (é = 2 UTF-8 bytes) → offset 3 + // </p> = bytes 5-8 → offset 5 + let entries = tokenize_bytes("<p>é</p>"); + assert_eq!( + entries, + vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),] + ); + } + + #[test] + fn check_byte_offsets_sequential_siblings() { + // <h1> = bytes 0-3 → offset 0 + // "X" = bytes 4 → offset 4 + // </h1> = bytes 5-9 → offset 5 
+ // <p> = bytes 10-12 → offset 10 + // "Y" = bytes 13 → offset 13 + // </p> = bytes 14-17 → offset 14 + let entries = tokenize_bytes("<h1>X</h1><p>Y</p>"); + assert_eq!( + entries, + vec![ + (start("h1"), 0), + (chars("X"), 4), + (end("h1"), 5), + (start("p"), 10), + (chars("Y"), 13), + (end("p"), 14), + ] + ); + } + + #[test] + fn check_byte_offsets_entity_text_chunks() { + // <p> = bytes 0-2 → offset 0 + // "a" = byte 3 + // "&amp;" = bytes 4-8, decoded to "&" + // "b" = byte 9 + let entries = tokenize_raw_bytes("<p>a&amp;b</p>"); + assert_eq!( + entries, + vec![ + (start("p"), 0), + (chars("a"), 3), + (chars("&"), 4), + (chars("b"), 9), + (end("p"), 10), + ] + ); + } +} diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 3fcfaec3..0be385ed 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -474,6 +474,11 @@ where { type Handle = Handle; + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, byte_offset: u64) { + self.sink.set_current_byte(byte_offset); + } + fn process_token(&self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult<Handle> { if line_number != self.current_line.get() { self.sink.set_current_line(line_number); @@ -673,8 +678,22 @@ where ProcessResult::ToRawData(k) } - // The generic raw text / RCDATA parsing algorithm. + /// The generic raw text / RCDATA parsing algorithm. + /// Insert a RCDATA/RAWTEXT element and switch the tokenizer to raw-text mode. + /// + /// XHTML allows self-closing syntax (`<title/>`, `<style/>`, …) on these + /// elements. The HTML5 spec ignores the `/` and enters the raw-text state, + /// which swallows the remainder of the document until a matching end tag. + /// When the `xhtml-self-closing` feature is enabled, the self-closing flag + /// is honoured instead: an empty element is inserted and the tokenizer stays + /// in the current insertion mode. 
fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> { + #[cfg(feature = "xhtml-self-closing")] + if tag.self_closing { + self.insert_and_pop_element_for(tag); + return ProcessResult::DoneAckSelfClosing; + } + self.insert_element_for(tag); self.to_raw_text_mode(k) } diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml index 764f9a0a..79024927 100644 --- a/markup5ever/Cargo.toml +++ b/markup5ever/Cargo.toml @@ -3,20 +3,25 @@ name = "markup5ever" description = "Common code for xml5ever and html5ever" documentation = "https://docs.rs/markup5ever" categories = [ "parser-implementations", "web-programming" ] -version.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -edition.workspace = true -rust-version.workspace = true +version = "0.39.0" +license = "MIT OR Apache-2.0" +authors = [ "The html5ever Project Developers" ] +repository = "https://github.com/servo/html5ever" +edition = "2021" +rust-version = "1.71.0" [lib] path = "lib.rs" [features] serde = ["web_atoms/serde"] +# Surfaces byte-accurate source positions through the `TreeSink` interface. +# When enabled, `TreeSink::set_current_byte` is called before each tree +# mutation with the UTF-8 byte offset of the current token in the input. +# Use this to assign stable, parser-independent offsets to DOM nodes. +source-positions = [] [dependencies] -web_atoms = { workspace = true } -tendril = { workspace = true } -log = { workspace = true } +web_atoms = "0.2.4" +tendril = "0.5" +log = "0.4" diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs index e1683de0..fbc70b16 100644 --- a/markup5ever/interface/tree_builder.rs +++ b/markup5ever/interface/tree_builder.rs @@ -269,6 +269,18 @@ pub trait TreeSink { /// Called whenever the line number changes. fn set_current_line(&self, _line_number: u64) {} + /// Called whenever the source byte offset changes. 
+ /// + /// Only called when the `source-positions` feature is enabled on the + /// `html5ever` crate. For tag, comment, doctype and character tokens the + /// offset is the UTF-8 byte position in the input at which the token that + /// triggered the current tree-builder callback starts (for a tag, its `<`). + /// + /// Implement this method to obtain byte-accurate source positions for + /// nodes. The default implementation is a no-op. + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, _byte_offset: u64) {} + fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool { true } diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index d5e6864f..ca9af75c 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -18,6 +18,8 @@ //! //! [`BufferQueue`]: struct.BufferQueue.html +#[cfg(feature = "source-positions")] +use std::cell::Cell; use std::{ cell::{RefCell, RefMut}, collections::VecDeque, @@ -51,6 +53,13 @@ pub enum SetResult { pub struct BufferQueue { /// Buffers to process. buffers: RefCell<VecDeque<StrTendril>>, + /// Total number of UTF-8 bytes consumed from this queue so far. + /// + /// Only present when the `source-positions` feature is enabled. Used by + /// the tokenizer to surface byte-accurate source offsets via + /// [`TokenSink::set_current_byte`] and [`TreeSink::set_current_byte`]. + #[cfg(feature = "source-positions")] + bytes_consumed: Cell<u64>, } impl Default for BufferQueue { @@ -59,6 +68,8 @@ impl Default for BufferQueue { fn default() -> Self { Self { buffers: RefCell::new(VecDeque::with_capacity(16)), + #[cfg(feature = "source-positions")] + bytes_consumed: Cell::new(0), } } } @@ -70,6 +81,43 @@ impl BufferQueue { self.buffers.borrow().is_empty() } + /// Returns the total number of UTF-8 bytes consumed from this queue. + /// + /// Only available when the `source-positions` feature is enabled. 
The + /// value increases as characters are consumed via + /// [`next`], [`pop_except_from`], and [`eat`]. Re-queuing bytes via + /// [`push_front`] does **not** decrement the counter by itself; callers + /// that push already-counted bytes back (e.g. character-reference + /// lookahead) must also call [`retreat_bytes_consumed`]. + #[cfg(feature = "source-positions")] + #[inline] + pub fn bytes_consumed(&self) -> u64 { + self.bytes_consumed.get() + } + + /// Advance the bytes-consumed counter by `n`. + /// + /// Only available when the `source-positions` feature is enabled. + /// Used by SIMD fast paths that consume bytes directly from a tendril + /// without going through [`next`] or [`pop_except_from`]. + #[cfg(feature = "source-positions")] + #[inline] + pub fn advance_bytes_consumed(&self, n: u64) { + self.bytes_consumed.set(self.bytes_consumed.get() + n); + } + + /// Retreat the bytes-consumed counter by `n`. + /// + /// Only available when the `source-positions` feature is enabled. Used by + /// tokenizer lookahead paths that consume raw bytes, then push unmatched + /// suffix bytes back onto the queue. + #[cfg(feature = "source-positions")] + #[inline] + pub fn retreat_bytes_consumed(&self, n: u64) { + self.bytes_consumed + .set(self.bytes_consumed.get().saturating_sub(n)); + } + /// Get the buffer at the beginning of the queue. 
#[inline] pub fn pop_front(&self) -> Option<StrTendril> { @@ -146,9 +194,15 @@ impl BufferQueue { out = buf.unsafe_subtendril(0, n); buf.unsafe_pop_front(n); } + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + out.len() as u64); (Some(NotFromSet(out)), buf.is_empty()) } else { let c = buf.pop_front_char().expect("empty buffer in queue"); + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + c.len_utf8() as u64); (Some(FromSet(c)), buf.is_empty()) } }, @@ -218,6 +272,10 @@ impl BufferQueue { Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), } + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + pat.len() as u64); + Some(true) } @@ -229,6 +287,9 @@ impl BufferQueue { None => (None, false), Some(buf) => { let c = buf.pop_front_char().expect("empty buffer in queue"); + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + c.len_utf8() as u64); (Some(c), buf.is_empty()) }, }; @@ -331,3 +392,120 @@ mod test { assert_eq!(bq.next(), None); } } + +#[cfg(all(test, feature = "source-positions"))] +mod test_source_positions { + use tendril::SliceExt; + + use super::BufferQueue; + use super::SetResult::{FromSet, NotFromSet}; + + #[test] + fn next_advances_counter_by_utf8_width() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); + + // ASCII: 1 byte each + bq.push_back("abc".to_tendril()); + bq.next(); + assert_eq!(bq.bytes_consumed(), 1); + bq.next(); + assert_eq!(bq.bytes_consumed(), 2); + bq.next(); + assert_eq!(bq.bytes_consumed(), 3); + + // Multibyte: 'é' is 2 bytes (U+00E9, encoded as 0xC3 0xA9) + bq.push_back("é".to_tendril()); + bq.next(); + assert_eq!(bq.bytes_consumed(), 5); + } + + #[test] + fn pop_except_from_bulk_advances_counter() { + let bq = BufferQueue::default(); + // "abc" are not in the set; '&' is + bq.push_back("abc&def".to_tendril()); + let set = 
small_char_set!('&'); + + // Bulk NotFromSet: 3 bytes consumed + assert_eq!( + bq.pop_except_from(set), + Some(NotFromSet("abc".to_tendril())) + ); + assert_eq!(bq.bytes_consumed(), 3); + + // Single FromSet '&': 1 byte consumed + assert_eq!(bq.pop_except_from(set), Some(FromSet('&'))); + assert_eq!(bq.bytes_consumed(), 4); + + // Bulk NotFromSet: 3 more bytes + assert_eq!( + bq.pop_except_from(set), + Some(NotFromSet("def".to_tendril())) + ); + assert_eq!(bq.bytes_consumed(), 7); + } + + #[test] + fn pop_except_from_multibyte_bulk_advances_by_byte_len() { + // "café" is 5 bytes (c=1, a=1, f=1, é=2). '&' terminates the bulk. + // Confirms NotFromSet advances by the byte length of the tendril slice, + // not by the character count. + let bq = BufferQueue::default(); + bq.push_back("café&".to_tendril()); + let set = small_char_set!('&'); + + let result = bq.pop_except_from(set); + assert!(matches!(result, Some(NotFromSet(_)))); + // 'c'=1 + 'a'=1 + 'f'=1 + 'é'=2 = 5 bytes + assert_eq!(bq.bytes_consumed(), 5); + } + + #[test] + fn eat_advances_counter_on_match_not_on_no_match() { + let bq = BufferQueue::default(); + bq.push_back("abcdef".to_tendril()); + + // No match: counter unchanged + assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); + assert_eq!(bq.bytes_consumed(), 0); + + // Match "abc": counter advances by 3 + assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.bytes_consumed(), 3); + + // Match "def": counter advances by 3 more + assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.bytes_consumed(), 6); + } + + #[test] + fn push_front_does_not_decrement_counter() { + let bq = BufferQueue::default(); + bq.push_back("abc".to_tendril()); + bq.next(); // consume 'a' → 1 + bq.next(); // consume 'b' → 2 + assert_eq!(bq.bytes_consumed(), 2); + + // Re-queue something — counter must not decrease + bq.push_front("xy".to_tendril()); + assert_eq!(bq.bytes_consumed(), 2); + + // Consuming the 
re-queued bytes advances further + bq.next(); // 'x' → 3 + bq.next(); // 'y' → 4 + assert_eq!(bq.bytes_consumed(), 4); + } + + #[test] + fn advance_bytes_consumed_adds_exactly() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); + + bq.advance_bytes_consumed(7); + assert_eq!(bq.bytes_consumed(), 7); + + bq.advance_bytes_consumed(3); + assert_eq!(bq.bytes_consumed(), 10); + } +} diff --git a/rcdom/Cargo.toml b/rcdom/Cargo.toml index caf52b54..30dab873 100644 --- a/rcdom/Cargo.toml +++ b/rcdom/Cargo.toml @@ -20,6 +20,10 @@ markup5ever = { workspace = true, features = ["serde"] } tendril = { workspace = true } xml5ever = { workspace = true } +[features] +source-positions = ["html5ever/source-positions"] +xhtml-self-closing = ["html5ever/xhtml-self-closing"] + [dev-dependencies] criterion = { workspace = true } env_logger = { workspace = true } @@ -45,3 +49,11 @@ harness = false [[test]] name = "xml-tokenizer" harness = false + +[[test]] +name = "source-positions-integration" +required-features = ["source-positions"] + +[[test]] +name = "xhtml-self-closing-integration" +required-features = ["xhtml-self-closing"] diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs new file mode 100644 index 00000000..052c63e1 --- /dev/null +++ b/rcdom/tests/source-positions-integration.rs @@ -0,0 +1,214 @@ +// Copyright 2014-2026 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Integration tests for the `source-positions` feature. +//! +//! Verifies that byte offsets flow correctly from `BufferQueue` through the +//! 
tokenizer and tree builder all the way into `TreeSink::set_current_byte`, +//! and that the offsets correspond to the actual positions of element opening +//! tags in the source string. + +#[cfg(feature = "source-positions")] +mod source_positions { + use html5ever::driver; + use html5ever::tendril::stream::TendrilSink; + use html5ever::tendril::StrTendril; + use html5ever::ExpandedName; + use html5ever::QualName; + use markup5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; + use markup5ever::Attribute; + use markup5ever_rcdom::{Handle, RcDom}; + use std::borrow::Cow; + use std::cell::{Cell, RefCell}; + + /// Wraps `RcDom` and records `(local_name, byte_offset)` for every + /// element created while `set_current_byte` is active. + struct ByteCapturingDOM { + current_byte: Cell<u64>, + elements: RefCell<Vec<(String, u64)>>, + rcdom: RcDom, + } + + impl ByteCapturingDOM { + fn new() -> Self { + ByteCapturingDOM { + current_byte: Cell::new(0), + elements: RefCell::new(vec![]), + rcdom: RcDom::default(), + } + } + + /// Returns recorded `(local_name, byte_offset)` pairs, skipping the + /// implicit wrapper elements html5ever inserts (`html`, `head`, `body`). 
+ fn content_elements(&self) -> Vec<(String, u64)> { + self.elements + .borrow() + .iter() + .filter(|(name, _)| !matches!(name.as_str(), "html" | "head" | "body")) + .cloned() + .collect() + } + } + + impl TreeSink for ByteCapturingDOM { + type Output = Self; + type ElemName<'a> = ExpandedName<'a>; + + fn finish(self) -> Self { + self + } + + type Handle = Handle; + + fn parse_error(&self, msg: Cow<'static, str>) { + self.rcdom.parse_error(msg); + } + + fn get_document(&self) -> Handle { + self.rcdom.get_document() + } + + fn get_template_contents(&self, target: &Handle) -> Handle { + self.rcdom.get_template_contents(target) + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.rcdom.set_quirks_mode(mode) + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + self.rcdom.same_node(x, y) + } + + fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> { + self.rcdom.elem_name(target) + } + + fn create_element( + &self, + name: QualName, + attrs: Vec<Attribute>, + flags: ElementFlags, + ) -> Handle { + self.elements + .borrow_mut() + .push((name.local.to_string(), self.current_byte.get())); + self.rcdom.create_element(name, attrs, flags) + } + + fn create_comment(&self, text: StrTendril) -> Handle { + self.rcdom.create_comment(text) + } + + fn create_pi(&self, target: StrTendril, content: StrTendril) -> Handle { + self.rcdom.create_pi(target, content) + } + + fn append(&self, parent: &Handle, child: NodeOrText<Handle>) { + self.rcdom.append(parent, child) + } + + fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) { + self.rcdom.append_before_sibling(sibling, child) + } + + fn append_based_on_parent_node( + &self, + element: &Handle, + prev_element: &Handle, + child: NodeOrText<Handle>, + ) { + self.rcdom + .append_based_on_parent_node(element, prev_element, child) + } + + fn append_doctype_to_document( + &self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + self.rcdom + 
.append_doctype_to_document(name, public_id, system_id); + } + + fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) { + self.rcdom.add_attrs_if_missing(target, attrs); + } + + fn remove_from_parent(&self, target: &Handle) { + self.rcdom.remove_from_parent(target); + } + + fn reparent_children(&self, node: &Handle, new_parent: &Handle) { + self.rcdom.reparent_children(node, new_parent); + } + + fn mark_script_already_started(&self, target: &Handle) { + self.rcdom.mark_script_already_started(target); + } + + fn set_current_line(&self, line_number: u64) { + self.rcdom.set_current_line(line_number); + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + fn parse(input: &str) -> ByteCapturingDOM { + let sink = ByteCapturingDOM::new(); + driver::parse_document(sink, Default::default()).one(StrTendril::from(input)) + } + + #[test] + fn element_byte_offsets_match_source_positions() { + // <p> starts at byte 0 + // <div> starts at byte 14 ("<p>hello</p>" = 12 chars + 2 for "</p>") + // <p>hello</p> = 12 bytes, </p> = 4 bytes → <div> at 16? Let's be precise: + // "<p>hello</p><div>world</div>" + // 0123456789012345678901234567 + // <p> = 0, </p> = 8, <div> = 12 + let result = parse("<p>hello</p><div>world</div>"); + let elems = result.content_elements(); + + assert_eq!(elems.len(), 2, "expected p and div, got: {:?}", elems); + assert_eq!(elems[0], ("p".to_string(), 0)); + assert_eq!(elems[1], ("div".to_string(), 12)); + } + + #[test] + fn nested_element_byte_offset() { + // "<div><span>x</span></div>" + // 01234567890123456789... 
+ // <div> = 0, <span> = 5 + let result = parse("<div><span>x</span></div>"); + let elems = result.content_elements(); + + assert_eq!(elems.len(), 2, "expected div and span, got: {:?}", elems); + assert_eq!(elems[0], ("div".to_string(), 0)); + assert_eq!(elems[1], ("span".to_string(), 5)); + } + + #[test] + fn multibyte_content_does_not_shift_subsequent_offsets() { + // "<p>café</p><span>next</span>" + // 'é' = 2 bytes, so: + // <p> = byte 0 + // </p> = byte 3+5 = byte 8 ("café" = c(1)+a(1)+f(1)+é(2) = 5 bytes) + // <span> = byte 8 + 4 = byte 12 ("</p>" = 4 bytes) + let result = parse("<p>café</p><span>next</span>"); + let elems = result.content_elements(); + + assert_eq!(elems.len(), 2, "expected p and span, got: {:?}", elems); + assert_eq!(elems[0], ("p".to_string(), 0)); + assert_eq!(elems[1], ("span".to_string(), 12)); + } +} diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs new file mode 100644 index 00000000..c49aaf69 --- /dev/null +++ b/rcdom/tests/xhtml-self-closing-integration.rs @@ -0,0 +1,123 @@ +// Copyright 2014-2026 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Integration tests for the `xhtml-self-closing` feature. +//! +//! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT +//! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`, +//! html5ever treats these as opening tags and enters raw-text mode, consuming +//! the rest of the document. These tests verify the feature makes parsing +//! behave as XHTML authors expect. 
+ +#[cfg(feature = "xhtml-self-closing")] +mod xhtml_self_closing { + use html5ever::driver; + use html5ever::tendril::stream::TendrilSink; + use html5ever::tendril::StrTendril; + use markup5ever_rcdom::{NodeData, RcDom}; + + fn parse(input: &str) -> RcDom { + driver::parse_document(RcDom::default(), Default::default()).one(StrTendril::from(input)) + } + + /// Walk the tree and collect all element names. + fn element_names(node: &markup5ever_rcdom::Handle) -> Vec<String> { + let mut names = Vec::new(); + collect_names(node, &mut names); + names + } + + fn collect_names(node: &markup5ever_rcdom::Handle, out: &mut Vec<String>) { + if let NodeData::Element { ref name, .. } = node.data { + out.push(name.local.to_string()); + } + for child in node.children.borrow().iter() { + collect_names(child, out); + } + } + + /// Return the text content of the first element with the given local name. + fn text_of(dom: &RcDom, tag: &str) -> Option<String> { + find_text(&dom.document, tag) + } + + fn find_text(node: &markup5ever_rcdom::Handle, tag: &str) -> Option<String> { + if let NodeData::Element { ref name, .. } = node.data { + if name.local.as_ref() == tag { + let mut text = String::new(); + for child in node.children.borrow().iter() { + if let NodeData::Text { ref contents } = child.data { + text.push_str(&contents.borrow()); + } + } + return Some(text); + } + } + for child in node.children.borrow().iter() { + if let Some(t) = find_text(child, tag) { + return Some(t); + } + } + None + } + + #[test] + fn self_closing_title_does_not_swallow_body() { + // Without the feature <title/> opens a RCDATA region that swallows + // everything up to the next . With it, is empty and + // the body parses normally. 
+ let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>"); + let names = element_names(&dom.document); + + assert!( + names.contains(&"body".to_string()), + "body element should be present; got: {:?}", + names + ); + assert!( + names.contains(&"p".to_string()), + "p element inside body should be present; got: {:?}", + names + ); + + let text = text_of(&dom, "p"); + assert_eq!( + text.as_deref(), + Some("visible"), + "<p> text should be 'visible', got: {:?}", + text + ); + } + + #[test] + fn self_closing_style_does_not_swallow_body() { + let dom = parse("<html><head><style/></head><body><p>content</p></body></html>"); + let names = element_names(&dom.document); + + assert!( + names.contains(&"p".to_string()), + "p element should not be swallowed by <style/>; got: {:?}", + names + ); + } + + #[test] + fn normal_closed_title_still_captures_rcdata_text() { + // A properly-closed <title>… must still capture its RCDATA + // content — the feature must not break normal title parsing. 
+ let dom = parse("My Book"); + let text = text_of(&dom, "title"); + assert_eq!( + text.as_deref(), + Some("My Book"), + "title text should be 'My Book', got: {:?}", + text + ); + } +} From 56499d6166445f0a8f25a4f43e539d54bb2db6f5 Mon Sep 17 00:00:00 2001 From: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com> Date: Thu, 18 Jun 2026 15:33:43 +0200 Subject: [PATCH 2/2] chore: some brooming Change-Id: 1cd53437f8710e57e9a9bb17f38bc2fc Change-Id-Short: ynmuwvwskrsy --- Cargo.toml | 4 +- html5ever/Cargo.toml | 6 +- html5ever/src/tokenizer/interface.rs | 3 +- html5ever/src/tokenizer/mod.rs | 46 +----- html5ever/src/tree_builder/mod.rs | 9 +- markup5ever/interface/tree_builder.rs | 14 +- markup5ever/util/buffer_queue.rs | 102 +++++++----- rcdom/tests/source-positions-integration.rs | 147 ++++++++++-------- rcdom/tests/xhtml-self-closing-integration.rs | 17 +- 9 files changed, 164 insertions(+), 184 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f2a87026..141c9124 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,9 +44,7 @@ serde_json = "1.0" typed-arena = "2.0.2" # Redirect crates.io tendril/web_atoms to the local path crates so that -# markup5ever (which cannot use workspace.dependencies when loaded as a -# Cargo [patch] from outside this workspace) gets the same crate instance -# as rcdom and xml5ever. +# markup5ever gets the same crate instance as rcdom and xml5ever. [patch.crates-io] tendril = { path = "tendril" } web_atoms = { path = "web_atoms" } diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 6db09321..23b752b0 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -18,10 +18,8 @@ serde = ["markup5ever/serde"] # Surfaces byte-accurate source positions; see markup5ever for full description. source-positions = ["markup5ever/source-positions"] # Honour the XML/XHTML self-closing syntax (``, `<style/>`, …) on -# RCDATA and RAWTEXT elements. 
Without this, html5ever follows the HTML5 -# spec and treats `<title/>` as opening a RCDATA region that swallows the -# rest of the document. EPUB content is XHTML and relies on self-closing -# being honoured. +# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>` +# as opening a RCDATA region that swallows the rest of the document. xhtml-self-closing = [] [dependencies] diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index c9ee28c9..97437809 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -133,8 +133,7 @@ pub trait TokenSink { /// Called just before each token is dispatched to [`process_token`], /// with the number of UTF-8 bytes consumed from the input so far. /// - /// Only called when the `source-positions` feature is enabled. The - /// default implementation is a no-op. + /// The default implementation is a no-op. #[cfg(feature = "source-positions")] fn set_current_byte(&self, _byte_offset: u64) {} diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 971ac0ba..8b0d473a 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -185,27 +185,24 @@ pub struct Tokenizer<Sink> { /// Number of UTF-8 bytes consumed from the input so far. /// /// Kept in sync with `BufferQueue::bytes_consumed` after every character - /// is consumed. Only present when the `source-positions` feature is - /// enabled. + /// is consumed. #[cfg(feature = "source-positions")] current_byte: Cell<u64>, /// Byte offset of the first character of the current token. /// /// For tag, comment, and doctype tokens this is the byte of the `<` that - /// opened them — captured whenever `<` is consumed in - /// `get_preprocessed_char`. For character tokens it is the byte right - /// after the end of the previous token, which equals the first byte of - /// the text content — tracked via `last_token_end_byte`. 
- /// Only present when the `source-positions` feature is enabled. + /// opened them, captured whenever `<` is consumed in `get_preprocessed_char`. + /// + /// For character tokens it is the byte right after the end of the previous token, + /// which equals the first byte of the text content, this is tracked via `last_token_end_byte`. #[cfg(feature = "source-positions")] token_start_byte: Cell<u64>, /// Byte offset one past the end of the most recently emitted token. /// /// Updated at the end of each `process_token` call. Used as the start - /// byte for the next character token. Only present when the - /// `source-positions` feature is enabled. + /// byte for the next character token. #[cfg(feature = "source-positions")] last_token_end_byte: Cell<u64>, } @@ -2487,10 +2484,7 @@ mod test_source_positions { /// Records (token, byte_offset) pairs via `set_current_byte`. struct BytesMatch { - /// Byte offset delivered by the most recent `set_current_byte` call. current_byte: std::cell::Cell<u64>, - /// Byte offset at the start of the current character run. - /// Captured on the first `CharacterTokens` chunk; cleared after flush. 
text_start_byte: std::cell::Cell<Option<u64>>, current_str: RefCell<StrTendril>, entries: RefCell<Vec<(Token, u64)>>, @@ -2567,13 +2561,8 @@ mod test_source_positions { EOFToken => { self.flush_chars(); }, - TagToken(mut t) => { + TagToken(t) => { self.flush_chars(); - if let EndTag = t.kind { - t.attrs = vec![]; - } else { - t.attrs.sort_by(|a, b| a.name.cmp(&b.name)); - } self.entries.borrow_mut().push((TagToken(t), byte)); }, other => { @@ -2653,10 +2642,6 @@ mod test_source_positions { #[test] fn check_byte_offsets_simple_tags() { - // <a> = bytes 0-2 → offset 0 - // <b> = bytes 3-5 → offset 3 - // </b> = bytes 6-9 → offset 6 - // </a> = bytes 10-13 → offset 10 let entries = tokenize_bytes("<a><b></b></a>"); assert_eq!( entries, @@ -2671,9 +2656,6 @@ mod test_source_positions { #[test] fn check_byte_offsets_text_content() { - // <p> = bytes 0-2 → offset 0 - // "hello" = bytes 3-7 → offset 3 (right after '>') - // </p> = bytes 8-11 → offset 8 let entries = tokenize_bytes("<p>hello</p>"); assert_eq!( entries, @@ -2683,24 +2665,14 @@ mod test_source_positions { #[test] fn check_byte_offsets_multibyte_text() { - // <p> = bytes 0-2 → offset 0 - // "é" = bytes 3-4 (é = 2 UTF-8 bytes) → offset 3 - // </p> = bytes 5-8 → offset 5 let entries = tokenize_bytes("<p>é</p>"); assert_eq!( entries, vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),] ); } - #[test] fn check_byte_offsets_sequential_siblings() { - // <h1> = bytes 0-3 → offset 0 - // "X" = bytes 4 → offset 4 - // </h1> = bytes 5-9 → offset 5 - // <p> = bytes 10-12 → offset 10 - // "Y" = bytes 13 → offset 13 - // </p> = bytes 14-17 → offset 14 let entries = tokenize_bytes("<h1>X</h1><p>Y</p>"); assert_eq!( entries, @@ -2717,10 +2689,6 @@ mod test_source_positions { #[test] fn check_byte_offsets_entity_text_chunks() { - // <p> = bytes 0-2 → offset 0 - // "a" = byte 3 - // "&amp;" = bytes 4-8, decoded to "&" - // "b" = byte 9 let entries = tokenize_raw_bytes("<p>a&amp;b</p>"); assert_eq!( entries, diff --git 
a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 0be385ed..bf28847a 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -681,12 +681,9 @@ where /// The generic raw text / RCDATA parsing algorithm. /// Insert a RCDATA/RAWTEXT element and switch the tokenizer to raw-text mode. /// - /// XHTML allows self-closing syntax (`<title/>`, `<style/>`, …) on these - /// elements. The HTML5 spec ignores the `/` and enters the raw-text state, - /// which swallows the remainder of the document until a matching end tag. - /// When the `xhtml-self-closing` feature is enabled, the self-closing flag - /// is honoured instead: an empty element is inserted and the tokenizer stays - /// in the current insertion mode. + /// When the `xhtml-self-closing` feature is enabled, (`<title/>`, `<style/>`, …) + /// are treated as empty elements instead of invalid HTML which ends up + /// swallowing all the content that comes after it. fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> { #[cfg(feature = "xhtml-self-closing")] if tag.self_closing { diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs index fbc70b16..44803457 100644 --- a/markup5ever/interface/tree_builder.rs +++ b/markup5ever/interface/tree_builder.rs @@ -269,15 +269,15 @@ pub trait TreeSink { /// Called whenever the line number changes. fn set_current_line(&self, _line_number: u64) {} - /// Called whenever the source byte offset changes. + /// Called before a tree-builder callback with the source byte offset for the + /// token or text segment that triggered it. /// - /// Only called when the `source-positions` feature is enabled on the - /// `html5ever` crate. The offset is the number of UTF-8 bytes consumed - /// from the input up to and including the last character of the token - /// that just triggered the current tree-builder callback. 
+ /// For start tags, end tags, comments, and doctypes this is the UTF-8 byte + /// offset of the token's first byte in the original input. For character + /// tokens this is the UTF-8 byte offset of the first byte in the current text + /// segment. /// - /// Implement this method to obtain byte-accurate source positions for - /// nodes. The default implementation is a no-op. + /// The default implementation is a no-op. #[cfg(feature = "source-positions")] fn set_current_byte(&self, _byte_offset: u64) {} diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index ca9af75c..4099ff60 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -55,8 +55,7 @@ pub struct BufferQueue { buffers: RefCell<VecDeque<StrTendril>>, /// Total number of UTF-8 bytes consumed from this queue so far. /// - /// Only present when the `source-positions` feature is enabled. Used by - /// the tokenizer to surface byte-accurate source offsets via + /// Used by the tokenizer to surface byte-accurate source offsets via /// [`TokenSink::set_current_byte`] and [`TreeSink::set_current_byte`]. #[cfg(feature = "source-positions")] bytes_consumed: Cell<u64>, @@ -83,12 +82,11 @@ impl BufferQueue { /// Returns the total number of UTF-8 bytes consumed from this queue. /// - /// Only available when the `source-positions` feature is enabled. The - /// value monotonically increases as characters are consumed via + /// The value monotonically increases as characters are consumed via /// [`next`], [`pop_except_from`], and [`eat`]. Re-queuing bytes via - /// [`push_front`] does **not** decrement the counter — the tokenizer - /// uses its own `reconsume` flag for single-character look-back and - /// never actually re-pushes bytes that were already counted. + /// [`push_front`] does **not** decrement the counter. + /// + /// To reduce bytes_consumed, use [`retreat_bytes_consumed`]. 
#[cfg(feature = "source-positions")] #[inline] pub fn bytes_consumed(&self) -> u64 { @@ -97,9 +95,7 @@ impl BufferQueue { /// Advance the bytes-consumed counter by `n`. /// - /// Only available when the `source-positions` feature is enabled. - /// Used by SIMD fast paths that consume bytes directly from a tendril - /// without going through [`next`] or [`pop_except_from`]. + /// Use this to manually advance the counter when bypassing: [`next`], [`pop_except_from`], and [`eat`] #[cfg(feature = "source-positions")] #[inline] pub fn advance_bytes_consumed(&self, n: u64) { @@ -108,8 +104,7 @@ impl BufferQueue { /// Retreat the bytes-consumed counter by `n`. /// - /// Only available when the `source-positions` feature is enabled. Used by - /// tokenizer lookahead paths that consume raw bytes, then push unmatched + /// Used by tokenizer lookahead paths that consume raw bytes, then push unmatched /// suffix bytes back onto the queue. #[cfg(feature = "source-positions")] #[inline] @@ -401,11 +396,10 @@ mod test_source_positions { use super::SetResult::{FromSet, NotFromSet}; #[test] - fn next_advances_counter_by_utf8_width() { + fn next_advances_counter_by_utf8_width_single() { let bq = BufferQueue::default(); assert_eq!(bq.bytes_consumed(), 0); - // ASCII: 1 byte each bq.push_back("abc".to_tendril()); bq.next(); assert_eq!(bq.bytes_consumed(), 1); @@ -413,87 +407,99 @@ mod test_source_positions { assert_eq!(bq.bytes_consumed(), 2); bq.next(); assert_eq!(bq.bytes_consumed(), 3); + } + + #[test] + fn next_advances_counter_by_utf8_width_double() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); - // Multibyte: 'é' is 2 bytes (U+00E9, encoded as 0xC3 0xA9) bq.push_back("é".to_tendril()); bq.next(); - assert_eq!(bq.bytes_consumed(), 5); + assert_eq!(bq.bytes_consumed(), 2); } #[test] - fn pop_except_from_bulk_advances_counter() { + fn pop_except_from_not_from_set_advances_counter() { let bq = BufferQueue::default(); - // "abc" are not in the set; '&' is 
- bq.push_back("abc&def".to_tendril()); + bq.push_back("abc&".to_tendril()); let set = small_char_set!('&'); - // Bulk NotFromSet: 3 bytes consumed assert_eq!( bq.pop_except_from(set), Some(NotFromSet("abc".to_tendril())) ); assert_eq!(bq.bytes_consumed(), 3); + } + + #[test] + fn pop_except_from_from_set_advances_counter() { + let bq = BufferQueue::default(); + bq.push_back("&def".to_tendril()); + let set = small_char_set!('&'); - // Single FromSet '&': 1 byte consumed assert_eq!(bq.pop_except_from(set), Some(FromSet('&'))); + assert_eq!(bq.bytes_consumed(), 1); + } + + #[test] + fn pop_except_from_successive_calls_accumulate_counter() { + let bq = BufferQueue::default(); + bq.push_back("abc&def".to_tendril()); + let set = small_char_set!('&'); + + bq.pop_except_from(set); + assert_eq!(bq.bytes_consumed(), 3); + + bq.pop_except_from(set); assert_eq!(bq.bytes_consumed(), 4); - // Bulk NotFromSet: 3 more bytes - assert_eq!( - bq.pop_except_from(set), - Some(NotFromSet("def".to_tendril())) - ); + bq.pop_except_from(set); assert_eq!(bq.bytes_consumed(), 7); } #[test] fn pop_except_from_multibyte_bulk_advances_by_byte_len() { - // "café" is 5 bytes (c=1, a=1, f=1, é=2). '&' terminates the bulk. - // Confirms NotFromSet advances by the byte length of the tendril slice, - // not by the character count. 
let bq = BufferQueue::default(); bq.push_back("café&".to_tendril()); let set = small_char_set!('&'); let result = bq.pop_except_from(set); assert!(matches!(result, Some(NotFromSet(_)))); - // 'c'=1 + 'a'=1 + 'f'=1 + 'é'=2 = 5 bytes assert_eq!(bq.bytes_consumed(), 5); } #[test] - fn eat_advances_counter_on_match_not_on_no_match() { + fn eat_advances_counter_accordingly() { let bq = BufferQueue::default(); bq.push_back("abcdef".to_tendril()); - // No match: counter unchanged assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); assert_eq!(bq.bytes_consumed(), 0); - // Match "abc": counter advances by 3 assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true)); assert_eq!(bq.bytes_consumed(), 3); - // Match "def": counter advances by 3 more assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true)); assert_eq!(bq.bytes_consumed(), 6); } #[test] + /// This test is to ensure the behaviour contract of push_front is kept. + /// There are use cases where pushing front should technically not retreat the + /// bytes counter, so it's up to the caller to decide if pushing front should retreat. 
fn push_front_does_not_decrement_counter() { let bq = BufferQueue::default(); bq.push_back("abc".to_tendril()); - bq.next(); // consume 'a' → 1 - bq.next(); // consume 'b' → 2 + bq.next(); + bq.next(); assert_eq!(bq.bytes_consumed(), 2); - // Re-queue something — counter must not decrease bq.push_front("xy".to_tendril()); assert_eq!(bq.bytes_consumed(), 2); - // Consuming the re-queued bytes advances further - bq.next(); // 'x' → 3 - bq.next(); // 'y' → 4 + bq.next(); + bq.next(); assert_eq!(bq.bytes_consumed(), 4); } @@ -508,4 +514,20 @@ mod test_source_positions { bq.advance_bytes_consumed(3); assert_eq!(bq.bytes_consumed(), 10); } + + #[test] + fn retreat_bytes_consumed_subtracts_exactly() { + let bq = BufferQueue::default(); + bq.advance_bytes_consumed(10); + assert_eq!(bq.bytes_consumed(), 10); + + bq.retreat_bytes_consumed(3); + assert_eq!(bq.bytes_consumed(), 7); + + bq.retreat_bytes_consumed(7); + assert_eq!(bq.bytes_consumed(), 0); + + bq.retreat_bytes_consumed(5); + assert_eq!(bq.bytes_consumed(), 0); + } } diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs index 052c63e1..34ebe25c 100644 --- a/rcdom/tests/source-positions-integration.rs +++ b/rcdom/tests/source-positions-integration.rs @@ -1,18 +1,16 @@ -// Copyright 2014-2026 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - //! Integration tests for the `source-positions` feature. //! //! Verifies that byte offsets flow correctly from `BufferQueue` through the //! tokenizer and tree builder all the way into `TreeSink::set_current_byte`, //! 
and that the offsets correspond to the actual positions of element opening //! tags in the source string. +//! +//! 2 Critical behaviours are under test: +//! +//! 1. When no explicit <head>,<html>,<body> tags are part of the payload +//! they get injected implicitly, they should not skew the byte offset. +//! 2. When the above tags are explicitly part of the payload, they should be part +//! of the count. #[cfg(feature = "source-positions")] mod source_positions { @@ -28,7 +26,9 @@ mod source_positions { use std::cell::{Cell, RefCell}; /// Wraps `RcDom` and records `(local_name, byte_offset)` for every - /// element created while `set_current_byte` is active. + /// element created. + /// + /// These are then later used for assertions. struct ByteCapturingDOM { current_byte: Cell<u64>, elements: RefCell<Vec<(String, u64)>>, @@ -44,28 +44,21 @@ mod source_positions { } } - /// Returns recorded `(local_name, byte_offset)` pairs, skipping the - /// implicit wrapper elements html5ever inserts (`html`, `head`, `body`). 
fn content_elements(&self) -> Vec<(String, u64)> { - self.elements - .borrow() - .iter() - .filter(|(name, _)| !matches!(name.as_str(), "html" | "head" | "body")) - .cloned() - .collect() + self.elements.borrow().clone() } } impl TreeSink for ByteCapturingDOM { + type Handle = Handle; type Output = Self; + type ElemName<'a> = ExpandedName<'a>; fn finish(self) -> Self { self } - type Handle = Handle; - fn parse_error(&self, msg: Cow<'static, str>) { self.rcdom.parse_error(msg); } @@ -74,18 +67,6 @@ mod source_positions { self.rcdom.get_document() } - fn get_template_contents(&self, target: &Handle) -> Handle { - self.rcdom.get_template_contents(target) - } - - fn set_quirks_mode(&self, mode: QuirksMode) { - self.rcdom.set_quirks_mode(mode) - } - - fn same_node(&self, x: &Handle, y: &Handle) -> bool { - self.rcdom.same_node(x, y) - } - fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> { self.rcdom.elem_name(target) } @@ -114,10 +95,6 @@ mod source_positions { self.rcdom.append(parent, child) } - fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) { - self.rcdom.append_before_sibling(sibling, child) - } - fn append_based_on_parent_node( &self, element: &Handle, @@ -138,6 +115,22 @@ mod source_positions { .append_doctype_to_document(name, public_id, system_id); } + fn get_template_contents(&self, target: &Handle) -> Handle { + self.rcdom.get_template_contents(target) + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + self.rcdom.same_node(x, y) + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.rcdom.set_quirks_mode(mode) + } + + fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) { + self.rcdom.append_before_sibling(sibling, child) + } + fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) { self.rcdom.add_attrs_if_missing(target, attrs); } @@ -150,14 +143,6 @@ mod source_positions { self.rcdom.reparent_children(node, new_parent); } - fn 
mark_script_already_started(&self, target: &Handle) { - self.rcdom.mark_script_already_started(target); - } - - fn set_current_line(&self, line_number: u64) { - self.rcdom.set_current_line(line_number); - } - fn set_current_byte(&self, byte_offset: u64) { self.current_byte.set(byte_offset); } @@ -170,45 +155,73 @@ mod source_positions { #[test] fn element_byte_offsets_match_source_positions() { - // <p> starts at byte 0 - // <div> starts at byte 14 ("<p>hello</p>" = 12 chars + 2 for "</p>") - // <p>hello</p> = 12 bytes, </p> = 4 bytes → <div> at 16? Let's be precise: - // "<p>hello</p><div>world</div>" - // 0123456789012345678901234567 - // <p> = 0, </p> = 8, <div> = 12 let result = parse("<p>hello</p><div>world</div>"); let elems = result.content_elements(); - assert_eq!(elems.len(), 2, "expected p and div, got: {:?}", elems); - assert_eq!(elems[0], ("p".to_string(), 0)); - assert_eq!(elems[1], ("div".to_string(), 12)); + assert_eq!( + elems.len(), + 5, + "expected html, head, body, p and div, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("p".to_string(), 0)); + assert_eq!(elems[4], ("div".to_string(), 12)); } #[test] fn nested_element_byte_offset() { - // "<div><span>x</span></div>" - // 01234567890123456789... 
- // <div> = 0, <span> = 5 let result = parse("<div><span>x</span></div>"); let elems = result.content_elements(); - assert_eq!(elems.len(), 2, "expected div and span, got: {:?}", elems); - assert_eq!(elems[0], ("div".to_string(), 0)); - assert_eq!(elems[1], ("span".to_string(), 5)); + assert_eq!( + elems.len(), + 5, + "expected html, head, body, div and span, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("div".to_string(), 0)); + assert_eq!(elems[4], ("span".to_string(), 5)); + } + + #[test] + fn explicit_html_head_body_offsets() { + let result = parse("<html><head></head><body><p>hi</p></body></html>"); + let elems = result.content_elements(); + + assert_eq!( + elems.len(), + 4, + "expected html, head, body, p, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 6)); + assert_eq!(elems[2], ("body".to_string(), 19)); + assert_eq!(elems[3], ("p".to_string(), 25)); } #[test] + /// <span> must start at byte 12, not 11 (its character offset), because é occupies 2 bytes in UTF-8. 
fn multibyte_content_does_not_shift_subsequent_offsets() { - // "<p>café</p><span>next</span>" - // 'é' = 2 bytes, so: - // <p> = byte 0 - // </p> = byte 3+5 = byte 8 ("café" = c(1)+a(1)+f(1)+é(2) = 5 bytes) - // <span> = byte 8 + 4 = byte 12 ("</p>" = 4 bytes) let result = parse("<p>café</p><span>next</span>"); let elems = result.content_elements(); - assert_eq!(elems.len(), 2, "expected p and span, got: {:?}", elems); - assert_eq!(elems[0], ("p".to_string(), 0)); - assert_eq!(elems[1], ("span".to_string(), 12)); + assert_eq!( + elems.len(), + 5, + "expected html, head, body, p and span, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("p".to_string(), 0)); + assert_eq!(elems[4], ("span".to_string(), 12)); } } diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs index c49aaf69..ae3a5d4a 100644 --- a/rcdom/tests/xhtml-self-closing-integration.rs +++ b/rcdom/tests/xhtml-self-closing-integration.rs @@ -1,19 +1,9 @@ -// Copyright 2014-2026 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - //! Integration tests for the `xhtml-self-closing` feature. //! //! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT //! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`, //! html5ever treats these as opening tags and enters raw-text mode, consuming -//! the rest of the document. These tests verify the feature makes parsing -//! behave as XHTML authors expect. +//! 
the rest of the document. #[cfg(feature = "xhtml-self-closing")] mod xhtml_self_closing { @@ -69,9 +59,6 @@ mod xhtml_self_closing { #[test] fn self_closing_title_does_not_swallow_body() { - // Without the feature <title/> opens a RCDATA region that swallows - // everything up to the next . With it, is empty and - // the body parses normally. let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>"); let names = element_names(&dom.document); @@ -109,8 +96,6 @@ mod xhtml_self_closing { #[test] fn normal_closed_title_still_captures_rcdata_text() { - // A properly-closed <title>… must still capture its RCDATA - // content — the feature must not break normal title parsing. let dom = parse("My Book"); let text = text_of(&dom, "title"); assert_eq!(