diff --git a/Cargo.toml b/Cargo.toml index 4714cdbc..141c9124 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,3 +42,9 @@ libtest-mimic = "0.8.1" rand = "0.9" serde_json = "1.0" typed-arena = "2.0.2" + +# Redirect crates.io tendril/web_atoms to the local path crates so that +# markup5ever gets the same crate instance as rcdom and xml5ever. +[patch.crates-io] +tendril = { path = "tendril" } +web_atoms = { path = "web_atoms" } diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 3584e456..23b752b0 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -5,25 +5,31 @@ documentation = "https://docs.rs/html5ever" categories = [ "parser-implementations", "web-programming" ] keywords = ["html", "html5", "parser", "parsing"] readme = "../README.md" -version.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -edition.workspace = true -rust-version.workspace = true +version = "0.39.0" +license = "MIT OR Apache-2.0" +authors = [ "The html5ever Project Developers" ] +repository = "https://github.com/servo/html5ever" +edition = "2021" +rust-version = "1.71.0" [features] trace_tokenizer = [] serde = ["markup5ever/serde"] +# Surfaces byte-accurate source positions; see markup5ever for full description. +source-positions = ["markup5ever/source-positions"] +# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on +# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>` +# as opening an RCDATA region that swallows the rest of the document. 
+xhtml-self-closing = [] [dependencies] -markup5ever = { workspace = true } -memchr = { workspace = true } -log = { workspace = true } +markup5ever = { version = "0.39", path = "../markup5ever" } +memchr = "2.8.0" +log = "0.4" [dev-dependencies] -criterion = { workspace = true } -typed-arena = { workspace = true } +criterion = "0.8" +typed-arena = "2.0.2" [[bench]] name = "html5ever" diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs index e119477d..c8da81c8 100644 --- a/html5ever/src/tokenizer/char_ref/mod.rs +++ b/html5ever/src/tokenizer/char_ref/mod.rs @@ -212,7 +212,11 @@ impl CharRefTokenizer { unconsume.push_char(c) } + #[cfg(feature = "source-positions")] + let unconsume_len = unconsume.len() as u64; input.push_front(unconsume); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(unconsume_len); tokenizer.emit_error(Borrowed("Numeric character reference without digits")); Status::Done(CharRef::EMPTY) } @@ -292,7 +296,12 @@ impl CharRefTokenizer { } fn unconsume_name(&mut self, input: &BufferQueue) { - input.push_front(self.name_buf_opt.take().unwrap()); + let name_buf = self.name_buf_opt.take().unwrap(); + #[cfg(feature = "source-positions")] + let name_buf_len = name_buf.len() as u64; + input.push_front(name_buf); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(name_buf_len); } fn finish_named<Sink: TokenSink>( @@ -367,7 +376,12 @@ impl CharRefTokenizer { self.unconsume_name(input); Status::Done(CharRef::EMPTY) } else { - input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); + let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]); + #[cfg(feature = "source-positions")] + let unconsumed_len = unconsumed.len() as u64; + input.push_front(unconsumed); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(unconsumed_len); tokenizer.ignore_lf.set(false); Status::Done(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], 
@@ -419,6 +433,8 @@ impl CharRefTokenizer { }, State::Octothorpe => { input.push_front(StrTendril::from_slice("#")); + #[cfg(feature = "source-positions")] + input.retreat_bytes_consumed(1); tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); Status::Done(CharRef::EMPTY) }, diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index b1436a71..97437809 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -130,6 +130,13 @@ pub trait TokenSink { /// Signal that tokenization reached the end of the document. fn end(&self) {} + /// Called just before each token is dispatched to [`process_token`], + /// with the number of UTF-8 bytes consumed from the input so far. + /// + /// The default implementation is a no-op. + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, _byte_offset: u64) {} + /// Used in the [markup declaration open state]. By default, this always /// returns false and thus all CDATA sections are tokenized as bogus /// comments. diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index ba9a095c..8b0d473a 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -181,6 +181,30 @@ pub struct Tokenizer<Sink> { /// Track current line current_line: Cell<u64>, + + /// Number of UTF-8 bytes consumed from the input so far. + /// + /// Kept in sync with `BufferQueue::bytes_consumed` after every character + /// is consumed. + #[cfg(feature = "source-positions")] + current_byte: Cell<u64>, + + /// Byte offset of the first character of the current token. + /// + /// For tag, comment, and doctype tokens this is the byte of the `<` that + /// opened them, captured whenever `<` is consumed in `get_preprocessed_char`. + /// + /// For character tokens it is the byte right after the end of the previous token, + /// which equals the first byte of the text content, this is tracked via `last_token_end_byte`. 
+ #[cfg(feature = "source-positions")] + token_start_byte: Cell<u64>, + + /// Byte offset one past the end of the most recently emitted token. + /// + /// Updated at the end of each `process_token` call. Used as the start + /// byte for the next character token. + #[cfg(feature = "source-positions")] + last_token_end_byte: Cell<u64>, } impl<Sink: TokenSink> Tokenizer<Sink> { @@ -216,6 +240,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> { state_profile: RefCell::new(BTreeMap::new()), time_in_sink: Cell::new(0), current_line: Cell::new(1), + #[cfg(feature = "source-positions")] + current_byte: Cell::new(0), + #[cfg(feature = "source-positions")] + token_start_byte: Cell::new(0), + #[cfg(feature = "source-positions")] + last_token_end_byte: Cell::new(0), } } @@ -243,13 +273,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> { } fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> { - if self.opts.profile { + #[cfg(feature = "source-positions")] + { + let byte = match &token { + Token::TagToken(_) | Token::CommentToken(_) | Token::DoctypeToken(_) => { + self.token_start_byte.get() + }, + Token::CharacterTokens(_) => self.last_token_end_byte.get(), + _ => self.current_byte.get(), + }; + self.sink.set_current_byte(byte); + } + let result = if self.opts.profile { let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get())); self.time_in_sink.set(self.time_in_sink.get() + dt); ret } else { self.sink.process_token(token, self.current_line.get()) - } + }; + #[cfg(feature = "source-positions")] + self.last_token_end_byte.set(self.current_byte.get()); + result } fn process_token_and_continue(&self, token: Token) { @@ -292,6 +336,17 @@ impl<Sink: TokenSink> Tokenizer<Sink> { trace!("got character {c}"); self.current_char.set(c); + #[cfg(feature = "source-positions")] + { + let pos = input.bytes_consumed(); + if pos > 0 { + self.current_byte.set(pos); + if c == '<' { + self.token_start_byte + .set(pos.saturating_sub(c.len_utf8() as u64)); + } + } + } 
Some(c) } @@ -325,7 +380,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // NB: We don't set self.current_char for a run of characters not // in the set. It shouldn't matter for the codepaths that use // this. - _ => d, + other => { + #[cfg(feature = "source-positions")] + if other.is_some() { + self.current_byte.set(input.bytes_consumed()); + } + other + }, } } @@ -621,7 +682,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { if self.reconsume.get() { self.reconsume.set(false); } else { + #[cfg(not(feature = "source-positions"))] input.next(); + #[cfg(feature = "source-positions")] + { + let c = input.next(); + if let Some(c) = c { + let pos = input.bytes_consumed(); + self.current_byte.set(pos); + if c == '<' { + self.token_start_byte + .set(pos.saturating_sub(c.len_utf8() as u64)); + } + } + } } } @@ -757,6 +831,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> { // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) }; + #[cfg(feature = "source-positions")] + if let Some(ref r) = result { + let n = match r { + SetResult::NotFromSet(ref t) => t.len() as u64, + SetResult::FromSet(c) => c.len_utf8() as u64, + }; + input.advance_bytes_consumed(n); + self.current_byte.set(input.bytes_consumed()); + if let SetResult::FromSet('<') = r { + self.token_start_byte + .set(input.bytes_consumed() - '<'.len_utf8() as u64); + } + } + if front_buffer.is_empty() { drop(front_buffer); input.pop_front(); @@ -1752,6 +1840,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> { let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut(); let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) { char_ref::Status::Done(char_ref) => { + #[cfg(feature = "source-positions")] + self.current_byte.set(input.bytes_consumed()); self.process_char_ref(char_ref); *char_ref_tokenizer = None; return ProcessResult::Continue; @@ -2379,3 +2469,236 @@ mod test { assert_eq!(results, 
expected); } } + +#[cfg(all(test, feature = "source-positions"))] +mod test_source_positions { + use crate::tendril::StrTendril; + + use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, TagToken}; + use super::interface::{EndTag, StartTag, Tag, Token}; + use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; + + use crate::LocalName; + use markup5ever::buffer_queue::BufferQueue; + use std::cell::RefCell; + + /// Records (token, byte_offset) pairs via `set_current_byte`. + struct BytesMatch { + current_byte: std::cell::Cell<u64>, + text_start_byte: std::cell::Cell<Option<u64>>, + current_str: RefCell<StrTendril>, + entries: RefCell<Vec<(Token, u64)>>, + } + + impl BytesMatch { + fn new() -> Self { + BytesMatch { + current_byte: std::cell::Cell::new(0), + text_start_byte: std::cell::Cell::new(None), + current_str: RefCell::new(StrTendril::new()), + entries: RefCell::new(vec![]), + } + } + + /// Emit the accumulated character run using the byte of its first chunk. + fn flush_chars(&self) { + let s = self.current_str.take(); + if !s.is_empty() { + let byte = self.text_start_byte.get().unwrap_or(0); + self.text_start_byte.set(None); + self.entries.borrow_mut().push((CharacterTokens(s), byte)); + } + } + } + + /// Records every token without coalescing adjacent character chunks. 
+ struct RawBytesMatch { + current_byte: std::cell::Cell<u64>, + entries: RefCell<Vec<(Token, u64)>>, + } + + impl RawBytesMatch { + fn new() -> Self { + RawBytesMatch { + current_byte: std::cell::Cell::new(0), + entries: RefCell::new(vec![]), + } + } + } + + impl TokenSink for RawBytesMatch { + type Handle = (); + + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> { + if !matches!(token, EOFToken) { + self.entries + .borrow_mut() + .push((token, self.current_byte.get())); + } + TokenSinkResult::Continue + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + impl TokenSink for BytesMatch { + type Handle = (); + + fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> { + let byte = self.current_byte.get(); + match token { + CharacterTokens(b) => { + if self.text_start_byte.get().is_none() { + self.text_start_byte.set(Some(byte)); + } + self.current_str.borrow_mut().push_slice(&b); + }, + NullCharacterToken => { + self.current_str.borrow_mut().push_char('\0'); + }, + EOFToken => { + self.flush_chars(); + }, + TagToken(t) => { + self.flush_chars(); + self.entries.borrow_mut().push((TagToken(t), byte)); + }, + other => { + self.flush_chars(); + self.entries.borrow_mut().push((other, byte)); + }, + } + TokenSinkResult::Continue + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + fn tokenize_bytes(input: &str) -> Vec<(Token, u64)> { + let sink = BytesMatch::new(); + let tok = Tokenizer::new( + sink, + TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }, + ); + let buf = BufferQueue::default(); + buf.push_back(StrTendril::from(input)); + let _ = tok.feed(&buf); + tok.end(); + tok.sink.entries.take() + } + + fn tokenize_raw_bytes(input: &str) -> Vec<(Token, u64)> { + let sink = RawBytesMatch::new(); + let tok = 
Tokenizer::new( + sink, + TokenizerOpts { + exact_errors: false, + discard_bom: true, + profile: false, + initial_state: None, + last_start_tag_name: None, + }, + ); + let buf = BufferQueue::default(); + buf.push_back(StrTendril::from(input)); + let _ = tok.feed(&buf); + tok.end(); + tok.sink.entries.take() + } + + fn start(name: &str) -> Token { + TagToken(Tag { + kind: StartTag, + name: LocalName::from(name), + self_closing: false, + attrs: vec![], + had_duplicate_attributes: false, + }) + } + + fn end(name: &str) -> Token { + TagToken(Tag { + kind: EndTag, + name: LocalName::from(name), + self_closing: false, + attrs: vec![], + had_duplicate_attributes: false, + }) + } + + fn chars(s: &str) -> Token { + CharacterTokens(StrTendril::from(s)) + } + + #[test] + fn check_byte_offsets_simple_tags() { + let entries = tokenize_bytes("<a><b></b></a>"); + assert_eq!( + entries, + vec![ + (start("a"), 0), + (start("b"), 3), + (end("b"), 6), + (end("a"), 10), + ] + ); + } + + #[test] + fn check_byte_offsets_text_content() { + let entries = tokenize_bytes("<p>hello</p>"); + assert_eq!( + entries, + vec![(start("p"), 0), (chars("hello"), 3), (end("p"), 8),] + ); + } + + #[test] + fn check_byte_offsets_multibyte_text() { + let entries = tokenize_bytes("<p>é</p>"); + assert_eq!( + entries, + vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),] + ); + } + #[test] + fn check_byte_offsets_sequential_siblings() { + let entries = tokenize_bytes("<h1>X</h1><p>Y</p>"); + assert_eq!( + entries, + vec![ + (start("h1"), 0), + (chars("X"), 4), + (end("h1"), 5), + (start("p"), 10), + (chars("Y"), 13), + (end("p"), 14), + ] + ); + } + + #[test] + fn check_byte_offsets_entity_text_chunks() { + let entries = tokenize_raw_bytes("<p>a&amp;b</p>"); + assert_eq!( + entries, + vec![ + (start("p"), 0), + (chars("a"), 3), + (chars("&"), 4), + (chars("b"), 9), + (end("p"), 10), + ] + ); + } +} diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 3fcfaec3..bf28847a
100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -474,6 +474,11 @@ where { type Handle = Handle; + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, byte_offset: u64) { + self.sink.set_current_byte(byte_offset); + } + fn process_token(&self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult<Handle> { if line_number != self.current_line.get() { self.sink.set_current_line(line_number); @@ -673,8 +678,19 @@ where ProcessResult::ToRawData(k) } - // The generic raw text / RCDATA parsing algorithm. + /// The generic raw text / RCDATA parsing algorithm. + /// Inserts an RCDATA/RAWTEXT element and switches the tokenizer to raw-text mode. + /// + /// With the `xhtml-self-closing` feature, self-closing tags (`<title/>`, `<style/>`, …) + /// are treated as empty elements instead of entering raw-text mode, which would + /// swallow all the content that comes after them. fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> { + #[cfg(feature = "xhtml-self-closing")] + if tag.self_closing { + self.insert_and_pop_element_for(tag); + return ProcessResult::DoneAckSelfClosing; + } + self.insert_element_for(tag); self.to_raw_text_mode(k) } diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml index 764f9a0a..79024927 100644 --- a/markup5ever/Cargo.toml +++ b/markup5ever/Cargo.toml @@ -3,20 +3,25 @@ name = "markup5ever" description = "Common code for xml5ever and html5ever" documentation = "https://docs.rs/markup5ever" categories = [ "parser-implementations", "web-programming" ] -version.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -edition.workspace = true -rust-version.workspace = true +version = "0.39.0" +license = "MIT OR Apache-2.0" +authors = [ "The html5ever Project Developers" ] +repository = "https://github.com/servo/html5ever" +edition = "2021" +rust-version = "1.71.0" [lib] path = "lib.rs" [features] serde = ["web_atoms/serde"] +# Surfaces
byte-accurate source positions through the `TreeSink` interface. +# When enabled, `TreeSink::set_current_byte` is called before each tree +# mutation with the UTF-8 byte offset of the current token in the input. +# Use this to assign stable, parser-independent offsets to DOM nodes. +source-positions = [] [dependencies] -web_atoms = { workspace = true } -tendril = { workspace = true } -log = { workspace = true } +web_atoms = "0.2.4" +tendril = "0.5" +log = "0.4" diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs index e1683de0..44803457 100644 --- a/markup5ever/interface/tree_builder.rs +++ b/markup5ever/interface/tree_builder.rs @@ -269,6 +269,18 @@ pub trait TreeSink { /// Called whenever the line number changes. fn set_current_line(&self, _line_number: u64) {} + /// Called before a tree-builder callback with the source byte offset for the + /// token or text segment that triggered it. + /// + /// For start tags, end tags, comments, and doctypes this is the UTF-8 byte + /// offset of the token's first byte in the original input. For character + /// tokens this is the UTF-8 byte offset of the first byte in the current text + /// segment. + /// + /// The default implementation is a no-op. + #[cfg(feature = "source-positions")] + fn set_current_byte(&self, _byte_offset: u64) {} + fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool { true } diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index d5e6864f..4099ff60 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -18,6 +18,8 @@ //! //! [`BufferQueue`]: struct.BufferQueue.html +#[cfg(feature = "source-positions")] +use std::cell::Cell; use std::{ cell::{RefCell, RefMut}, collections::VecDeque, @@ -51,6 +53,12 @@ pub enum SetResult { pub struct BufferQueue { /// Buffers to process. 
buffers: RefCell<VecDeque<StrTendril>>, + /// Total number of UTF-8 bytes consumed from this queue so far. + /// + /// Used by the tokenizer to surface byte-accurate source offsets via + /// [`TokenSink::set_current_byte`] and [`TreeSink::set_current_byte`]. + #[cfg(feature = "source-positions")] + bytes_consumed: Cell<u64>, } impl Default for BufferQueue { @@ -59,6 +67,8 @@ impl Default for BufferQueue { fn default() -> Self { Self { buffers: RefCell::new(VecDeque::with_capacity(16)), + #[cfg(feature = "source-positions")] + bytes_consumed: Cell::new(0), } } } @@ -70,6 +80,39 @@ impl BufferQueue { self.buffers.borrow().is_empty() } + /// Returns the total number of UTF-8 bytes consumed from this queue. + /// + /// The value monotonically increases as characters are consumed via + /// [`next`], [`pop_except_from`], and [`eat`]. Re-queuing bytes via + /// [`push_front`] does **not** decrement the counter. + /// + /// To reduce bytes_consumed, use [`retreat_bytes_consumed`]. + #[cfg(feature = "source-positions")] + #[inline] + pub fn bytes_consumed(&self) -> u64 { + self.bytes_consumed.get() + } + + /// Advance the bytes-consumed counter by `n`. + /// + /// Use this to manually advance the counter when bypassing: [`next`], [`pop_except_from`], and [`eat`] + #[cfg(feature = "source-positions")] + #[inline] + pub fn advance_bytes_consumed(&self, n: u64) { + self.bytes_consumed.set(self.bytes_consumed.get() + n); + } + + /// Retreat the bytes-consumed counter by `n`. + /// + /// Used by tokenizer lookahead paths that consume raw bytes, then push unmatched + /// suffix bytes back onto the queue. + #[cfg(feature = "source-positions")] + #[inline] + pub fn retreat_bytes_consumed(&self, n: u64) { + self.bytes_consumed + .set(self.bytes_consumed.get().saturating_sub(n)); + } + /// Get the buffer at the beginning of the queue. 
#[inline] pub fn pop_front(&self) -> Option<StrTendril> { @@ -146,9 +189,15 @@ impl BufferQueue { out = buf.unsafe_subtendril(0, n); buf.unsafe_pop_front(n); } + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + out.len() as u64); (Some(NotFromSet(out)), buf.is_empty()) } else { let c = buf.pop_front_char().expect("empty buffer in queue"); + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + c.len_utf8() as u64); (Some(FromSet(c)), buf.is_empty()) } }, @@ -218,6 +267,10 @@ impl BufferQueue { Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), } + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + pat.len() as u64); + Some(true) } @@ -229,6 +282,9 @@ impl BufferQueue { None => (None, false), Some(buf) => { let c = buf.pop_front_char().expect("empty buffer in queue"); + #[cfg(feature = "source-positions")] + self.bytes_consumed + .set(self.bytes_consumed.get() + c.len_utf8() as u64); (Some(c), buf.is_empty()) }, }; @@ -331,3 +387,147 @@ mod test { assert_eq!(bq.next(), None); } } + +#[cfg(all(test, feature = "source-positions"))] +mod test_source_positions { + use tendril::SliceExt; + + use super::BufferQueue; + use super::SetResult::{FromSet, NotFromSet}; + + #[test] + fn next_advances_counter_by_utf8_width_single() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); + + bq.push_back("abc".to_tendril()); + bq.next(); + assert_eq!(bq.bytes_consumed(), 1); + bq.next(); + assert_eq!(bq.bytes_consumed(), 2); + bq.next(); + assert_eq!(bq.bytes_consumed(), 3); + } + + #[test] + fn next_advances_counter_by_utf8_width_double() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); + + bq.push_back("é".to_tendril()); + bq.next(); + assert_eq!(bq.bytes_consumed(), 2); + } + + #[test] + fn pop_except_from_not_from_set_advances_counter() { + let bq = BufferQueue::default(); + 
bq.push_back("abc&".to_tendril()); + let set = small_char_set!('&'); + + assert_eq!( + bq.pop_except_from(set), + Some(NotFromSet("abc".to_tendril())) + ); + assert_eq!(bq.bytes_consumed(), 3); + } + + #[test] + fn pop_except_from_from_set_advances_counter() { + let bq = BufferQueue::default(); + bq.push_back("&def".to_tendril()); + let set = small_char_set!('&'); + + assert_eq!(bq.pop_except_from(set), Some(FromSet('&'))); + assert_eq!(bq.bytes_consumed(), 1); + } + + #[test] + fn pop_except_from_successive_calls_accumulate_counter() { + let bq = BufferQueue::default(); + bq.push_back("abc&def".to_tendril()); + let set = small_char_set!('&'); + + bq.pop_except_from(set); + assert_eq!(bq.bytes_consumed(), 3); + + bq.pop_except_from(set); + assert_eq!(bq.bytes_consumed(), 4); + + bq.pop_except_from(set); + assert_eq!(bq.bytes_consumed(), 7); + } + + #[test] + fn pop_except_from_multibyte_bulk_advances_by_byte_len() { + let bq = BufferQueue::default(); + bq.push_back("café&".to_tendril()); + let set = small_char_set!('&'); + + let result = bq.pop_except_from(set); + assert!(matches!(result, Some(NotFromSet(_)))); + assert_eq!(bq.bytes_consumed(), 5); + } + + #[test] + fn eat_advances_counter_accordingly() { + let bq = BufferQueue::default(); + bq.push_back("abcdef".to_tendril()); + + assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); + assert_eq!(bq.bytes_consumed(), 0); + + assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.bytes_consumed(), 3); + + assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true)); + assert_eq!(bq.bytes_consumed(), 6); + } + + #[test] + /// This test is to ensure the behaviour contract of push_front is kept. + /// There are use cases where pushing front should technically not retreat the + /// bytes counter, so it's up to the caller to decide if pushing front should retreat. 
+ fn push_front_does_not_decrement_counter() { + let bq = BufferQueue::default(); + bq.push_back("abc".to_tendril()); + bq.next(); + bq.next(); + assert_eq!(bq.bytes_consumed(), 2); + + bq.push_front("xy".to_tendril()); + assert_eq!(bq.bytes_consumed(), 2); + + bq.next(); + bq.next(); + assert_eq!(bq.bytes_consumed(), 4); + } + + #[test] + fn advance_bytes_consumed_adds_exactly() { + let bq = BufferQueue::default(); + assert_eq!(bq.bytes_consumed(), 0); + + bq.advance_bytes_consumed(7); + assert_eq!(bq.bytes_consumed(), 7); + + bq.advance_bytes_consumed(3); + assert_eq!(bq.bytes_consumed(), 10); + } + + #[test] + fn retreat_bytes_consumed_subtracts_exactly() { + let bq = BufferQueue::default(); + bq.advance_bytes_consumed(10); + assert_eq!(bq.bytes_consumed(), 10); + + bq.retreat_bytes_consumed(3); + assert_eq!(bq.bytes_consumed(), 7); + + bq.retreat_bytes_consumed(7); + assert_eq!(bq.bytes_consumed(), 0); + + bq.retreat_bytes_consumed(5); + assert_eq!(bq.bytes_consumed(), 0); + } +} diff --git a/rcdom/Cargo.toml b/rcdom/Cargo.toml index caf52b54..30dab873 100644 --- a/rcdom/Cargo.toml +++ b/rcdom/Cargo.toml @@ -20,6 +20,10 @@ markup5ever = { workspace = true, features = ["serde"] } tendril = { workspace = true } xml5ever = { workspace = true } +[features] +source-positions = ["html5ever/source-positions"] +xhtml-self-closing = ["html5ever/xhtml-self-closing"] + [dev-dependencies] criterion = { workspace = true } env_logger = { workspace = true } @@ -45,3 +49,11 @@ harness = false [[test]] name = "xml-tokenizer" harness = false + +[[test]] +name = "source-positions-integration" +required-features = ["source-positions"] + +[[test]] +name = "xhtml-self-closing-integration" +required-features = ["xhtml-self-closing"] diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs new file mode 100644 index 00000000..34ebe25c --- /dev/null +++ b/rcdom/tests/source-positions-integration.rs @@ -0,0 +1,227 @@ +//! 
Integration tests for the `source-positions` feature. +//! +//! Verifies that byte offsets flow correctly from `BufferQueue` through the +//! tokenizer and tree builder all the way into `TreeSink::set_current_byte`, +//! and that the offsets correspond to the actual positions of element opening +//! tags in the source string. +//! +//! 2 Critical behaviours are under test: +//! +//! 1. When no explicit <head>,<html>,<body> tags are part of the payload +//! they get injected implicitly, they should not skew the byte offset. +//! 2. When the above tags are explicitly part of the payload, they should be part +//! of the count. + +#[cfg(feature = "source-positions")] +mod source_positions { + use html5ever::driver; + use html5ever::tendril::stream::TendrilSink; + use html5ever::tendril::StrTendril; + use html5ever::ExpandedName; + use html5ever::QualName; + use markup5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; + use markup5ever::Attribute; + use markup5ever_rcdom::{Handle, RcDom}; + use std::borrow::Cow; + use std::cell::{Cell, RefCell}; + + /// Wraps `RcDom` and records `(local_name, byte_offset)` for every + /// element created. + /// + /// These are then later used for assertions. 
+ struct ByteCapturingDOM { + current_byte: Cell<u64>, + elements: RefCell<Vec<(String, u64)>>, + rcdom: RcDom, + } + + impl ByteCapturingDOM { + fn new() -> Self { + ByteCapturingDOM { + current_byte: Cell::new(0), + elements: RefCell::new(vec![]), + rcdom: RcDom::default(), + } + } + + fn content_elements(&self) -> Vec<(String, u64)> { + self.elements.borrow().clone() + } + } + + impl TreeSink for ByteCapturingDOM { + type Handle = Handle; + type Output = Self; + + type ElemName<'a> = ExpandedName<'a>; + + fn finish(self) -> Self { + self + } + + fn parse_error(&self, msg: Cow<'static, str>) { + self.rcdom.parse_error(msg); + } + + fn get_document(&self) -> Handle { + self.rcdom.get_document() + } + + fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> { + self.rcdom.elem_name(target) + } + + fn create_element( + &self, + name: QualName, + attrs: Vec<Attribute>, + flags: ElementFlags, + ) -> Handle { + self.elements + .borrow_mut() + .push((name.local.to_string(), self.current_byte.get())); + self.rcdom.create_element(name, attrs, flags) + } + + fn create_comment(&self, text: StrTendril) -> Handle { + self.rcdom.create_comment(text) + } + + fn create_pi(&self, target: StrTendril, content: StrTendril) -> Handle { + self.rcdom.create_pi(target, content) + } + + fn append(&self, parent: &Handle, child: NodeOrText<Handle>) { + self.rcdom.append(parent, child) + } + + fn append_based_on_parent_node( + &self, + element: &Handle, + prev_element: &Handle, + child: NodeOrText<Handle>, + ) { + self.rcdom + .append_based_on_parent_node(element, prev_element, child) + } + + fn append_doctype_to_document( + &self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + self.rcdom + .append_doctype_to_document(name, public_id, system_id); + } + + fn get_template_contents(&self, target: &Handle) -> Handle { + self.rcdom.get_template_contents(target) + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + self.rcdom.same_node(x, y) + 
} + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.rcdom.set_quirks_mode(mode) + } + + fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) { + self.rcdom.append_before_sibling(sibling, child) + } + + fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) { + self.rcdom.add_attrs_if_missing(target, attrs); + } + + fn remove_from_parent(&self, target: &Handle) { + self.rcdom.remove_from_parent(target); + } + + fn reparent_children(&self, node: &Handle, new_parent: &Handle) { + self.rcdom.reparent_children(node, new_parent); + } + + fn set_current_byte(&self, byte_offset: u64) { + self.current_byte.set(byte_offset); + } + } + + fn parse(input: &str) -> ByteCapturingDOM { + let sink = ByteCapturingDOM::new(); + driver::parse_document(sink, Default::default()).one(StrTendril::from(input)) + } + + #[test] + fn element_byte_offsets_match_source_positions() { + let result = parse("<p>hello</p><div>world</div>"); + let elems = result.content_elements(); + + assert_eq!( + elems.len(), + 5, + "expected html, head, body, p and div, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("p".to_string(), 0)); + assert_eq!(elems[4], ("div".to_string(), 12)); + } + + #[test] + fn nested_element_byte_offset() { + let result = parse("<div><span>x</span></div>"); + let elems = result.content_elements(); + + assert_eq!( + elems.len(), + 5, + "expected html, head, body, div and span, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("div".to_string(), 0)); + assert_eq!(elems[4], ("span".to_string(), 5)); + } + + #[test] + fn explicit_html_head_body_offsets() { + let result = parse("<html><head></head><body><p>hi</p></body></html>"); + let elems = 
result.content_elements(); + + assert_eq!( + elems.len(), + 4, + "expected html, head, body, p, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 6)); + assert_eq!(elems[2], ("body".to_string(), 19)); + assert_eq!(elems[3], ("p".to_string(), 25)); + } + + #[test] + /// `<span>` must start at byte 12 (not character index 11): `é` is 2 bytes in UTF-8. + fn multibyte_content_does_not_shift_subsequent_offsets() { + let result = parse("<p>café</p><span>next</span>"); + let elems = result.content_elements(); + + assert_eq!( + elems.len(), + 5, + "expected html, head, body, p and span, got: {:?}", + elems + ); + assert_eq!(elems[0], ("html".to_string(), 0)); + assert_eq!(elems[1], ("head".to_string(), 0)); + assert_eq!(elems[2], ("body".to_string(), 0)); + assert_eq!(elems[3], ("p".to_string(), 0)); + assert_eq!(elems[4], ("span".to_string(), 12)); + } +} diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs new file mode 100644 index 00000000..ae3a5d4a --- /dev/null +++ b/rcdom/tests/xhtml-self-closing-integration.rs @@ -0,0 +1,108 @@ +//! Integration tests for the `xhtml-self-closing` feature. +//! +//! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT +//! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`, +//! html5ever treats these as opening tags and enters raw-text mode, consuming +//! the rest of the document. + +#[cfg(feature = "xhtml-self-closing")] +mod xhtml_self_closing { + use html5ever::driver; + use html5ever::tendril::stream::TendrilSink; + use html5ever::tendril::StrTendril; + use markup5ever_rcdom::{NodeData, RcDom}; + + fn parse(input: &str) -> RcDom { + driver::parse_document(RcDom::default(), Default::default()).one(StrTendril::from(input)) + } + + /// Walk the tree and collect all element names. 
+ fn element_names(node: &markup5ever_rcdom::Handle) -> Vec<String> { + let mut names = Vec::new(); + collect_names(node, &mut names); + names + } + + fn collect_names(node: &markup5ever_rcdom::Handle, out: &mut Vec<String>) { + if let NodeData::Element { ref name, .. } = node.data { + out.push(name.local.to_string()); + } + for child in node.children.borrow().iter() { + collect_names(child, out); + } + } + + /// Return the text content of the first element with the given local name. + fn text_of(dom: &RcDom, tag: &str) -> Option<String> { + find_text(&dom.document, tag) + } + + fn find_text(node: &markup5ever_rcdom::Handle, tag: &str) -> Option<String> { + if let NodeData::Element { ref name, .. } = node.data { + if name.local.as_ref() == tag { + let mut text = String::new(); + for child in node.children.borrow().iter() { + if let NodeData::Text { ref contents } = child.data { + text.push_str(&contents.borrow()); + } + } + return Some(text); + } + } + for child in node.children.borrow().iter() { + if let Some(t) = find_text(child, tag) { + return Some(t); + } + } + None + } + + #[test] + fn self_closing_title_does_not_swallow_body() { + let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>"); + let names = element_names(&dom.document); + + assert!( + names.contains(&"body".to_string()), + "body element should be present; got: {:?}", + names + ); + assert!( + names.contains(&"p".to_string()), + "p element inside body should be present; got: {:?}", + names + ); + + let text = text_of(&dom, "p"); + assert_eq!( + text.as_deref(), + Some("visible"), + "<p> text should be 'visible', got: {:?}", + text + ); + } + + #[test] + fn self_closing_style_does_not_swallow_body() { + let dom = parse("<html><head><style/></head><body><p>content</p></body></html>"); + let names = element_names(&dom.document); + + assert!( + names.contains(&"p".to_string()), + "p element should not be swallowed by <style/>; got: {:?}", + names + ); + } + + #[test] + fn 
normal_closed_title_still_captures_rcdata_text() { + let dom = parse("<html><head><title>My Book"); + let text = text_of(&dom, "title"); + assert_eq!( + text.as_deref(), + Some("My Book"), + "title text should be 'My Book', got: {:?}", + text + ); + } +}