diff --git a/Cargo.toml b/Cargo.toml
index 4714cdbc..141c9124 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,3 +42,9 @@ libtest-mimic = "0.8.1"
rand = "0.9"
serde_json = "1.0"
typed-arena = "2.0.2"
+
+# Redirect crates.io tendril/web_atoms to the local path crates so that
+# markup5ever gets the same crate instance as rcdom and xml5ever.
+[patch.crates-io]
+tendril = { path = "tendril" }
+web_atoms = { path = "web_atoms" }
diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
index 3584e456..23b752b0 100644
--- a/html5ever/Cargo.toml
+++ b/html5ever/Cargo.toml
@@ -5,25 +5,31 @@ documentation = "https://docs.rs/html5ever"
categories = [ "parser-implementations", "web-programming" ]
keywords = ["html", "html5", "parser", "parsing"]
readme = "../README.md"
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
[features]
trace_tokenizer = []
serde = ["markup5ever/serde"]
+# Surfaces byte-accurate source positions; see markup5ever for full description.
+source-positions = ["markup5ever/source-positions"]
+# Honour the XML/XHTML self-closing syntax (`<title/>`, `<textarea/>`, …) on
+# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>`
+# as opening a RCDATA region that swallows the rest of the document.
+xhtml-self-closing = []
[dependencies]
-markup5ever = { workspace = true }
-memchr = { workspace = true }
-log = { workspace = true }
+markup5ever = { version = "0.39", path = "../markup5ever" }
+memchr = "2.8.0"
+log = "0.4"
[dev-dependencies]
-criterion = { workspace = true }
-typed-arena = { workspace = true }
+criterion = "0.8"
+typed-arena = "2.0.2"
[[bench]]
name = "html5ever"
diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs
index e119477d..c8da81c8 100644
--- a/html5ever/src/tokenizer/char_ref/mod.rs
+++ b/html5ever/src/tokenizer/char_ref/mod.rs
@@ -212,7 +212,11 @@ impl CharRefTokenizer {
unconsume.push_char(c)
}
+ #[cfg(feature = "source-positions")]
+ let unconsume_len = unconsume.len() as u64;
input.push_front(unconsume);
+ #[cfg(feature = "source-positions")]
+ input.retreat_bytes_consumed(unconsume_len);
tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
Status::Done(CharRef::EMPTY)
}
@@ -292,7 +296,12 @@ impl CharRefTokenizer {
}
fn unconsume_name(&mut self, input: &BufferQueue) {
- input.push_front(self.name_buf_opt.take().unwrap());
+ let name_buf = self.name_buf_opt.take().unwrap();
+ #[cfg(feature = "source-positions")]
+ let name_buf_len = name_buf.len() as u64;
+ input.push_front(name_buf);
+ #[cfg(feature = "source-positions")]
+ input.retreat_bytes_consumed(name_buf_len);
}
fn finish_named(
@@ -367,7 +376,12 @@ impl CharRefTokenizer {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
} else {
- input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+ let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]);
+ #[cfg(feature = "source-positions")]
+ let unconsumed_len = unconsumed.len() as u64;
+ input.push_front(unconsumed);
+ #[cfg(feature = "source-positions")]
+ input.retreat_bytes_consumed(unconsumed_len);
tokenizer.ignore_lf.set(false);
Status::Done(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
@@ -419,6 +433,8 @@ impl CharRefTokenizer {
},
State::Octothorpe => {
input.push_front(StrTendril::from_slice("#"));
+ #[cfg(feature = "source-positions")]
+ input.retreat_bytes_consumed(1);
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
Status::Done(CharRef::EMPTY)
},
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index b1436a71..97437809 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -130,6 +130,13 @@ pub trait TokenSink {
/// Signal that tokenization reached the end of the document.
fn end(&self) {}
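+ /// Called just before each token is dispatched to [`TokenSink::process_token`].
+ /// `_byte_offset` is the UTF-8 byte offset at which the token starts (tag, comment,
+ /// doctype and character tokens) or the bytes consumed so far (all other tokens).
+ /// The default implementation is a no-op.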
+ #[cfg(feature = "source-positions")]
+ fn set_current_byte(&self, _byte_offset: u64) {}
+
/// Used in the [markup declaration open state]. By default, this always
/// returns false and thus all CDATA sections are tokenized as bogus
/// comments.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index ba9a095c..8b0d473a 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -181,6 +181,30 @@ pub struct Tokenizer {
/// Track current line
current_line: Cell,
+
+ /// Number of UTF-8 bytes consumed from the input so far.
+ ///
+ /// Kept in sync with `BufferQueue::bytes_consumed` after every character
+ /// is consumed.
+ #[cfg(feature = "source-positions")]
+ current_byte: Cell,
+
+ /// Byte offset of the first character of the current markup token.
+ ///
+ /// For tag, comment, and doctype tokens this is the byte of the `<` that
+ /// opened them, captured whenever a `<` is consumed from the input.
+ ///
+ /// Character tokens do not use this field: their start is the byte right after
+ /// the end of the previous token, which is tracked in `last_token_end_byte`.
+ #[cfg(feature = "source-positions")]
+ token_start_byte: Cell,
+
+ /// Byte offset one past the end of the most recently emitted token.
+ ///
+ /// Updated at the end of each `process_token` call. Used as the start
+ /// byte for the next character token.
+ #[cfg(feature = "source-positions")]
+ last_token_end_byte: Cell,
}
impl Tokenizer {
@@ -216,6 +240,12 @@ impl Tokenizer {
state_profile: RefCell::new(BTreeMap::new()),
time_in_sink: Cell::new(0),
current_line: Cell::new(1),
+ #[cfg(feature = "source-positions")]
+ current_byte: Cell::new(0),
+ #[cfg(feature = "source-positions")]
+ token_start_byte: Cell::new(0),
+ #[cfg(feature = "source-positions")]
+ last_token_end_byte: Cell::new(0),
}
}
@@ -243,13 +273,27 @@ impl Tokenizer {
}
fn process_token(&self, token: Token) -> TokenSinkResult {
- if self.opts.profile {
+ #[cfg(feature = "source-positions")]
+ {
+ let byte = match &token {
+ Token::TagToken(_) | Token::CommentToken(_) | Token::DoctypeToken(_) => {
+ self.token_start_byte.get()
+ },
+ Token::CharacterTokens(_) => self.last_token_end_byte.get(),
+ _ => self.current_byte.get(),
+ };
+ self.sink.set_current_byte(byte);
+ }
+ let result = if self.opts.profile {
let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
self.time_in_sink.set(self.time_in_sink.get() + dt);
ret
} else {
self.sink.process_token(token, self.current_line.get())
- }
+ };
+ #[cfg(feature = "source-positions")]
+ self.last_token_end_byte.set(self.current_byte.get());
+ result
}
fn process_token_and_continue(&self, token: Token) {
@@ -292,6 +336,17 @@ impl Tokenizer {
trace!("got character {c}");
self.current_char.set(c);
+ #[cfg(feature = "source-positions")]
+ {
+ let pos = input.bytes_consumed();
+ if pos > 0 {
+ self.current_byte.set(pos);
+ if c == '<' {
+ self.token_start_byte
+ .set(pos.saturating_sub(c.len_utf8() as u64));
+ }
+ }
+ }
Some(c)
}
@@ -325,7 +380,13 @@ impl Tokenizer {
// NB: We don't set self.current_char for a run of characters not
// in the set. It shouldn't matter for the codepaths that use
// this.
- _ => d,
+ other => {
+ #[cfg(feature = "source-positions")]
+ if other.is_some() {
+ self.current_byte.set(input.bytes_consumed());
+ }
+ other
+ },
}
}
@@ -621,7 +682,20 @@ impl Tokenizer {
if self.reconsume.get() {
self.reconsume.set(false);
} else {
+ #[cfg(not(feature = "source-positions"))]
input.next();
+ #[cfg(feature = "source-positions")]
+ {
+ let c = input.next();
+ if let Some(c) = c {
+ let pos = input.bytes_consumed();
+ self.current_byte.set(pos);
+ if c == '<' {
+ self.token_start_byte
+ .set(pos.saturating_sub(c.len_utf8() as u64));
+ }
+ }
+ }
}
}
@@ -757,6 +831,20 @@ impl Tokenizer {
// This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
+ #[cfg(feature = "source-positions")]
+ if let Some(ref r) = result {
+ let n = match r {
+ SetResult::NotFromSet(ref t) => t.len() as u64,
+ SetResult::FromSet(c) => c.len_utf8() as u64,
+ };
+ input.advance_bytes_consumed(n);
+ self.current_byte.set(input.bytes_consumed());
+ if let SetResult::FromSet('<') = r {
+ self.token_start_byte
+ .set(input.bytes_consumed() - '<'.len_utf8() as u64);
+ }
+ }
+
if front_buffer.is_empty() {
drop(front_buffer);
input.pop_front();
@@ -1752,6 +1840,8 @@ impl Tokenizer {
let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
char_ref::Status::Done(char_ref) => {
+ #[cfg(feature = "source-positions")]
+ self.current_byte.set(input.bytes_consumed());
self.process_char_ref(char_ref);
*char_ref_tokenizer = None;
return ProcessResult::Continue;
@@ -2379,3 +2469,236 @@ mod test {
assert_eq!(results, expected);
}
}
+
+#[cfg(all(test, feature = "source-positions"))]
+mod test_source_positions {
+ use crate::tendril::StrTendril;
+
+ use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, TagToken};
+ use super::interface::{EndTag, StartTag, Tag, Token};
+ use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+
+ use crate::LocalName;
+ use markup5ever::buffer_queue::BufferQueue;
+ use std::cell::RefCell;
+
+ /// Records (token, byte_offset) pairs via `set_current_byte`.
+ struct BytesMatch {
+ current_byte: std::cell::Cell,
+ text_start_byte: std::cell::Cell