servo · OGKevin · Jun 16, 2026 · Jun 18, 2026 · simonwuelker · Jun 30, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -42,3 +42,9 @@ libtest-mimic = "0.8.1"
 rand = "0.9"
 serde_json = "1.0"
 typed-arena = "2.0.2"
+
+# Redirect crates.io tendril/web_atoms to the local path crates so that
+# markup5ever gets the same crate instance as rcdom and xml5ever.
+[patch.crates-io]
+tendril = { path = "tendril" }
+web_atoms = { path = "web_atoms" }
diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
@@ -5,25 +5,31 @@ documentation = "https://docs.rs/html5ever"
 categories = [ "parser-implementations", "web-programming" ]
 keywords = ["html", "html5", "parser", "parsing"]
 readme = "../README.md"
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
 
 [features]
 trace_tokenizer = []
 serde = ["markup5ever/serde"]
+# Surfaces byte-accurate source positions; see markup5ever for the full description.
+source-positions = ["markup5ever/source-positions"]
+# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on
+# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>`
+# as opening an RCDATA region that swallows the rest of the document.
+xhtml-self-closing = []
 
 [dependencies]
-markup5ever = { workspace = true }
-memchr = { workspace = true }
-log = { workspace = true }
+markup5ever = { version = "0.39", path = "../markup5ever" }
+memchr = "2.8.0"
+log = "0.4"
 
 [dev-dependencies]
-criterion = { workspace = true }
-typed-arena = { workspace = true }
+criterion = "0.8"
+typed-arena = "2.0.2"
 
 [[bench]]
 name = "html5ever"

diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs
@@ -212,7 +212,11 @@ impl CharRefTokenizer {
             unconsume.push_char(c)
         }
 
+        #[cfg(feature = "source-positions")]
+        let unconsume_len = unconsume.len() as u64;
         input.push_front(unconsume);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(unconsume_len);
         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
         Status::Done(CharRef::EMPTY)
     }
@@ -292,7 +296,12 @@ impl CharRefTokenizer {
     }
 
     fn unconsume_name(&mut self, input: &BufferQueue) {
-        input.push_front(self.name_buf_opt.take().unwrap());
+        let name_buf = self.name_buf_opt.take().unwrap();
+        #[cfg(feature = "source-positions")]
+        let name_buf_len = name_buf.len() as u64;
+        input.push_front(name_buf);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(name_buf_len);
     }
 
     fn finish_named<Sink: TokenSink>(
@@ -367,7 +376,12 @@ impl CharRefTokenizer {
                     self.unconsume_name(input);
                     Status::Done(CharRef::EMPTY)
                 } else {
-                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+                    let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]);
+                    #[cfg(feature = "source-positions")]
+                    let unconsumed_len = unconsumed.len() as u64;
+                    input.push_front(unconsumed);
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(unconsumed_len);
                     tokenizer.ignore_lf.set(false);
                     Status::Done(CharRef {
                         chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
@@ -419,6 +433,8 @@ impl CharRefTokenizer {
                 },
                 State::Octothorpe => {
                     input.push_front(StrTendril::from_slice("#"));
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(1);
                     tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                     Status::Done(CharRef::EMPTY)
                 },

diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
@@ -130,6 +130,13 @@ pub trait TokenSink {
     /// Signal that tokenization reached the end of the document.
     fn end(&self) {}
 
+    /// Called just before each token is dispatched to [`TokenSink::process_token`],
+    /// with the number of UTF-8 bytes consumed from the input so far.
+    ///
+    /// The default implementation is a no-op.
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, _byte_offset: u64) {}
+
     /// Used in the [markup declaration open state]. By default, this always
     /// returns false and thus all CDATA sections are tokenized as bogus
     /// comments.