From 61b410c5bc3ecd1157493e36dcff1ed3e34b540d Mon Sep 17 00:00:00 2001
From: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com>
Date: Tue, 16 Jun 2026 06:45:40 +0200
Subject: [PATCH 1/2] feat: add byte tracking

This commit introduces a new `source-positions` feature flag that
enables tracking of byte offsets.

When the source-positions feature is enabled, the tokenizer tracks
the number of UTF-8 bytes consumed from the input so far. This is done
by giving BufferQueue a `bytes_consumed` field, which is advanced every
time characters are consumed and rewound when lookahead bytes are
pushed back onto the queue.

The Cargo.toml changes are needed so that this project can be loaded as
a git submodule by the Cadmus project.

The xhtml-self-closing feature is needed because EPUBs use XHTML-style
self-closing syntax (e.g. `<title/>`) on RCDATA/RAWTEXT elements.

Change-Id: 566446e2bca101b7fefdca639c1b4d26
Change-Id-Short: uttvvtlxonpy
---
 Cargo.toml                                    |   8 +
 html5ever/Cargo.toml                          |  30 +-
 html5ever/src/tokenizer/char_ref/mod.rs       |  20 +-
 html5ever/src/tokenizer/interface.rs          |   8 +
 html5ever/src/tokenizer/mod.rs                | 361 +++++++++++++++++-
 html5ever/src/tree_builder/mod.rs             |  21 +-
 markup5ever/Cargo.toml                        |  23 +-
 markup5ever/interface/tree_builder.rs         |  12 +
 markup5ever/util/buffer_queue.rs              | 178 +++++++++
 rcdom/Cargo.toml                              |  12 +
 rcdom/tests/source-positions-integration.rs   | 214 +++++++++++
 rcdom/tests/xhtml-self-closing-integration.rs | 123 ++++++
 12 files changed, 984 insertions(+), 26 deletions(-)
 create mode 100644 rcdom/tests/source-positions-integration.rs
 create mode 100644 rcdom/tests/xhtml-self-closing-integration.rs
diff --git a/Cargo.toml b/Cargo.toml
index 4714cdbc..f2a87026 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,3 +42,11 @@ libtest-mimic = "0.8.1"
 rand = "0.9"
 serde_json = "1.0"
 typed-arena = "2.0.2"
+
+# Redirect crates.io tendril/web_atoms to the local path crates so that
+# markup5ever (which cannot use workspace.dependencies when loaded as a
+# Cargo [patch] from outside this workspace) gets the same crate instance
+# as rcdom and xml5ever.
+[patch.crates-io]
+tendril = { path = "tendril" }
+web_atoms = { path = "web_atoms" }
diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
index 3584e456..6db09321 100644
--- a/html5ever/Cargo.toml
+++ b/html5ever/Cargo.toml
@@ -5,25 +5,33 @@ documentation = "https://docs.rs/html5ever"
 categories = [ "parser-implementations", "web-programming" ]
 keywords = ["html", "html5", "parser", "parsing"]
 readme = "../README.md"
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
 
 [features]
 trace_tokenizer = []
 serde = ["markup5ever/serde"]
+# Surfaces byte-accurate source positions; see markup5ever for full description.
+source-positions = ["markup5ever/source-positions"]
+# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on
+# RCDATA and RAWTEXT elements. Without this, html5ever follows the HTML5
+# spec and treats `<title/>` as opening a RCDATA region that swallows the
+# rest of the document. EPUB content is XHTML and relies on self-closing
+# being honoured.
+xhtml-self-closing = []
 
 [dependencies]
-markup5ever = { workspace = true }
-memchr = { workspace = true }
-log = { workspace = true }
+markup5ever = { version = "0.39", path = "../markup5ever" }
+memchr = "2.8.0"
+log = "0.4"
 
 [dev-dependencies]
-criterion = { workspace = true }
-typed-arena = { workspace = true }
+criterion = "0.8"
+typed-arena = "2.0.2"
 
 [[bench]]
 name = "html5ever"
diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs
index e119477d..c8da81c8 100644
--- a/html5ever/src/tokenizer/char_ref/mod.rs
+++ b/html5ever/src/tokenizer/char_ref/mod.rs
@@ -212,7 +212,11 @@ impl CharRefTokenizer {
             unconsume.push_char(c)
         }
 
+        #[cfg(feature = "source-positions")]
+        let unconsume_len = unconsume.len() as u64;
         input.push_front(unconsume);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(unconsume_len);
         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
         Status::Done(CharRef::EMPTY)
     }
@@ -292,7 +296,12 @@ impl CharRefTokenizer {
     }
 
     fn unconsume_name(&mut self, input: &BufferQueue) {
-        input.push_front(self.name_buf_opt.take().unwrap());
+        let name_buf = self.name_buf_opt.take().unwrap();
+        #[cfg(feature = "source-positions")]
+        let name_buf_len = name_buf.len() as u64;
+        input.push_front(name_buf);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(name_buf_len);
     }
 
     fn finish_named<Sink: TokenSink>(
@@ -367,7 +376,12 @@ impl CharRefTokenizer {
                     self.unconsume_name(input);
                     Status::Done(CharRef::EMPTY)
                 } else {
-                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+                    let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]);
+                    #[cfg(feature = "source-positions")]
+                    let unconsumed_len = unconsumed.len() as u64;
+                    input.push_front(unconsumed);
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(unconsumed_len);
                     tokenizer.ignore_lf.set(false);
                     Status::Done(CharRef {
                         chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
@@ -419,6 +433,8 @@ impl CharRefTokenizer {
                 },
                 State::Octothorpe => {
                     input.push_front(StrTendril::from_slice("#"));
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(1);
                     tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                     Status::Done(CharRef::EMPTY)
                 },
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index b1436a71..c9ee28c9 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -130,6 +130,14 @@ pub trait TokenSink {
     /// Signal that tokenization reached the end of the document.
     fn end(&self) {}
 
+    /// Called just before each token is dispatched to [`process_token`],
+    /// with that token's byte offset: the opening `<` for tag, comment and
+    /// doctype tokens, the end of the previous token for character tokens,
+    /// and the number of UTF-8 bytes consumed so far for everything else.
+    /// Only called with the `source-positions` feature; defaults to a no-op.
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, _byte_offset: u64) {}
+
     /// Used in the [markup declaration open state]. By default, this always
     /// returns false and thus all CDATA sections are tokenized as bogus
     /// comments.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index ba9a095c..971ac0ba 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -181,6 +181,33 @@ pub struct Tokenizer<Sink> {
 
     /// Track current line
     current_line: Cell<u64>,
+
+    /// Number of UTF-8 bytes consumed from the input so far.
+    ///
+    /// Kept in sync with `BufferQueue::bytes_consumed` after every character
+    /// is consumed. Only present when the `source-positions` feature is
+    /// enabled.
+    #[cfg(feature = "source-positions")]
+    current_byte: Cell<u64>,
+
+    /// Byte offset of the first character of the current token.
+    ///
+    /// For tag, comment, and doctype tokens this is the byte of the most
+    /// recently consumed `<`; it is recorded without regard to tokenizer
+    /// state. NOTE(review): a `<` consumed mid-token would also overwrite
+    /// it — confirm. Character tokens instead start where the previous
+    /// token ended, which is tracked via `last_token_end_byte`.
+    /// Only present when the `source-positions` feature is enabled.
+    #[cfg(feature = "source-positions")]
+    token_start_byte: Cell<u64>,
+
+    /// Byte offset one past the end of the most recently emitted token.
+    ///
+    /// Updated at the end of every `process_token` call, whatever the token
+    /// kind, and used as the start byte for the next character token. Only
+    /// present when the `source-positions` feature is enabled.
+    #[cfg(feature = "source-positions")]
+    last_token_end_byte: Cell<u64>,
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -216,6 +243,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             state_profile: RefCell::new(BTreeMap::new()),
             time_in_sink: Cell::new(0),
             current_line: Cell::new(1),
+            #[cfg(feature = "source-positions")]
+            current_byte: Cell::new(0),
+            #[cfg(feature = "source-positions")]
+            token_start_byte: Cell::new(0),
+            #[cfg(feature = "source-positions")]
+            last_token_end_byte: Cell::new(0),
         }
     }
 
@@ -243,13 +276,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 
     fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
-        if self.opts.profile {
+        #[cfg(feature = "source-positions")]
+        {
+            let byte = match &token {
+                Token::TagToken(_) | Token::CommentToken(_) | Token::DoctypeToken(_) => {
+                    self.token_start_byte.get()
+                },
+                Token::CharacterTokens(_) => self.last_token_end_byte.get(),
+                _ => self.current_byte.get(),
+            };
+            self.sink.set_current_byte(byte);
+        }
+        let result = if self.opts.profile {
             let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
             self.time_in_sink.set(self.time_in_sink.get() + dt);
             ret
         } else {
             self.sink.process_token(token, self.current_line.get())
-        }
+        };
+        #[cfg(feature = "source-positions")]
+        self.last_token_end_byte.set(self.current_byte.get());
+        result
     }
 
     fn process_token_and_continue(&self, token: Token) {
@@ -292,6 +339,17 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
         trace!("got character {c}");
         self.current_char.set(c);
+        #[cfg(feature = "source-positions")]
+        {
+            let pos = input.bytes_consumed();
+            if pos > 0 {
+                self.current_byte.set(pos);
+                if c == '<' {
+                    self.token_start_byte
+                        .set(pos.saturating_sub(c.len_utf8() as u64));
+                }
+            }
+        }
         Some(c)
     }
 
@@ -325,7 +383,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             // NB: We don't set self.current_char for a run of characters not
             // in the set.  It shouldn't matter for the codepaths that use
             // this.
-            _ => d,
+            other => {
+                #[cfg(feature = "source-positions")]
+                if other.is_some() {
+                    self.current_byte.set(input.bytes_consumed());
+                }
+                other
+            },
         }
     }
 
@@ -621,7 +685,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         if self.reconsume.get() {
             self.reconsume.set(false);
         } else {
+            #[cfg(not(feature = "source-positions"))]
             input.next();
+            #[cfg(feature = "source-positions")]
+            {
+                let c = input.next();
+                if let Some(c) = c {
+                    let pos = input.bytes_consumed();
+                    self.current_byte.set(pos);
+                    if c == '<' {
+                        self.token_start_byte
+                            .set(pos.saturating_sub(c.len_utf8() as u64));
+                    }
+                }
+            }
         }
     }
 
@@ -757,6 +834,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
                         let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
 
+                        #[cfg(feature = "source-positions")]
+                        if let Some(ref r) = result {
+                            let n = match r {
+                                SetResult::NotFromSet(ref t) => t.len() as u64,
+                                SetResult::FromSet(c) => c.len_utf8() as u64,
+                            };
+                            input.advance_bytes_consumed(n);
+                            self.current_byte.set(input.bytes_consumed());
+                            if let SetResult::FromSet('<') = r {
+                                self.token_start_byte
+                                    .set(input.bytes_consumed() - '<'.len_utf8() as u64);
+                            }
+                        }
+
                         if front_buffer.is_empty() {
                             drop(front_buffer);
                             input.pop_front();
@@ -1752,6 +1843,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
         let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
             char_ref::Status::Done(char_ref) => {
+                #[cfg(feature = "source-positions")]
+                self.current_byte.set(input.bytes_consumed());
                 self.process_char_ref(char_ref);
                 *char_ref_tokenizer = None;
                 return ProcessResult::Continue;
@@ -2379,3 +2472,265 @@ mod test {
         assert_eq!(results, expected);
     }
 }
+
+#[cfg(all(test, feature = "source-positions"))]
+mod test_source_positions {
+    use crate::tendril::StrTendril;
+
+    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, TagToken};
+    use super::interface::{EndTag, StartTag, Tag, Token};
+    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+
+    use crate::LocalName;
+    use markup5ever::buffer_queue::BufferQueue;
+    use std::cell::RefCell;
+
+    /// Records (token, byte_offset) pairs via `set_current_byte`.
+    struct BytesMatch {
+        /// Byte offset delivered by the most recent `set_current_byte` call.
+        current_byte: std::cell::Cell<u64>,
+        /// Byte offset at the start of the current character run.
+        /// Captured on the first `CharacterTokens` chunk; cleared after flush.
+        text_start_byte: std::cell::Cell<Option<u64>>,
+        current_str: RefCell<StrTendril>,
+        entries: RefCell<Vec<(Token, u64)>>,
+    }
+
+    impl BytesMatch {
+        fn new() -> Self {
+            BytesMatch {
+                current_byte: std::cell::Cell::new(0),
+                text_start_byte: std::cell::Cell::new(None),
+                current_str: RefCell::new(StrTendril::new()),
+                entries: RefCell::new(vec![]),
+            }
+        }
+
+        /// Emit the accumulated character run using the byte of its first chunk.
+        fn flush_chars(&self) {
+            let s = self.current_str.take();
+            if !s.is_empty() {
+                let byte = self.text_start_byte.get().unwrap_or(0);
+                self.text_start_byte.set(None);
+                self.entries.borrow_mut().push((CharacterTokens(s), byte));
+            }
+        }
+    }
+
+    /// Records every token without coalescing adjacent character chunks.
+    struct RawBytesMatch {
+        current_byte: std::cell::Cell<u64>,
+        entries: RefCell<Vec<(Token, u64)>>,
+    }
+
+    impl RawBytesMatch {
+        fn new() -> Self {
+            RawBytesMatch {
+                current_byte: std::cell::Cell::new(0),
+                entries: RefCell::new(vec![]),
+            }
+        }
+    }
+
+    impl TokenSink for RawBytesMatch {
+        type Handle = ();
+
+        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+            if !matches!(token, EOFToken) {
+                self.entries
+                    .borrow_mut()
+                    .push((token, self.current_byte.get()));
+            }
+            TokenSinkResult::Continue
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) {
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    impl TokenSink for BytesMatch {
+        type Handle = ();
+
+        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+            let byte = self.current_byte.get();
+            match token {
+                CharacterTokens(b) => {
+                    if self.text_start_byte.get().is_none() {
+                        self.text_start_byte.set(Some(byte));
+                    }
+                    self.current_str.borrow_mut().push_slice(&b);
+                },
+                NullCharacterToken => {
+                    self.current_str.borrow_mut().push_char('\0');
+                },
+                EOFToken => {
+                    self.flush_chars();
+                },
+                TagToken(mut t) => {
+                    self.flush_chars();
+                    if let EndTag = t.kind {
+                        t.attrs = vec![];
+                    } else {
+                        t.attrs.sort_by(|a, b| a.name.cmp(&b.name));
+                    }
+                    self.entries.borrow_mut().push((TagToken(t), byte));
+                },
+                other => {
+                    self.flush_chars();
+                    self.entries.borrow_mut().push((other, byte));
+                },
+            }
+            TokenSinkResult::Continue
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) {
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    fn tokenize_bytes(input: &str) -> Vec<(Token, u64)> {
+        let sink = BytesMatch::new();
+        let tok = Tokenizer::new(
+            sink,
+            TokenizerOpts {
+                exact_errors: false,
+                discard_bom: true,
+                profile: false,
+                initial_state: None,
+                last_start_tag_name: None,
+            },
+        );
+        let buf = BufferQueue::default();
+        buf.push_back(StrTendril::from(input));
+        let _ = tok.feed(&buf);
+        tok.end();
+        tok.sink.entries.take()
+    }
+
+    fn tokenize_raw_bytes(input: &str) -> Vec<(Token, u64)> {
+        let sink = RawBytesMatch::new();
+        let tok = Tokenizer::new(
+            sink,
+            TokenizerOpts {
+                exact_errors: false,
+                discard_bom: true,
+                profile: false,
+                initial_state: None,
+                last_start_tag_name: None,
+            },
+        );
+        let buf = BufferQueue::default();
+        buf.push_back(StrTendril::from(input));
+        let _ = tok.feed(&buf);
+        tok.end();
+        tok.sink.entries.take()
+    }
+
+    fn start(name: &str) -> Token {
+        TagToken(Tag {
+            kind: StartTag,
+            name: LocalName::from(name),
+            self_closing: false,
+            attrs: vec![],
+            had_duplicate_attributes: false,
+        })
+    }
+
+    fn end(name: &str) -> Token {
+        TagToken(Tag {
+            kind: EndTag,
+            name: LocalName::from(name),
+            self_closing: false,
+            attrs: vec![],
+            had_duplicate_attributes: false,
+        })
+    }
+
+    fn chars(s: &str) -> Token {
+        CharacterTokens(StrTendril::from(s))
+    }
+
+    #[test]
+    fn check_byte_offsets_simple_tags() {
+        // <a>   = bytes 0-2  → offset 0
+        // <b>   = bytes 3-5  → offset 3
+        // </b>  = bytes 6-9  → offset 6
+        // </a>  = bytes 10-13 → offset 10
+        let entries = tokenize_bytes("<a><b></b></a>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("a"), 0),
+                (start("b"), 3),
+                (end("b"), 6),
+                (end("a"), 10),
+            ]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_text_content() {
+        // <p>     = bytes 0-2   → offset 0
+        // "hello" = bytes 3-7   → offset 3 (right after '>')
+        // </p>    = bytes 8-11  → offset 8
+        let entries = tokenize_bytes("<p>hello</p>");
+        assert_eq!(
+            entries,
+            vec![(start("p"), 0), (chars("hello"), 3), (end("p"), 8),]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_multibyte_text() {
+        // <p>   = bytes 0-2  → offset 0
+        // "é"   = bytes 3-4  (é = 2 UTF-8 bytes) → offset 3
+        // </p>  = bytes 5-8  → offset 5
+        let entries = tokenize_bytes("<p>é</p>");
+        assert_eq!(
+            entries,
+            vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_sequential_siblings() {
+        // <h1>  = bytes 0-3   → offset 0
+        // "X"   = bytes 4     → offset 4
+        // </h1> = bytes 5-9   → offset 5
+        // <p>   = bytes 10-12 → offset 10
+        // "Y"   = bytes 13    → offset 13
+        // </p>  = bytes 14-17 → offset 14
+        let entries = tokenize_bytes("<h1>X</h1><p>Y</p>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("h1"), 0),
+                (chars("X"), 4),
+                (end("h1"), 5),
+                (start("p"), 10),
+                (chars("Y"), 13),
+                (end("p"), 14),
+            ]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_entity_text_chunks() {
+        // <p>     = bytes 0-2 → offset 0
+        // "a"     = byte 3
+        // "&amp;" = bytes 4-8, decoded to "&"
+        // "b"     = byte 9
+        let entries = tokenize_raw_bytes("<p>a&amp;b</p>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("p"), 0),
+                (chars("a"), 3),
+                (chars("&"), 4),
+                (chars("b"), 9),
+                (end("p"), 10),
+            ]
+        );
+    }
+}
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index 3fcfaec3..0be385ed 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -474,6 +474,11 @@ where
 {
     type Handle = Handle;
 
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, byte_offset: u64) {
+        self.sink.set_current_byte(byte_offset);
+    }
+
     fn process_token(&self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult<Handle> {
         if line_number != self.current_line.get() {
             self.sink.set_current_line(line_number);
@@ -673,8 +678,22 @@ where
         ProcessResult::ToRawData(k)
     }
 
-    // The generic raw text / RCDATA parsing algorithm.
+    /// The generic raw text / RCDATA parsing algorithm.
+    /// Insert a RCDATA/RAWTEXT element and switch the tokenizer to raw-text mode.
+    ///
+    /// XHTML allows self-closing syntax (`<title/>`, `<style/>`, …) on these
+    /// elements. The HTML5 spec ignores the `/` and enters the raw-text state,
+    /// which swallows the remainder of the document until a matching end tag.
+    /// When the `xhtml-self-closing` feature is enabled, the self-closing flag
+    /// is honoured instead: an empty element is inserted and popped, and the
+    /// tokenizer is not switched to raw-text mode.
     fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> {
+        #[cfg(feature = "xhtml-self-closing")]
+        if tag.self_closing {
+            self.insert_and_pop_element_for(tag);
+            return ProcessResult::DoneAckSelfClosing;
+        }
+
         self.insert_element_for(tag);
         self.to_raw_text_mode(k)
     }
diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml
index 764f9a0a..79024927 100644
--- a/markup5ever/Cargo.toml
+++ b/markup5ever/Cargo.toml
@@ -3,20 +3,25 @@ name = "markup5ever"
 description = "Common code for xml5ever and html5ever"
 documentation = "https://docs.rs/markup5ever"
 categories = [ "parser-implementations", "web-programming" ]
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
 
 [lib]
 path = "lib.rs"
 
 [features]
 serde = ["web_atoms/serde"]
+# Surfaces byte-accurate source positions through the `TreeSink` interface.
+# When enabled, `TreeSink::set_current_byte` is called before each tree
+# mutation with the UTF-8 byte offset of the current token in the input.
+# Use this to assign stable, parser-independent offsets to DOM nodes.
+source-positions = []
 
 [dependencies]
-web_atoms = { workspace = true }
-tendril = { workspace = true }
-log = { workspace = true }
+web_atoms = "0.2.4"
+tendril = "0.5"
+log = "0.4"
diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs
index e1683de0..fbc70b16 100644
--- a/markup5ever/interface/tree_builder.rs
+++ b/markup5ever/interface/tree_builder.rs
@@ -269,6 +269,18 @@ pub trait TreeSink {
     /// Called whenever the line number changes.
     fn set_current_line(&self, _line_number: u64) {}
 
+    /// Called before each token is processed, with its source byte offset.
+    ///
+    /// Only called when the `source-positions` feature is enabled on the
+    /// `html5ever` crate. For tag, comment and doctype tokens the offset is
+    /// the byte position of the opening `<`; for character tokens it is the
+    /// byte position at which the previous token ended.
+    ///
+    /// Implement this method to obtain byte-accurate source positions for
+    /// nodes. The default implementation is a no-op.
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, _byte_offset: u64) {}
+
     fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool {
         true
     }
diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs
index d5e6864f..ca9af75c 100644
--- a/markup5ever/util/buffer_queue.rs
+++ b/markup5ever/util/buffer_queue.rs
@@ -18,6 +18,8 @@
 //!
 //! [`BufferQueue`]: struct.BufferQueue.html
 
+#[cfg(feature = "source-positions")]
+use std::cell::Cell;
 use std::{
     cell::{RefCell, RefMut},
     collections::VecDeque,
@@ -51,6 +53,13 @@ pub enum SetResult {
 pub struct BufferQueue {
     /// Buffers to process.
     buffers: RefCell<VecDeque<StrTendril>>,
+    /// Total number of UTF-8 bytes consumed from this queue so far.
+    ///
+    /// Only present when the `source-positions` feature is enabled. Used by
+    /// the tokenizer to surface byte-accurate source offsets via
+    /// [`TokenSink::set_current_byte`] and [`TreeSink::set_current_byte`].
+    #[cfg(feature = "source-positions")]
+    bytes_consumed: Cell<u64>,
 }
 
 impl Default for BufferQueue {
@@ -59,6 +68,8 @@ impl Default for BufferQueue {
     fn default() -> Self {
         Self {
             buffers: RefCell::new(VecDeque::with_capacity(16)),
+            #[cfg(feature = "source-positions")]
+            bytes_consumed: Cell::new(0),
         }
     }
 }
@@ -70,6 +81,43 @@ impl BufferQueue {
         self.buffers.borrow().is_empty()
     }
 
+    /// Returns the total number of UTF-8 bytes consumed from this queue.
+    ///
+    /// Only available when the `source-positions` feature is enabled. The
+    /// value increases as characters are consumed via [`next`],
+    /// [`pop_except_from`], and [`eat`]. [`push_front`] itself does **not**
+    /// decrement the counter; callers that re-queue bytes which were already
+    /// counted must follow up with [`retreat_bytes_consumed`] to keep the
+    /// offset accurate.
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn bytes_consumed(&self) -> u64 {
+        self.bytes_consumed.get()
+    }
+
+    /// Advance the bytes-consumed counter by `n`.
+    ///
+    /// Only available when the `source-positions` feature is enabled.
+    /// Used by SIMD fast paths that consume bytes directly from a tendril
+    /// without going through [`next`] or [`pop_except_from`].
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn advance_bytes_consumed(&self, n: u64) {
+        self.bytes_consumed.set(self.bytes_consumed.get() + n);
+    }
+
+    /// Retreat the bytes-consumed counter by `n`.
+    ///
+    /// Only available when the `source-positions` feature is enabled. Used by
+    /// tokenizer lookahead paths that consume raw bytes, then push unmatched
+    /// suffix bytes back onto the queue.
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn retreat_bytes_consumed(&self, n: u64) {
+        self.bytes_consumed
+            .set(self.bytes_consumed.get().saturating_sub(n));
+    }
+
     /// Get the buffer at the beginning of the queue.
     #[inline]
     pub fn pop_front(&self) -> Option<StrTendril> {
@@ -146,9 +194,15 @@ impl BufferQueue {
                         out = buf.unsafe_subtendril(0, n);
                         buf.unsafe_pop_front(n);
                     }
+                    #[cfg(feature = "source-positions")]
+                    self.bytes_consumed
+                        .set(self.bytes_consumed.get() + out.len() as u64);
                     (Some(NotFromSet(out)), buf.is_empty())
                 } else {
                     let c = buf.pop_front_char().expect("empty buffer in queue");
+                    #[cfg(feature = "source-positions")]
+                    self.bytes_consumed
+                        .set(self.bytes_consumed.get() + c.len_utf8() as u64);
                     (Some(FromSet(c)), buf.is_empty())
                 }
             },
@@ -218,6 +272,10 @@ impl BufferQueue {
             Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
         }
 
+        #[cfg(feature = "source-positions")]
+        self.bytes_consumed
+            .set(self.bytes_consumed.get() + pat.len() as u64);
+
         Some(true)
     }
 
@@ -229,6 +287,9 @@ impl BufferQueue {
             None => (None, false),
             Some(buf) => {
                 let c = buf.pop_front_char().expect("empty buffer in queue");
+                #[cfg(feature = "source-positions")]
+                self.bytes_consumed
+                    .set(self.bytes_consumed.get() + c.len_utf8() as u64);
                 (Some(c), buf.is_empty())
             },
         };
@@ -331,3 +392,120 @@ mod test {
         assert_eq!(bq.next(), None);
     }
 }
+
+#[cfg(all(test, feature = "source-positions"))]
+mod test_source_positions {
+    use tendril::SliceExt;
+
+    use super::BufferQueue;
+    use super::SetResult::{FromSet, NotFromSet};
+
+    #[test]
+    fn next_advances_counter_by_utf8_width() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        // ASCII: 1 byte each
+        bq.push_back("abc".to_tendril());
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 1);
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 2);
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        // Multibyte: 'é' is 2 bytes (U+00E9, encoded as 0xC3 0xA9)
+        bq.push_back("é".to_tendril());
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 5);
+    }
+
+    #[test]
+    fn pop_except_from_bulk_advances_counter() {
+        let bq = BufferQueue::default();
+        // "abc" are not in the set; '&' is
+        bq.push_back("abc&def".to_tendril());
+        let set = small_char_set!('&');
+
+        // Bulk NotFromSet: 3 bytes consumed
+        assert_eq!(
+            bq.pop_except_from(set),
+            Some(NotFromSet("abc".to_tendril()))
+        );
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        // Single FromSet '&': 1 byte consumed
+        assert_eq!(bq.pop_except_from(set), Some(FromSet('&')));
+        assert_eq!(bq.bytes_consumed(), 4);
+
+        // Bulk NotFromSet: 3 more bytes
+        assert_eq!(
+            bq.pop_except_from(set),
+            Some(NotFromSet("def".to_tendril()))
+        );
+        assert_eq!(bq.bytes_consumed(), 7);
+    }
+
+    #[test]
+    fn pop_except_from_multibyte_bulk_advances_by_byte_len() {
+        // "café" is 5 bytes (c=1, a=1, f=1, é=2). '&' terminates the bulk.
+        // Confirms NotFromSet advances by the byte length of the tendril slice,
+        // not by the character count.
+        let bq = BufferQueue::default();
+        bq.push_back("café&".to_tendril());
+        let set = small_char_set!('&');
+
+        let result = bq.pop_except_from(set);
+        assert!(matches!(result, Some(NotFromSet(_))));
+        // 'c'=1 + 'a'=1 + 'f'=1 + 'é'=2 = 5 bytes
+        assert_eq!(bq.bytes_consumed(), 5);
+    }
+
+    #[test]
+    fn eat_advances_counter_on_match_not_on_no_match() {
+        let bq = BufferQueue::default();
+        bq.push_back("abcdef".to_tendril());
+
+        // No match: counter unchanged
+        assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        // Match "abc": counter advances by 3
+        assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true));
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        // Match "def": counter advances by 3 more
+        assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true));
+        assert_eq!(bq.bytes_consumed(), 6);
+    }
+
+    #[test]
+    fn push_front_does_not_decrement_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("abc".to_tendril());
+        bq.next(); // consume 'a' → 1
+        bq.next(); // consume 'b' → 2
+        assert_eq!(bq.bytes_consumed(), 2);
+
+        // Re-queue something — counter must not decrease
+        bq.push_front("xy".to_tendril());
+        assert_eq!(bq.bytes_consumed(), 2);
+
+        // Consuming the re-queued bytes advances further
+        bq.next(); // 'x' → 3
+        bq.next(); // 'y' → 4
+        assert_eq!(bq.bytes_consumed(), 4);
+    }
+
+    #[test]
+    fn advance_bytes_consumed_adds_exactly() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.advance_bytes_consumed(7);
+        assert_eq!(bq.bytes_consumed(), 7);
+
+        bq.advance_bytes_consumed(3);
+        assert_eq!(bq.bytes_consumed(), 10);
+    }
+}
diff --git a/rcdom/Cargo.toml b/rcdom/Cargo.toml
index caf52b54..30dab873 100644
--- a/rcdom/Cargo.toml
+++ b/rcdom/Cargo.toml
@@ -20,6 +20,10 @@ markup5ever = { workspace = true, features = ["serde"] }
 tendril = { workspace = true }
 xml5ever = { workspace = true }
 
+[features]
+source-positions = ["html5ever/source-positions"]
+xhtml-self-closing = ["html5ever/xhtml-self-closing"]
+
 [dev-dependencies]
 criterion = { workspace = true }
 env_logger = { workspace = true }
@@ -45,3 +49,11 @@ harness = false
 [[test]]
 name = "xml-tokenizer"
 harness = false
+
+[[test]]
+name = "source-positions-integration"
+required-features = ["source-positions"]
+
+[[test]]
+name = "xhtml-self-closing-integration"
+required-features = ["xhtml-self-closing"]
diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs
new file mode 100644
index 00000000..052c63e1
--- /dev/null
+++ b/rcdom/tests/source-positions-integration.rs
@@ -0,0 +1,214 @@
+// Copyright 2014-2026 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Integration tests for the `source-positions` feature.
+//!
+//! Verifies that byte offsets flow correctly from `BufferQueue` through the
+//! tokenizer and tree builder all the way into `TreeSink::set_current_byte`,
+//! and that the offsets correspond to the actual positions of element opening
+//! tags in the source string.
+
+#[cfg(feature = "source-positions")]
+mod source_positions {
+    use html5ever::driver;
+    use html5ever::tendril::stream::TendrilSink;
+    use html5ever::tendril::StrTendril;
+    use html5ever::ExpandedName;
+    use html5ever::QualName;
+    use markup5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
+    use markup5ever::Attribute;
+    use markup5ever_rcdom::{Handle, RcDom};
+    use std::borrow::Cow;
+    use std::cell::{Cell, RefCell};
+
+    /// Wraps `RcDom` and records `(local_name, byte_offset)` for every
+    /// element created while `set_current_byte` is active.
+    struct ByteCapturingDOM {
+        current_byte: Cell<u64>,
+        elements: RefCell<Vec<(String, u64)>>,
+        rcdom: RcDom,
+    }
+
+    impl ByteCapturingDOM {
+        fn new() -> Self {
+            ByteCapturingDOM {
+                current_byte: Cell::new(0),
+                elements: RefCell::new(vec![]),
+                rcdom: RcDom::default(),
+            }
+        }
+
+        /// Returns recorded `(local_name, byte_offset)` pairs, skipping the
+        /// implicit wrapper elements html5ever inserts (`html`, `head`, `body`).
+        fn content_elements(&self) -> Vec<(String, u64)> {
+            self.elements
+                .borrow()
+                .iter()
+                .filter(|(name, _)| !matches!(name.as_str(), "html" | "head" | "body"))
+                .cloned()
+                .collect()
+        }
+    }
+
+    impl TreeSink for ByteCapturingDOM {
+        type Output = Self;
+        type ElemName<'a> = ExpandedName<'a>;
+
+        fn finish(self) -> Self {
+            self
+        }
+
+        type Handle = Handle;
+
+        fn parse_error(&self, msg: Cow<'static, str>) {
+            self.rcdom.parse_error(msg);
+        }
+
+        fn get_document(&self) -> Handle {
+            self.rcdom.get_document()
+        }
+
+        fn get_template_contents(&self, target: &Handle) -> Handle {
+            self.rcdom.get_template_contents(target)
+        }
+
+        fn set_quirks_mode(&self, mode: QuirksMode) {
+            self.rcdom.set_quirks_mode(mode)
+        }
+
+        fn same_node(&self, x: &Handle, y: &Handle) -> bool {
+            self.rcdom.same_node(x, y)
+        }
+
+        fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> {
+            self.rcdom.elem_name(target)
+        }
+
+        fn create_element(
+            &self,
+            name: QualName,
+            attrs: Vec<Attribute>,
+            flags: ElementFlags,
+        ) -> Handle {
+            self.elements
+                .borrow_mut()
+                .push((name.local.to_string(), self.current_byte.get()));
+            self.rcdom.create_element(name, attrs, flags)
+        }
+
+        fn create_comment(&self, text: StrTendril) -> Handle {
+            self.rcdom.create_comment(text)
+        }
+
+        fn create_pi(&self, target: StrTendril, content: StrTendril) -> Handle {
+            self.rcdom.create_pi(target, content)
+        }
+
+        fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append(parent, child)
+        }
+
+        fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append_before_sibling(sibling, child)
+        }
+
+        fn append_based_on_parent_node(
+            &self,
+            element: &Handle,
+            prev_element: &Handle,
+            child: NodeOrText<Handle>,
+        ) {
+            self.rcdom
+                .append_based_on_parent_node(element, prev_element, child)
+        }
+
+        fn append_doctype_to_document(
+            &self,
+            name: StrTendril,
+            public_id: StrTendril,
+            system_id: StrTendril,
+        ) {
+            self.rcdom
+                .append_doctype_to_document(name, public_id, system_id);
+        }
+
+        fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
+            self.rcdom.add_attrs_if_missing(target, attrs);
+        }
+
+        fn remove_from_parent(&self, target: &Handle) {
+            self.rcdom.remove_from_parent(target);
+        }
+
+        fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
+            self.rcdom.reparent_children(node, new_parent);
+        }
+
+        fn mark_script_already_started(&self, target: &Handle) {
+            self.rcdom.mark_script_already_started(target);
+        }
+
+        fn set_current_line(&self, line_number: u64) {
+            self.rcdom.set_current_line(line_number);
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) {
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    fn parse(input: &str) -> ByteCapturingDOM {
+        let sink = ByteCapturingDOM::new();
+        driver::parse_document(sink, Default::default()).one(StrTendril::from(input))
+    }
+
+    #[test]
+    fn element_byte_offsets_match_source_positions() {
+        // <p>   starts at byte 0
+        // <div> starts at byte 14  ("<p>hello</p>" = 12 chars + 2 for "</p>")
+        //   <p>hello</p> = 12 bytes, </p> = 4 bytes → <div> at 16? Let's be precise:
+        // "<p>hello</p><div>world</div>"
+        //  0123456789012345678901234567
+        //  <p> = 0, </p> = 8, <div> = 12
+        let result = parse("<p>hello</p><div>world</div>");
+        let elems = result.content_elements();
+
+        assert_eq!(elems.len(), 2, "expected p and div, got: {:?}", elems);
+        assert_eq!(elems[0], ("p".to_string(), 0));
+        assert_eq!(elems[1], ("div".to_string(), 12));
+    }
+
+    #[test]
+    fn nested_element_byte_offset() {
+        // "<div><span>x</span></div>"
+        //  01234567890123456789...
+        // <div> = 0, <span> = 5
+        let result = parse("<div><span>x</span></div>");
+        let elems = result.content_elements();
+
+        assert_eq!(elems.len(), 2, "expected div and span, got: {:?}", elems);
+        assert_eq!(elems[0], ("div".to_string(), 0));
+        assert_eq!(elems[1], ("span".to_string(), 5));
+    }
+
+    #[test]
+    fn multibyte_content_does_not_shift_subsequent_offsets() {
+        // "<p>café</p><span>next</span>"
+        // 'é' = 2 bytes, so:
+        // <p>    = byte 0
+        // </p>   = byte 3+5 = byte 8  ("café" = c(1)+a(1)+f(1)+é(2) = 5 bytes)
+        // <span> = byte 8 + 4 = byte 12 ("</p>" = 4 bytes)
+        let result = parse("<p>café</p><span>next</span>");
+        let elems = result.content_elements();
+
+        assert_eq!(elems.len(), 2, "expected p and span, got: {:?}", elems);
+        assert_eq!(elems[0], ("p".to_string(), 0));
+        assert_eq!(elems[1], ("span".to_string(), 12));
+    }
+}
diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs
new file mode 100644
index 00000000..c49aaf69
--- /dev/null
+++ b/rcdom/tests/xhtml-self-closing-integration.rs
@@ -0,0 +1,123 @@
+// Copyright 2014-2026 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Integration tests for the `xhtml-self-closing` feature.
+//!
+//! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT
+//! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`,
+//! html5ever treats these as opening tags and enters raw-text mode, consuming
+//! the rest of the document. These tests verify the feature makes parsing
+//! behave as XHTML authors expect.
+
+#[cfg(feature = "xhtml-self-closing")]
+mod xhtml_self_closing {
+    use html5ever::driver;
+    use html5ever::tendril::stream::TendrilSink;
+    use html5ever::tendril::StrTendril;
+    use markup5ever_rcdom::{NodeData, RcDom};
+
+    fn parse(input: &str) -> RcDom {
+        driver::parse_document(RcDom::default(), Default::default()).one(StrTendril::from(input))
+    }
+
+    /// Walk the tree and collect all element names.
+    fn element_names(node: &markup5ever_rcdom::Handle) -> Vec<String> {
+        let mut names = Vec::new();
+        collect_names(node, &mut names);
+        names
+    }
+
+    fn collect_names(node: &markup5ever_rcdom::Handle, out: &mut Vec<String>) {
+        if let NodeData::Element { ref name, .. } = node.data {
+            out.push(name.local.to_string());
+        }
+        for child in node.children.borrow().iter() {
+            collect_names(child, out);
+        }
+    }
+
+    /// Return the text content of the first element with the given local name.
+    fn text_of(dom: &RcDom, tag: &str) -> Option<String> {
+        find_text(&dom.document, tag)
+    }
+
+    fn find_text(node: &markup5ever_rcdom::Handle, tag: &str) -> Option<String> {
+        if let NodeData::Element { ref name, .. } = node.data {
+            if name.local.as_ref() == tag {
+                let mut text = String::new();
+                for child in node.children.borrow().iter() {
+                    if let NodeData::Text { ref contents } = child.data {
+                        text.push_str(&contents.borrow());
+                    }
+                }
+                return Some(text);
+            }
+        }
+        for child in node.children.borrow().iter() {
+            if let Some(t) = find_text(child, tag) {
+                return Some(t);
+            }
+        }
+        None
+    }
+
+    #[test]
+    fn self_closing_title_does_not_swallow_body() {
+        // Without the feature <title/> opens a RCDATA region that swallows
+        // everything up to the next </title>. With it, <title/> is empty and
+        // the body parses normally.
+        let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>");
+        let names = element_names(&dom.document);
+
+        assert!(
+            names.contains(&"body".to_string()),
+            "body element should be present; got: {:?}",
+            names
+        );
+        assert!(
+            names.contains(&"p".to_string()),
+            "p element inside body should be present; got: {:?}",
+            names
+        );
+
+        let text = text_of(&dom, "p");
+        assert_eq!(
+            text.as_deref(),
+            Some("visible"),
+            "<p> text should be 'visible', got: {:?}",
+            text
+        );
+    }
+
+    #[test]
+    fn self_closing_style_does_not_swallow_body() {
+        let dom = parse("<html><head><style/></head><body><p>content</p></body></html>");
+        let names = element_names(&dom.document);
+
+        assert!(
+            names.contains(&"p".to_string()),
+            "p element should not be swallowed by <style/>; got: {:?}",
+            names
+        );
+    }
+
+    #[test]
+    fn normal_closed_title_still_captures_rcdata_text() {
+        // A properly-closed <title>…</title> must still capture its RCDATA
+        // content — the feature must not break normal title parsing.
+        let dom = parse("<html><head><title>My Book</title></head><body></body></html>");
+        let text = text_of(&dom, "title");
+        assert_eq!(
+            text.as_deref(),
+            Some("My Book"),
+            "title text should be 'My Book', got: {:?}",
+            text
+        );
+    }
+}

From 56499d6166445f0a8f25a4f43e539d54bb2db6f5 Mon Sep 17 00:00:00 2001
From: Kevin Hellemun <17928966+OGKevin@users.noreply.github.com>
Date: Thu, 18 Jun 2026 15:33:43 +0200
Subject: [PATCH 2/2] chore: some brooming

Change-Id: 1cd53437f8710e57e9a9bb17f38bc2fc
Change-Id-Short: ynmuwvwskrsy
---
 Cargo.toml                                    |   4 +-
 html5ever/Cargo.toml                          |   6 +-
 html5ever/src/tokenizer/interface.rs          |   3 +-
 html5ever/src/tokenizer/mod.rs                |  46 +-----
 html5ever/src/tree_builder/mod.rs             |   9 +-
 markup5ever/interface/tree_builder.rs         |  14 +-
 markup5ever/util/buffer_queue.rs              | 102 +++++++-----
 rcdom/tests/source-positions-integration.rs   | 147 ++++++++++--------
 rcdom/tests/xhtml-self-closing-integration.rs |  17 +-
 9 files changed, 164 insertions(+), 184 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index f2a87026..141c9124 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,9 +44,7 @@ serde_json = "1.0"
 typed-arena = "2.0.2"
 
 # Redirect crates.io tendril/web_atoms to the local path crates so that
-# markup5ever (which cannot use workspace.dependencies when loaded as a
-# Cargo [patch] from outside this workspace) gets the same crate instance
-# as rcdom and xml5ever.
+# markup5ever gets the same crate instance as rcdom and xml5ever.
 [patch.crates-io]
 tendril = { path = "tendril" }
 web_atoms = { path = "web_atoms" }
diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
index 6db09321..23b752b0 100644
--- a/html5ever/Cargo.toml
+++ b/html5ever/Cargo.toml
@@ -18,10 +18,8 @@ serde = ["markup5ever/serde"]
 # Surfaces byte-accurate source positions; see markup5ever for full description.
 source-positions = ["markup5ever/source-positions"]
 # Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on
-# RCDATA and RAWTEXT elements. Without this, html5ever follows the HTML5
-# spec and treats `<title/>` as opening a RCDATA region that swallows the
-# rest of the document. EPUB content is XHTML and relies on self-closing
-# being honoured.
+# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>`
+# as opening a RCDATA region that swallows the rest of the document.
 xhtml-self-closing = []
 
 [dependencies]
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index c9ee28c9..97437809 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -133,8 +133,7 @@ pub trait TokenSink {
     /// Called just before each token is dispatched to [`process_token`],
     /// with the number of UTF-8 bytes consumed from the input so far.
     ///
-    /// Only called when the `source-positions` feature is enabled. The
-    /// default implementation is a no-op.
+    /// The default implementation is a no-op.
     #[cfg(feature = "source-positions")]
     fn set_current_byte(&self, _byte_offset: u64) {}
 
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index 971ac0ba..8b0d473a 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -185,27 +185,24 @@ pub struct Tokenizer<Sink> {
     /// Number of UTF-8 bytes consumed from the input so far.
     ///
     /// Kept in sync with `BufferQueue::bytes_consumed` after every character
-    /// is consumed. Only present when the `source-positions` feature is
-    /// enabled.
+    /// is consumed.
     #[cfg(feature = "source-positions")]
     current_byte: Cell<u64>,
 
     /// Byte offset of the first character of the current token.
     ///
     /// For tag, comment, and doctype tokens this is the byte of the `<` that
-    /// opened them — captured whenever `<` is consumed in
-    /// `get_preprocessed_char`. For character tokens it is the byte right
-    /// after the end of the previous token, which equals the first byte of
-    /// the text content — tracked via `last_token_end_byte`.
-    /// Only present when the `source-positions` feature is enabled.
+    /// opened them, captured whenever `<` is consumed in `get_preprocessed_char`.
+    ///
+    /// For character tokens it is the byte right after the end of the previous token,
+    /// which equals the first byte of the text content; it is tracked via `last_token_end_byte`.
     #[cfg(feature = "source-positions")]
     token_start_byte: Cell<u64>,
 
     /// Byte offset one past the end of the most recently emitted token.
     ///
     /// Updated at the end of each `process_token` call. Used as the start
-    /// byte for the next character token. Only present when the
-    /// `source-positions` feature is enabled.
+    /// byte for the next character token.
     #[cfg(feature = "source-positions")]
     last_token_end_byte: Cell<u64>,
 }
@@ -2487,10 +2484,7 @@ mod test_source_positions {
 
     /// Records (token, byte_offset) pairs via `set_current_byte`.
     struct BytesMatch {
-        /// Byte offset delivered by the most recent `set_current_byte` call.
         current_byte: std::cell::Cell<u64>,
-        /// Byte offset at the start of the current character run.
-        /// Captured on the first `CharacterTokens` chunk; cleared after flush.
         text_start_byte: std::cell::Cell<Option<u64>>,
         current_str: RefCell<StrTendril>,
         entries: RefCell<Vec<(Token, u64)>>,
@@ -2567,13 +2561,8 @@ mod test_source_positions {
                 EOFToken => {
                     self.flush_chars();
                 },
-                TagToken(mut t) => {
+                TagToken(t) => {
                     self.flush_chars();
-                    if let EndTag = t.kind {
-                        t.attrs = vec![];
-                    } else {
-                        t.attrs.sort_by(|a, b| a.name.cmp(&b.name));
-                    }
                     self.entries.borrow_mut().push((TagToken(t), byte));
                 },
                 other => {
@@ -2653,10 +2642,6 @@ mod test_source_positions {
 
     #[test]
     fn check_byte_offsets_simple_tags() {
-        // <a>   = bytes 0-2  → offset 0
-        // <b>   = bytes 3-5  → offset 3
-        // </b>  = bytes 6-9  → offset 6
-        // </a>  = bytes 10-13 → offset 10
         let entries = tokenize_bytes("<a><b></b></a>");
         assert_eq!(
             entries,
@@ -2671,9 +2656,6 @@ mod test_source_positions {
 
     #[test]
     fn check_byte_offsets_text_content() {
-        // <p>     = bytes 0-2   → offset 0
-        // "hello" = bytes 3-7   → offset 3 (right after '>')
-        // </p>    = bytes 8-11  → offset 8
         let entries = tokenize_bytes("<p>hello</p>");
         assert_eq!(
             entries,
@@ -2683,24 +2665,14 @@ mod test_source_positions {
 
     #[test]
     fn check_byte_offsets_multibyte_text() {
-        // <p>   = bytes 0-2  → offset 0
-        // "é"   = bytes 3-4  (é = 2 UTF-8 bytes) → offset 3
-        // </p>  = bytes 5-8  → offset 5
         let entries = tokenize_bytes("<p>é</p>");
         assert_eq!(
             entries,
             vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),]
         );
     }
-
     #[test]
     fn check_byte_offsets_sequential_siblings() {
-        // <h1>  = bytes 0-3   → offset 0
-        // "X"   = bytes 4     → offset 4
-        // </h1> = bytes 5-9   → offset 5
-        // <p>   = bytes 10-12 → offset 10
-        // "Y"   = bytes 13    → offset 13
-        // </p>  = bytes 14-17 → offset 14
         let entries = tokenize_bytes("<h1>X</h1><p>Y</p>");
         assert_eq!(
             entries,
@@ -2717,10 +2689,6 @@ mod test_source_positions {
 
     #[test]
     fn check_byte_offsets_entity_text_chunks() {
-        // <p>     = bytes 0-2 → offset 0
-        // "a"     = byte 3
-        // "&amp;" = bytes 4-8, decoded to "&"
-        // "b"     = byte 9
         let entries = tokenize_raw_bytes("<p>a&amp;b</p>");
         assert_eq!(
             entries,
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index 0be385ed..bf28847a 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -681,12 +681,9 @@ where
     /// The generic raw text / RCDATA parsing algorithm.
     /// Insert a RCDATA/RAWTEXT element and switch the tokenizer to raw-text mode.
     ///
-    /// XHTML allows self-closing syntax (`<title/>`, `<style/>`, …) on these
-    /// elements. The HTML5 spec ignores the `/` and enters the raw-text state,
-    /// which swallows the remainder of the document until a matching end tag.
-    /// When the `xhtml-self-closing` feature is enabled, the self-closing flag
-    /// is honoured instead: an empty element is inserted and the tokenizer stays
-    /// in the current insertion mode.
+    /// When the `xhtml-self-closing` feature is enabled, self-closing tags such as
+    /// `<title/>` and `<style/>` are inserted as empty elements instead of opening
+    /// a raw-text region that swallows all the content that comes after them.
     fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> {
         #[cfg(feature = "xhtml-self-closing")]
         if tag.self_closing {
diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs
index fbc70b16..44803457 100644
--- a/markup5ever/interface/tree_builder.rs
+++ b/markup5ever/interface/tree_builder.rs
@@ -269,15 +269,15 @@ pub trait TreeSink {
     /// Called whenever the line number changes.
     fn set_current_line(&self, _line_number: u64) {}
 
-    /// Called whenever the source byte offset changes.
+    /// Called before a tree-builder callback with the source byte offset for the
+    /// token or text segment that triggered it.
     ///
-    /// Only called when the `source-positions` feature is enabled on the
-    /// `html5ever` crate. The offset is the number of UTF-8 bytes consumed
-    /// from the input up to and including the last character of the token
-    /// that just triggered the current tree-builder callback.
+    /// For start tags, end tags, comments, and doctypes this is the UTF-8 byte
+    /// offset of the token's first byte in the original input. For character
+    /// tokens this is the UTF-8 byte offset of the first byte in the current text
+    /// segment.
     ///
-    /// Implement this method to obtain byte-accurate source positions for
-    /// nodes. The default implementation is a no-op.
+    /// The default implementation is a no-op.
     #[cfg(feature = "source-positions")]
     fn set_current_byte(&self, _byte_offset: u64) {}
 
diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs
index ca9af75c..4099ff60 100644
--- a/markup5ever/util/buffer_queue.rs
+++ b/markup5ever/util/buffer_queue.rs
@@ -55,8 +55,7 @@ pub struct BufferQueue {
     buffers: RefCell<VecDeque<StrTendril>>,
     /// Total number of UTF-8 bytes consumed from this queue so far.
     ///
-    /// Only present when the `source-positions` feature is enabled. Used by
-    /// the tokenizer to surface byte-accurate source offsets via
+    /// Used by the tokenizer to surface byte-accurate source offsets via
     /// [`TokenSink::set_current_byte`] and [`TreeSink::set_current_byte`].
     #[cfg(feature = "source-positions")]
     bytes_consumed: Cell<u64>,
@@ -83,12 +82,11 @@ impl BufferQueue {
 
     /// Returns the total number of UTF-8 bytes consumed from this queue.
     ///
-    /// Only available when the `source-positions` feature is enabled. The
-    /// value monotonically increases as characters are consumed via
+    /// The value increases as characters are consumed via
     /// [`next`], [`pop_except_from`], and [`eat`]. Re-queuing bytes via
-    /// [`push_front`] does **not** decrement the counter — the tokenizer
-    /// uses its own `reconsume` flag for single-character look-back and
-    /// never actually re-pushes bytes that were already counted.
+    /// [`push_front`] does **not** decrement the counter.
+    ///
+    /// To reduce bytes_consumed, use [`retreat_bytes_consumed`].
     #[cfg(feature = "source-positions")]
     #[inline]
     pub fn bytes_consumed(&self) -> u64 {
@@ -97,9 +95,7 @@ impl BufferQueue {
 
     /// Advance the bytes-consumed counter by `n`.
     ///
-    /// Only available when the `source-positions` feature is enabled.
-    /// Used by SIMD fast paths that consume bytes directly from a tendril
-    /// without going through [`next`] or [`pop_except_from`].
+    /// Use this to advance the counter manually when bypassing [`next`], [`pop_except_from`], and [`eat`].
     #[cfg(feature = "source-positions")]
     #[inline]
     pub fn advance_bytes_consumed(&self, n: u64) {
@@ -108,8 +104,7 @@ impl BufferQueue {
 
     /// Retreat the bytes-consumed counter by `n`.
     ///
-    /// Only available when the `source-positions` feature is enabled. Used by
-    /// tokenizer lookahead paths that consume raw bytes, then push unmatched
+    /// Used by tokenizer lookahead paths that consume raw bytes, then push unmatched
     /// suffix bytes back onto the queue.
     #[cfg(feature = "source-positions")]
     #[inline]
@@ -401,11 +396,10 @@ mod test_source_positions {
     use super::SetResult::{FromSet, NotFromSet};
 
     #[test]
-    fn next_advances_counter_by_utf8_width() {
+    fn next_advances_counter_by_utf8_width_single() {
         let bq = BufferQueue::default();
         assert_eq!(bq.bytes_consumed(), 0);
 
-        // ASCII: 1 byte each
         bq.push_back("abc".to_tendril());
         bq.next();
         assert_eq!(bq.bytes_consumed(), 1);
@@ -413,87 +407,99 @@ mod test_source_positions {
         assert_eq!(bq.bytes_consumed(), 2);
         bq.next();
         assert_eq!(bq.bytes_consumed(), 3);
+    }
+
+    #[test]
+    fn next_advances_counter_by_utf8_width_double() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
 
-        // Multibyte: 'é' is 2 bytes (U+00E9, encoded as 0xC3 0xA9)
         bq.push_back("é".to_tendril());
         bq.next();
-        assert_eq!(bq.bytes_consumed(), 5);
+        assert_eq!(bq.bytes_consumed(), 2);
     }
 
     #[test]
-    fn pop_except_from_bulk_advances_counter() {
+    fn pop_except_from_not_from_set_advances_counter() {
         let bq = BufferQueue::default();
-        // "abc" are not in the set; '&' is
-        bq.push_back("abc&def".to_tendril());
+        bq.push_back("abc&".to_tendril());
         let set = small_char_set!('&');
 
-        // Bulk NotFromSet: 3 bytes consumed
         assert_eq!(
             bq.pop_except_from(set),
             Some(NotFromSet("abc".to_tendril()))
         );
         assert_eq!(bq.bytes_consumed(), 3);
+    }
+
+    #[test]
+    fn pop_except_from_from_set_advances_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("&def".to_tendril());
+        let set = small_char_set!('&');
 
-        // Single FromSet '&': 1 byte consumed
         assert_eq!(bq.pop_except_from(set), Some(FromSet('&')));
+        assert_eq!(bq.bytes_consumed(), 1);
+    }
+
+    #[test]
+    fn pop_except_from_successive_calls_accumulate_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("abc&def".to_tendril());
+        let set = small_char_set!('&');
+
+        bq.pop_except_from(set);
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        bq.pop_except_from(set);
         assert_eq!(bq.bytes_consumed(), 4);
 
-        // Bulk NotFromSet: 3 more bytes
-        assert_eq!(
-            bq.pop_except_from(set),
-            Some(NotFromSet("def".to_tendril()))
-        );
+        bq.pop_except_from(set);
         assert_eq!(bq.bytes_consumed(), 7);
     }
 
     #[test]
     fn pop_except_from_multibyte_bulk_advances_by_byte_len() {
-        // "café" is 5 bytes (c=1, a=1, f=1, é=2). '&' terminates the bulk.
-        // Confirms NotFromSet advances by the byte length of the tendril slice,
-        // not by the character count.
         let bq = BufferQueue::default();
         bq.push_back("café&".to_tendril());
         let set = small_char_set!('&');
 
         let result = bq.pop_except_from(set);
         assert!(matches!(result, Some(NotFromSet(_))));
-        // 'c'=1 + 'a'=1 + 'f'=1 + 'é'=2 = 5 bytes
         assert_eq!(bq.bytes_consumed(), 5);
     }
 
     #[test]
-    fn eat_advances_counter_on_match_not_on_no_match() {
+    fn eat_advances_counter_accordingly() {
         let bq = BufferQueue::default();
         bq.push_back("abcdef".to_tendril());
 
-        // No match: counter unchanged
         assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
         assert_eq!(bq.bytes_consumed(), 0);
 
-        // Match "abc": counter advances by 3
         assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true));
         assert_eq!(bq.bytes_consumed(), 3);
 
-        // Match "def": counter advances by 3 more
         assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true));
         assert_eq!(bq.bytes_consumed(), 6);
     }
 
     #[test]
+    /// Pins the contract that `push_front` never changes `bytes_consumed`.
+    /// Some use cases re-queue bytes without wanting the counter to retreat, so
+    /// whether to retreat after a `push_front` is left to the caller.
     fn push_front_does_not_decrement_counter() {
         let bq = BufferQueue::default();
         bq.push_back("abc".to_tendril());
-        bq.next(); // consume 'a' → 1
-        bq.next(); // consume 'b' → 2
+        bq.next();
+        bq.next();
         assert_eq!(bq.bytes_consumed(), 2);
 
-        // Re-queue something — counter must not decrease
         bq.push_front("xy".to_tendril());
         assert_eq!(bq.bytes_consumed(), 2);
 
-        // Consuming the re-queued bytes advances further
-        bq.next(); // 'x' → 3
-        bq.next(); // 'y' → 4
+        bq.next();
+        bq.next();
         assert_eq!(bq.bytes_consumed(), 4);
     }
 
@@ -508,4 +514,20 @@ mod test_source_positions {
         bq.advance_bytes_consumed(3);
         assert_eq!(bq.bytes_consumed(), 10);
     }
+
+    #[test]
+    fn retreat_bytes_consumed_subtracts_exactly() {
+        let bq = BufferQueue::default();
+        bq.advance_bytes_consumed(10);
+        assert_eq!(bq.bytes_consumed(), 10);
+
+        bq.retreat_bytes_consumed(3);
+        assert_eq!(bq.bytes_consumed(), 7);
+
+        bq.retreat_bytes_consumed(7);
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.retreat_bytes_consumed(5);
+        assert_eq!(bq.bytes_consumed(), 0);
+    }
 }
diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs
index 052c63e1..34ebe25c 100644
--- a/rcdom/tests/source-positions-integration.rs
+++ b/rcdom/tests/source-positions-integration.rs
@@ -1,18 +1,16 @@
-// Copyright 2014-2026 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
 //! Integration tests for the `source-positions` feature.
 //!
 //! Verifies that byte offsets flow correctly from `BufferQueue` through the
 //! tokenizer and tree builder all the way into `TreeSink::set_current_byte`,
 //! and that the offsets correspond to the actual positions of element opening
 //! tags in the source string.
+//!
+//! Two critical behaviours are under test:
+//!
+//! 1. When the payload has no explicit `<html>`, `<head>`, or `<body>` tags, they
+//!    are injected implicitly and must not skew the byte offsets.
+//! 2. When those tags are explicitly part of the payload, their bytes must be
+//!    included in the count.
 
 #[cfg(feature = "source-positions")]
 mod source_positions {
@@ -28,7 +26,9 @@ mod source_positions {
     use std::cell::{Cell, RefCell};
 
     /// Wraps `RcDom` and records `(local_name, byte_offset)` for every
-    /// element created while `set_current_byte` is active.
+    /// element created.
+    ///
+    /// The recorded pairs are later used for assertions.
     struct ByteCapturingDOM {
         current_byte: Cell<u64>,
         elements: RefCell<Vec<(String, u64)>>,
@@ -44,28 +44,21 @@ mod source_positions {
             }
         }
 
-        /// Returns recorded `(local_name, byte_offset)` pairs, skipping the
-        /// implicit wrapper elements html5ever inserts (`html`, `head`, `body`).
         fn content_elements(&self) -> Vec<(String, u64)> {
-            self.elements
-                .borrow()
-                .iter()
-                .filter(|(name, _)| !matches!(name.as_str(), "html" | "head" | "body"))
-                .cloned()
-                .collect()
+            self.elements.borrow().clone()
         }
     }
 
     impl TreeSink for ByteCapturingDOM {
+        type Handle = Handle;
         type Output = Self;
+
         type ElemName<'a> = ExpandedName<'a>;
 
         fn finish(self) -> Self {
             self
         }
 
-        type Handle = Handle;
-
         fn parse_error(&self, msg: Cow<'static, str>) {
             self.rcdom.parse_error(msg);
         }
@@ -74,18 +67,6 @@ mod source_positions {
             self.rcdom.get_document()
         }
 
-        fn get_template_contents(&self, target: &Handle) -> Handle {
-            self.rcdom.get_template_contents(target)
-        }
-
-        fn set_quirks_mode(&self, mode: QuirksMode) {
-            self.rcdom.set_quirks_mode(mode)
-        }
-
-        fn same_node(&self, x: &Handle, y: &Handle) -> bool {
-            self.rcdom.same_node(x, y)
-        }
-
         fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> {
             self.rcdom.elem_name(target)
         }
@@ -114,10 +95,6 @@ mod source_positions {
             self.rcdom.append(parent, child)
         }
 
-        fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
-            self.rcdom.append_before_sibling(sibling, child)
-        }
-
         fn append_based_on_parent_node(
             &self,
             element: &Handle,
@@ -138,6 +115,22 @@ mod source_positions {
                 .append_doctype_to_document(name, public_id, system_id);
         }
 
+        fn get_template_contents(&self, target: &Handle) -> Handle {
+            self.rcdom.get_template_contents(target)
+        }
+
+        fn same_node(&self, x: &Handle, y: &Handle) -> bool {
+            self.rcdom.same_node(x, y)
+        }
+
+        fn set_quirks_mode(&self, mode: QuirksMode) {
+            self.rcdom.set_quirks_mode(mode)
+        }
+
+        fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append_before_sibling(sibling, child)
+        }
+
         fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
             self.rcdom.add_attrs_if_missing(target, attrs);
         }
@@ -150,14 +143,6 @@ mod source_positions {
             self.rcdom.reparent_children(node, new_parent);
         }
 
-        fn mark_script_already_started(&self, target: &Handle) {
-            self.rcdom.mark_script_already_started(target);
-        }
-
-        fn set_current_line(&self, line_number: u64) {
-            self.rcdom.set_current_line(line_number);
-        }
-
         fn set_current_byte(&self, byte_offset: u64) {
             self.current_byte.set(byte_offset);
         }
@@ -170,45 +155,73 @@ mod source_positions {
 
     #[test]
     fn element_byte_offsets_match_source_positions() {
-        // <p>   starts at byte 0
-        // <div> starts at byte 14  ("<p>hello</p>" = 12 chars + 2 for "</p>")
-        //   <p>hello</p> = 12 bytes, </p> = 4 bytes → <div> at 16? Let's be precise:
-        // "<p>hello</p><div>world</div>"
-        //  0123456789012345678901234567
-        //  <p> = 0, </p> = 8, <div> = 12
         let result = parse("<p>hello</p><div>world</div>");
         let elems = result.content_elements();
 
-        assert_eq!(elems.len(), 2, "expected p and div, got: {:?}", elems);
-        assert_eq!(elems[0], ("p".to_string(), 0));
-        assert_eq!(elems[1], ("div".to_string(), 12));
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, p and div, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0));
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("p".to_string(), 0));
+        assert_eq!(elems[4], ("div".to_string(), 12));
     }
 
     #[test]
     fn nested_element_byte_offset() {
-        // "<div><span>x</span></div>"
-        //  01234567890123456789...
-        // <div> = 0, <span> = 5
         let result = parse("<div><span>x</span></div>");
         let elems = result.content_elements();
 
-        assert_eq!(elems.len(), 2, "expected div and span, got: {:?}", elems);
-        assert_eq!(elems[0], ("div".to_string(), 0));
-        assert_eq!(elems[1], ("span".to_string(), 5));
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, div and span, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0));
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("div".to_string(), 0));
+        assert_eq!(elems[4], ("span".to_string(), 5));
+    }
+
+    #[test]
+    fn explicit_html_head_body_offsets() {
+        let result = parse("<html><head></head><body><p>hi</p></body></html>");
+        let elems = result.content_elements();
+
+        assert_eq!(
+            elems.len(),
+            4,
+            "expected html, head, body, p, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0));
+        assert_eq!(elems[1], ("head".to_string(), 6));
+        assert_eq!(elems[2], ("body".to_string(), 19));
+        assert_eq!(elems[3], ("p".to_string(), 25));
     }
 
     #[test]
+    /// `<span>` must start at byte offset 12, not character offset 11, because `é` occupies 2 bytes.
     fn multibyte_content_does_not_shift_subsequent_offsets() {
-        // "<p>café</p><span>next</span>"
-        // 'é' = 2 bytes, so:
-        // <p>    = byte 0
-        // </p>   = byte 3+5 = byte 8  ("café" = c(1)+a(1)+f(1)+é(2) = 5 bytes)
-        // <span> = byte 8 + 4 = byte 12 ("</p>" = 4 bytes)
         let result = parse("<p>café</p><span>next</span>");
         let elems = result.content_elements();
 
-        assert_eq!(elems.len(), 2, "expected p and span, got: {:?}", elems);
-        assert_eq!(elems[0], ("p".to_string(), 0));
-        assert_eq!(elems[1], ("span".to_string(), 12));
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, p and span, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0));
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("p".to_string(), 0));
+        assert_eq!(elems[4], ("span".to_string(), 12));
     }
 }
diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs
index c49aaf69..ae3a5d4a 100644
--- a/rcdom/tests/xhtml-self-closing-integration.rs
+++ b/rcdom/tests/xhtml-self-closing-integration.rs
@@ -1,19 +1,9 @@
-// Copyright 2014-2026 The html5ever Project Developers. See the
-// COPYRIGHT file at the top-level directory of this distribution.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
 //! Integration tests for the `xhtml-self-closing` feature.
 //!
 //! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT
 //! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`,
 //! html5ever treats these as opening tags and enters raw-text mode, consuming
-//! the rest of the document. These tests verify the feature makes parsing
-//! behave as XHTML authors expect.
+//! the rest of the document.
 
 #[cfg(feature = "xhtml-self-closing")]
 mod xhtml_self_closing {
@@ -69,9 +59,6 @@ mod xhtml_self_closing {
 
     #[test]
     fn self_closing_title_does_not_swallow_body() {
-        // Without the feature <title/> opens a RCDATA region that swallows
-        // everything up to the next </title>. With it, <title/> is empty and
-        // the body parses normally.
         let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>");
         let names = element_names(&dom.document);
 
@@ -109,8 +96,6 @@ mod xhtml_self_closing {
 
     #[test]
     fn normal_closed_title_still_captures_rcdata_text() {
-        // A properly-closed <title>…</title> must still capture its RCDATA
-        // content — the feature must not break normal title parsing.
         let dom = parse("<html><head><title>My Book</title></head><body></body></html>");
         let text = text_of(&dom, "title");
         assert_eq!(