diff --git a/Cargo.toml b/Cargo.toml
index 4714cdbc..141c9124 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,3 +42,9 @@ libtest-mimic = "0.8.1"
 rand = "0.9"
 serde_json = "1.0"
 typed-arena = "2.0.2"
+
+# Redirect crates.io tendril/web_atoms to the local path crates so that
+# markup5ever gets the same crate instance as rcdom and xml5ever.
+[patch.crates-io]
+tendril = { path = "tendril" }
+web_atoms = { path = "web_atoms" }
diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
index 3584e456..23b752b0 100644
--- a/html5ever/Cargo.toml
+++ b/html5ever/Cargo.toml
@@ -5,25 +5,31 @@ documentation = "https://docs.rs/html5ever"
 categories = [ "parser-implementations", "web-programming" ]
 keywords = ["html", "html5", "parser", "parsing"]
 readme = "../README.md"
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
 
 [features]
 trace_tokenizer = []
 serde = ["markup5ever/serde"]
+# Surfaces byte-accurate source positions; see markup5ever for full description.
+source-positions = ["markup5ever/source-positions"]
+# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on
+# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>`
+# as opening an RCDATA region that swallows the rest of the document.
+xhtml-self-closing = []
 
 [dependencies]
-markup5ever = { workspace = true }
-memchr = { workspace = true }
-log = { workspace = true }
+markup5ever = { version = "0.39", path = "../markup5ever" }
+memchr = "2.8.0"
+log = "0.4"
 
 [dev-dependencies]
-criterion = { workspace = true }
-typed-arena = { workspace = true }
+criterion = "0.8"
+typed-arena = "2.0.2"
 
 [[bench]]
 name = "html5ever"
diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs
index e119477d..c8da81c8 100644
--- a/html5ever/src/tokenizer/char_ref/mod.rs
+++ b/html5ever/src/tokenizer/char_ref/mod.rs
@@ -212,7 +212,11 @@ impl CharRefTokenizer {
             unconsume.push_char(c)
         }
 
+        #[cfg(feature = "source-positions")]
+        let unconsume_len = unconsume.len() as u64;
         input.push_front(unconsume);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(unconsume_len);
         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
         Status::Done(CharRef::EMPTY)
     }
@@ -292,7 +296,12 @@ impl CharRefTokenizer {
     }
 
     fn unconsume_name(&mut self, input: &BufferQueue) {
-        input.push_front(self.name_buf_opt.take().unwrap());
+        let name_buf = self.name_buf_opt.take().unwrap();
+        #[cfg(feature = "source-positions")]
+        let name_buf_len = name_buf.len() as u64;
+        input.push_front(name_buf);
+        #[cfg(feature = "source-positions")]
+        input.retreat_bytes_consumed(name_buf_len);
     }
 
     fn finish_named<Sink: TokenSink>(
@@ -367,7 +376,12 @@ impl CharRefTokenizer {
                     self.unconsume_name(input);
                     Status::Done(CharRef::EMPTY)
                 } else {
-                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+                    let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]);
+                    #[cfg(feature = "source-positions")]
+                    let unconsumed_len = unconsumed.len() as u64;
+                    input.push_front(unconsumed);
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(unconsumed_len);
                     tokenizer.ignore_lf.set(false);
                     Status::Done(CharRef {
                         chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
@@ -419,6 +433,8 @@ impl CharRefTokenizer {
                 },
                 State::Octothorpe => {
                     input.push_front(StrTendril::from_slice("#"));
+                    #[cfg(feature = "source-positions")]
+                    input.retreat_bytes_consumed(1);
                     tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                     Status::Done(CharRef::EMPTY)
                 },
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index b1436a71..97437809 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -130,6 +130,13 @@ pub trait TokenSink {
     /// Signal that tokenization reached the end of the document.
     fn end(&self) {}
 
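+    /// Called just before each token is dispatched to [`Self::process_token`]
+    /// with that token's UTF-8 byte offset: the opening `<` for tags, comments and
+    /// doctypes, the end of the previous token for character tokens, and the bytes
+    /// consumed so far for anything else. The default implementation is a no-op.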
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, _byte_offset: u64) {}
+
     /// Used in the [markup declaration open state]. By default, this always
     /// returns false and thus all CDATA sections are tokenized as bogus
     /// comments.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index ba9a095c..8b0d473a 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -181,6 +181,30 @@ pub struct Tokenizer<Sink> {
 
     /// Track current line
     current_line: Cell<u64>,
+
+    /// Number of UTF-8 bytes consumed from the input so far.
+    ///
+    /// Kept in sync with `BufferQueue::bytes_consumed` after every character
+    /// is consumed.
+    #[cfg(feature = "source-positions")]
+    current_byte: Cell<u64>,
+
+    /// Byte offset of the first character of the current token.
+    ///
+    /// For tag, comment, and doctype tokens this is the byte of the `<` that
+    /// opened them; it is captured every time a `<` is consumed from the input.
+    ///
+    /// Character tokens do not read this field: they start right after the end of
+    /// the previous token, which is tracked separately in `last_token_end_byte`.
+    #[cfg(feature = "source-positions")]
+    token_start_byte: Cell<u64>,
+
+    /// Byte offset one past the end of the most recently emitted token.
+    ///
+    /// Updated at the end of each `process_token` call. Used as the start
+    /// byte for the next character token.
+    #[cfg(feature = "source-positions")]
+    last_token_end_byte: Cell<u64>,
 }
 
 impl<Sink: TokenSink> Tokenizer<Sink> {
@@ -216,6 +240,12 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             state_profile: RefCell::new(BTreeMap::new()),
             time_in_sink: Cell::new(0),
             current_line: Cell::new(1),
+            #[cfg(feature = "source-positions")]
+            current_byte: Cell::new(0),
+            #[cfg(feature = "source-positions")]
+            token_start_byte: Cell::new(0),
+            #[cfg(feature = "source-positions")]
+            last_token_end_byte: Cell::new(0),
         }
     }
 
@@ -243,13 +273,27 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 
     fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
-        if self.opts.profile {
+        #[cfg(feature = "source-positions")]
+        {
+            let byte = match &token {
+                Token::TagToken(_) | Token::CommentToken(_) | Token::DoctypeToken(_) => {
+                    self.token_start_byte.get()
+                },
+                Token::CharacterTokens(_) => self.last_token_end_byte.get(),
+                _ => self.current_byte.get(),
+            };
+            self.sink.set_current_byte(byte);
+        }
+        let result = if self.opts.profile {
             let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
             self.time_in_sink.set(self.time_in_sink.get() + dt);
             ret
         } else {
             self.sink.process_token(token, self.current_line.get())
-        }
+        };
+        #[cfg(feature = "source-positions")]
+        self.last_token_end_byte.set(self.current_byte.get());
+        result
     }
 
     fn process_token_and_continue(&self, token: Token) {
@@ -292,6 +336,17 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
 
         trace!("got character {c}");
         self.current_char.set(c);
+        #[cfg(feature = "source-positions")]
+        {
+            let pos = input.bytes_consumed();
+            if pos > 0 {
+                self.current_byte.set(pos);
+                if c == '<' {
+                    self.token_start_byte
+                        .set(pos.saturating_sub(c.len_utf8() as u64));
+                }
+            }
+        }
         Some(c)
     }
 
@@ -325,7 +380,13 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             // NB: We don't set self.current_char for a run of characters not
             // in the set.  It shouldn't matter for the codepaths that use
             // this.
-            _ => d,
+            other => {
+                #[cfg(feature = "source-positions")]
+                if other.is_some() {
+                    self.current_byte.set(input.bytes_consumed());
+                }
+                other
+            },
         }
     }
 
@@ -621,7 +682,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         if self.reconsume.get() {
             self.reconsume.set(false);
         } else {
+            #[cfg(not(feature = "source-positions"))]
             input.next();
+            #[cfg(feature = "source-positions")]
+            {
+                let c = input.next();
+                if let Some(c) = c {
+                    let pos = input.bytes_consumed();
+                    self.current_byte.set(pos);
+                    if c == '<' {
+                        self.token_start_byte
+                            .set(pos.saturating_sub(c.len_utf8() as u64));
+                    }
+                }
+            }
         }
     }
 
@@ -757,6 +831,20 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                         // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above
                         let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) };
 
+                        #[cfg(feature = "source-positions")]
+                        if let Some(ref r) = result {
+                            let n = match r {
+                                SetResult::NotFromSet(ref t) => t.len() as u64,
+                                SetResult::FromSet(c) => c.len_utf8() as u64,
+                            };
+                            input.advance_bytes_consumed(n);
+                            self.current_byte.set(input.bytes_consumed());
+                            if let SetResult::FromSet('<') = r {
+                                self.token_start_byte
+                                    .set(input.bytes_consumed() - '<'.len_utf8() as u64);
+                            }
+                        }
+
                         if front_buffer.is_empty() {
                             drop(front_buffer);
                             input.pop_front();
@@ -1752,6 +1840,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
         let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
             char_ref::Status::Done(char_ref) => {
+                #[cfg(feature = "source-positions")]
+                self.current_byte.set(input.bytes_consumed());
                 self.process_char_ref(char_ref);
                 *char_ref_tokenizer = None;
                 return ProcessResult::Continue;
@@ -2379,3 +2469,236 @@ mod test {
         assert_eq!(results, expected);
     }
 }
+
+#[cfg(all(test, feature = "source-positions"))]
+mod test_source_positions {
+    use crate::tendril::StrTendril;
+
+    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, TagToken};
+    use super::interface::{EndTag, StartTag, Tag, Token};
+    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+
+    use crate::LocalName;
+    use markup5ever::buffer_queue::BufferQueue;
+    use std::cell::RefCell;
+
+    /// Records (token, byte_offset) pairs via `set_current_byte`.
+    struct BytesMatch {
+        current_byte: std::cell::Cell<u64>,
+        text_start_byte: std::cell::Cell<Option<u64>>,
+        current_str: RefCell<StrTendril>,
+        entries: RefCell<Vec<(Token, u64)>>,
+    }
+
+    impl BytesMatch {
+        fn new() -> Self {
+            BytesMatch {
+                current_byte: std::cell::Cell::new(0),
+                text_start_byte: std::cell::Cell::new(None),
+                current_str: RefCell::new(StrTendril::new()),
+                entries: RefCell::new(vec![]),
+            }
+        }
+
+        /// Emit the accumulated character run using the byte of its first chunk.
+        fn flush_chars(&self) {
+            let s = self.current_str.take();
+            if !s.is_empty() {
+                let byte = self.text_start_byte.get().unwrap_or(0);
+                self.text_start_byte.set(None);
+                self.entries.borrow_mut().push((CharacterTokens(s), byte));
+            }
+        }
+    }
+
+    /// Records every token without coalescing adjacent character chunks.
+    struct RawBytesMatch {
+        current_byte: std::cell::Cell<u64>,
+        entries: RefCell<Vec<(Token, u64)>>,
+    }
+
+    impl RawBytesMatch {
+        fn new() -> Self {
+            RawBytesMatch {
+                current_byte: std::cell::Cell::new(0),
+                entries: RefCell::new(vec![]),
+            }
+        }
+    }
+
+    impl TokenSink for RawBytesMatch {
+        type Handle = ();
+
+        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+            if !matches!(token, EOFToken) {
+                self.entries
+                    .borrow_mut()
+                    .push((token, self.current_byte.get()));
+            }
+            TokenSinkResult::Continue
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) {
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    impl TokenSink for BytesMatch {
+        type Handle = ();
+
+        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<Self::Handle> {
+            let byte = self.current_byte.get();
+            match token {
+                CharacterTokens(b) => {
+                    if self.text_start_byte.get().is_none() {
+                        self.text_start_byte.set(Some(byte));
+                    }
+                    self.current_str.borrow_mut().push_slice(&b);
+                },
+                NullCharacterToken => {
+                    self.current_str.borrow_mut().push_char('\0');
+                },
+                EOFToken => {
+                    self.flush_chars();
+                },
+                TagToken(t) => {
+                    self.flush_chars();
+                    self.entries.borrow_mut().push((TagToken(t), byte));
+                },
+                other => {
+                    self.flush_chars();
+                    self.entries.borrow_mut().push((other, byte));
+                },
+            }
+            TokenSinkResult::Continue
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) {
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    fn tokenize_bytes(input: &str) -> Vec<(Token, u64)> {
+        let sink = BytesMatch::new();
+        let tok = Tokenizer::new(
+            sink,
+            TokenizerOpts {
+                exact_errors: false,
+                discard_bom: true,
+                profile: false,
+                initial_state: None,
+                last_start_tag_name: None,
+            },
+        );
+        let buf = BufferQueue::default();
+        buf.push_back(StrTendril::from(input));
+        let _ = tok.feed(&buf);
+        tok.end();
+        tok.sink.entries.take()
+    }
+
+    fn tokenize_raw_bytes(input: &str) -> Vec<(Token, u64)> {
+        let sink = RawBytesMatch::new();
+        let tok = Tokenizer::new(
+            sink,
+            TokenizerOpts {
+                exact_errors: false,
+                discard_bom: true,
+                profile: false,
+                initial_state: None,
+                last_start_tag_name: None,
+            },
+        );
+        let buf = BufferQueue::default();
+        buf.push_back(StrTendril::from(input));
+        let _ = tok.feed(&buf);
+        tok.end();
+        tok.sink.entries.take()
+    }
+
+    fn start(name: &str) -> Token {
+        TagToken(Tag {
+            kind: StartTag,
+            name: LocalName::from(name),
+            self_closing: false,
+            attrs: vec![],
+            had_duplicate_attributes: false,
+        })
+    }
+
+    fn end(name: &str) -> Token {
+        TagToken(Tag {
+            kind: EndTag,
+            name: LocalName::from(name),
+            self_closing: false,
+            attrs: vec![],
+            had_duplicate_attributes: false,
+        })
+    }
+
+    fn chars(s: &str) -> Token {
+        CharacterTokens(StrTendril::from(s))
+    }
+
+    #[test]
+    fn check_byte_offsets_simple_tags() {
+        let entries = tokenize_bytes("<a><b></b></a>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("a"), 0),
+                (start("b"), 3),
+                (end("b"), 6),
+                (end("a"), 10),
+            ]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_text_content() {
+        let entries = tokenize_bytes("<p>hello</p>");
+        assert_eq!(
+            entries,
+            vec![(start("p"), 0), (chars("hello"), 3), (end("p"), 8),]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_multibyte_text() {
+        let entries = tokenize_bytes("<p>é</p>");
+        assert_eq!(
+            entries,
+            vec![(start("p"), 0), (chars("é"), 3), (end("p"), 5),]
+        );
+    }
+    #[test]
+    fn check_byte_offsets_sequential_siblings() {
+        let entries = tokenize_bytes("<h1>X</h1><p>Y</p>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("h1"), 0),
+                (chars("X"), 4),
+                (end("h1"), 5),
+                (start("p"), 10),
+                (chars("Y"), 13),
+                (end("p"), 14),
+            ]
+        );
+    }
+
+    #[test]
+    fn check_byte_offsets_entity_text_chunks() {
+        let entries = tokenize_raw_bytes("<p>a&amp;b</p>");
+        assert_eq!(
+            entries,
+            vec![
+                (start("p"), 0),
+                (chars("a"), 3),
+                (chars("&"), 4),
+                (chars("b"), 9),
+                (end("p"), 10),
+            ]
+        );
+    }
+}
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index 3fcfaec3..bf28847a 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -474,6 +474,11 @@ where
 {
     type Handle = Handle;
 
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, byte_offset: u64) {
+        self.sink.set_current_byte(byte_offset);
+    }
+
     fn process_token(&self, token: tokenizer::Token, line_number: u64) -> TokenSinkResult<Handle> {
         if line_number != self.current_line.get() {
             self.sink.set_current_line(line_number);
@@ -673,8 +678,19 @@ where
         ProcessResult::ToRawData(k)
     }
 
-    // The generic raw text / RCDATA parsing algorithm.
+    /// The generic raw text / RCDATA parsing algorithm:
+    /// inserts an RCDATA/RAWTEXT element and switches the tokenizer to raw-text mode.
+    ///
+    /// When the `xhtml-self-closing` feature is enabled, self-closing tags such as
+    /// `<title/>` or `<style/>` are treated as empty elements instead of opening a
+    /// raw-text region that swallows all the content that comes after them.
     fn parse_raw_data(&self, tag: Tag, k: RawKind) -> ProcessResult<Handle> {
+        #[cfg(feature = "xhtml-self-closing")]
+        if tag.self_closing {
+            self.insert_and_pop_element_for(tag);
+            return ProcessResult::DoneAckSelfClosing;
+        }
+
         self.insert_element_for(tag);
         self.to_raw_text_mode(k)
     }
diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml
index 764f9a0a..79024927 100644
--- a/markup5ever/Cargo.toml
+++ b/markup5ever/Cargo.toml
@@ -3,20 +3,25 @@ name = "markup5ever"
 description = "Common code for xml5ever and html5ever"
 documentation = "https://docs.rs/markup5ever"
 categories = [ "parser-implementations", "web-programming" ]
-version.workspace = true
-license.workspace = true
-authors.workspace = true
-repository.workspace = true
-edition.workspace = true
-rust-version.workspace = true
+version = "0.39.0"
+license = "MIT OR Apache-2.0"
+authors = [ "The html5ever Project Developers" ]
+repository = "https://github.com/servo/html5ever"
+edition = "2021"
+rust-version = "1.71.0"
 
 [lib]
 path = "lib.rs"
 
 [features]
 serde = ["web_atoms/serde"]
+# Surfaces byte-accurate source positions through the `TreeSink` interface.
+# When enabled, `TreeSink::set_current_byte` is called before each tree
+# mutation with the UTF-8 byte offset of the current token in the input.
+# Use this to assign stable, parser-independent offsets to DOM nodes.
+source-positions = []
 
 [dependencies]
-web_atoms = { workspace = true }
-tendril = { workspace = true }
-log = { workspace = true }
+web_atoms = "0.2.4"
+tendril = "0.5"
+log = "0.4"
diff --git a/markup5ever/interface/tree_builder.rs b/markup5ever/interface/tree_builder.rs
index e1683de0..44803457 100644
--- a/markup5ever/interface/tree_builder.rs
+++ b/markup5ever/interface/tree_builder.rs
@@ -269,6 +269,18 @@ pub trait TreeSink {
     /// Called whenever the line number changes.
     fn set_current_line(&self, _line_number: u64) {}
 
+    /// Called before a tree-builder callback with the source byte offset for the
+    /// token or text segment that triggered it.
+    ///
+    /// For start tags, end tags, comments, and doctypes this is the UTF-8 byte
+    /// offset of the token's first byte in the original input. For character
+    /// tokens this is the UTF-8 byte offset of the first byte in the current text
+    /// segment.
+    ///
+    /// The default implementation is a no-op.
+    #[cfg(feature = "source-positions")]
+    fn set_current_byte(&self, _byte_offset: u64) {}
+
     fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool {
         true
     }
diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs
index d5e6864f..4099ff60 100644
--- a/markup5ever/util/buffer_queue.rs
+++ b/markup5ever/util/buffer_queue.rs
@@ -18,6 +18,8 @@
 //!
 //! [`BufferQueue`]: struct.BufferQueue.html
 
+#[cfg(feature = "source-positions")]
+use std::cell::Cell;
 use std::{
     cell::{RefCell, RefMut},
     collections::VecDeque,
@@ -51,6 +53,12 @@ pub enum SetResult {
 pub struct BufferQueue {
     /// Buffers to process.
     buffers: RefCell<VecDeque<StrTendril>>,
+    /// Total number of UTF-8 bytes consumed from this queue so far.
+    ///
+    /// Used by the tokenizer to surface byte-accurate source offsets via
+    /// `TokenSink::set_current_byte` and `TreeSink::set_current_byte`.
+    #[cfg(feature = "source-positions")]
+    bytes_consumed: Cell<u64>,
 }
 
 impl Default for BufferQueue {
@@ -59,6 +67,8 @@ impl Default for BufferQueue {
     fn default() -> Self {
         Self {
             buffers: RefCell::new(VecDeque::with_capacity(16)),
+            #[cfg(feature = "source-positions")]
+            bytes_consumed: Cell::new(0),
         }
     }
 }
@@ -70,6 +80,39 @@ impl BufferQueue {
         self.buffers.borrow().is_empty()
     }
 
+    /// Returns the total number of UTF-8 bytes consumed from this queue.
+    ///
+    /// The value increases as characters are consumed via [`Self::next`],
+    /// [`Self::pop_except_from`], and [`Self::eat`]. Re-queuing bytes via
+    /// [`Self::push_front`] does **not** decrement the counter.
+    ///
+    /// To reduce it after a push-back, call [`Self::retreat_bytes_consumed`].
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn bytes_consumed(&self) -> u64 {
+        self.bytes_consumed.get()
+    }
+
+    /// Advance the bytes-consumed counter by `n`.
+    ///
+    /// Call this when input is consumed without going through [`Self::next`], [`Self::pop_except_from`], or [`Self::eat`].
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn advance_bytes_consumed(&self, n: u64) {
+        self.bytes_consumed.set(self.bytes_consumed.get() + n);
+    }
+
+    /// Retreat the bytes-consumed counter by `n`, saturating at zero.
+    ///
+    /// Used by tokenizer lookahead paths that consume raw bytes and then push the
+    /// unmatched suffix back onto the queue.
+    #[cfg(feature = "source-positions")]
+    #[inline]
+    pub fn retreat_bytes_consumed(&self, n: u64) {
+        self.bytes_consumed
+            .set(self.bytes_consumed.get().saturating_sub(n));
+    }
+
     /// Get the buffer at the beginning of the queue.
     #[inline]
     pub fn pop_front(&self) -> Option<StrTendril> {
@@ -146,9 +189,15 @@ impl BufferQueue {
                         out = buf.unsafe_subtendril(0, n);
                         buf.unsafe_pop_front(n);
                     }
+                    #[cfg(feature = "source-positions")]
+                    self.bytes_consumed
+                        .set(self.bytes_consumed.get() + out.len() as u64);
                     (Some(NotFromSet(out)), buf.is_empty())
                 } else {
                     let c = buf.pop_front_char().expect("empty buffer in queue");
+                    #[cfg(feature = "source-positions")]
+                    self.bytes_consumed
+                        .set(self.bytes_consumed.get() + c.len_utf8() as u64);
                     (Some(FromSet(c)), buf.is_empty())
                 }
             },
@@ -218,6 +267,10 @@ impl BufferQueue {
             Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
         }
 
+        #[cfg(feature = "source-positions")]
+        self.bytes_consumed
+            .set(self.bytes_consumed.get() + pat.len() as u64);
+
         Some(true)
     }
 
@@ -229,6 +282,9 @@ impl BufferQueue {
             None => (None, false),
             Some(buf) => {
                 let c = buf.pop_front_char().expect("empty buffer in queue");
+                #[cfg(feature = "source-positions")]
+                self.bytes_consumed
+                    .set(self.bytes_consumed.get() + c.len_utf8() as u64);
                 (Some(c), buf.is_empty())
             },
         };
@@ -331,3 +387,147 @@ mod test {
         assert_eq!(bq.next(), None);
     }
 }
+
+#[cfg(all(test, feature = "source-positions"))]
+mod test_source_positions {
+    use tendril::SliceExt;
+
+    use super::BufferQueue;
+    use super::SetResult::{FromSet, NotFromSet};
+
+    #[test]
+    fn next_advances_counter_by_utf8_width_single() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.push_back("abc".to_tendril());
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 1);
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 2);
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 3);
+    }
+
+    #[test]
+    fn next_advances_counter_by_utf8_width_double() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.push_back("é".to_tendril());
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 2);
+    }
+
+    #[test]
+    fn pop_except_from_not_from_set_advances_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("abc&".to_tendril());
+        let set = small_char_set!('&');
+
+        assert_eq!(
+            bq.pop_except_from(set),
+            Some(NotFromSet("abc".to_tendril()))
+        );
+        assert_eq!(bq.bytes_consumed(), 3);
+    }
+
+    #[test]
+    fn pop_except_from_from_set_advances_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("&def".to_tendril());
+        let set = small_char_set!('&');
+
+        assert_eq!(bq.pop_except_from(set), Some(FromSet('&')));
+        assert_eq!(bq.bytes_consumed(), 1);
+    }
+
+    #[test]
+    fn pop_except_from_successive_calls_accumulate_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("abc&def".to_tendril());
+        let set = small_char_set!('&');
+
+        bq.pop_except_from(set);
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        bq.pop_except_from(set);
+        assert_eq!(bq.bytes_consumed(), 4);
+
+        bq.pop_except_from(set);
+        assert_eq!(bq.bytes_consumed(), 7);
+    }
+
+    #[test]
+    fn pop_except_from_multibyte_bulk_advances_by_byte_len() {
+        let bq = BufferQueue::default();
+        bq.push_back("café&".to_tendril());
+        let set = small_char_set!('&');
+
+        let result = bq.pop_except_from(set);
+        assert!(matches!(result, Some(NotFromSet(_))));
+        assert_eq!(bq.bytes_consumed(), 5);
+    }
+
+    #[test]
+    fn eat_advances_counter_accordingly() {
+        let bq = BufferQueue::default();
+        bq.push_back("abcdef".to_tendril());
+
+        assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        assert_eq!(bq.eat("abc", u8::eq_ignore_ascii_case), Some(true));
+        assert_eq!(bq.bytes_consumed(), 3);
+
+        assert_eq!(bq.eat("def", u8::eq_ignore_ascii_case), Some(true));
+        assert_eq!(bq.bytes_consumed(), 6);
+    }
+
+    #[test]
+    /// Pins the contract of `push_front`: it never touches the byte counter.
+    /// Some callers push back input that must still count as consumed, so it is
+    /// the caller's job to call `retreat_bytes_consumed` when a rewind is wanted.
+    fn push_front_does_not_decrement_counter() {
+        let bq = BufferQueue::default();
+        bq.push_back("abc".to_tendril());
+        bq.next();
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 2);
+
+        bq.push_front("xy".to_tendril());
+        assert_eq!(bq.bytes_consumed(), 2);
+
+        bq.next();
+        bq.next();
+        assert_eq!(bq.bytes_consumed(), 4);
+    }
+
+    #[test]
+    fn advance_bytes_consumed_adds_exactly() {
+        let bq = BufferQueue::default();
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.advance_bytes_consumed(7);
+        assert_eq!(bq.bytes_consumed(), 7);
+
+        bq.advance_bytes_consumed(3);
+        assert_eq!(bq.bytes_consumed(), 10);
+    }
+
+    #[test]
+    fn retreat_bytes_consumed_subtracts_exactly() {
+        let bq = BufferQueue::default();
+        bq.advance_bytes_consumed(10);
+        assert_eq!(bq.bytes_consumed(), 10);
+
+        bq.retreat_bytes_consumed(3);
+        assert_eq!(bq.bytes_consumed(), 7);
+
+        bq.retreat_bytes_consumed(7);
+        assert_eq!(bq.bytes_consumed(), 0);
+
+        bq.retreat_bytes_consumed(5);
+        assert_eq!(bq.bytes_consumed(), 0);
+    }
+}
diff --git a/rcdom/Cargo.toml b/rcdom/Cargo.toml
index caf52b54..30dab873 100644
--- a/rcdom/Cargo.toml
+++ b/rcdom/Cargo.toml
@@ -20,6 +20,10 @@ markup5ever = { workspace = true, features = ["serde"] }
 tendril = { workspace = true }
 xml5ever = { workspace = true }
 
+[features]
+source-positions = ["html5ever/source-positions"]
+xhtml-self-closing = ["html5ever/xhtml-self-closing"]
+
 [dev-dependencies]
 criterion = { workspace = true }
 env_logger = { workspace = true }
@@ -45,3 +49,11 @@ harness = false
 [[test]]
 name = "xml-tokenizer"
 harness = false
+
+[[test]]
+name = "source-positions-integration"
+required-features = ["source-positions"]
+
+[[test]]
+name = "xhtml-self-closing-integration"
+required-features = ["xhtml-self-closing"]
diff --git a/rcdom/tests/source-positions-integration.rs b/rcdom/tests/source-positions-integration.rs
new file mode 100644
index 00000000..34ebe25c
--- /dev/null
+++ b/rcdom/tests/source-positions-integration.rs
@@ -0,0 +1,227 @@
+//! Integration tests for the `source-positions` feature.
+//!
+//! Verifies that byte offsets flow correctly from `BufferQueue` through the
+//! tokenizer and tree builder all the way into `TreeSink::set_current_byte`,
+//! and that the offsets correspond to the actual positions of element opening
+//! tags in the source string.
+//!
+//! Two critical behaviours are under test:
+//!
+//! 1. When no explicit `<html>`, `<head>` or `<body>` tags are part of the payload,
+//!    they are injected implicitly and must not skew the byte offset.
+//! 2. When the above tags are explicitly part of the payload, they must be part
+//!    of the count.
+
+#[cfg(feature = "source-positions")]
+mod source_positions {
+    use html5ever::driver;
+    use html5ever::tendril::stream::TendrilSink;
+    use html5ever::tendril::StrTendril;
+    use html5ever::ExpandedName;
+    use html5ever::QualName;
+    use markup5ever::interface::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
+    use markup5ever::Attribute;
+    use markup5ever_rcdom::{Handle, RcDom};
+    use std::borrow::Cow;
+    use std::cell::{Cell, RefCell};
+
+    /// Test sink that wraps `RcDom` and records `(local_name, byte_offset)` for
+    /// every element created.
+    ///
+    /// The recorded pairs are read back by the tests in this module for their assertions.
+    struct ByteCapturingDOM {
+        current_byte: Cell<u64>, // latest value passed to `set_current_byte`
+        elements: RefCell<Vec<(String, u64)>>, // one entry per `create_element` call, in call order
+        rcdom: RcDom, // inner sink that every tree operation is forwarded to
+    }
+
+    impl ByteCapturingDOM {
+        fn new() -> Self { // offset 0, nothing recorded yet, default `RcDom` inside
+            ByteCapturingDOM {
+                current_byte: Cell::new(0),
+                elements: RefCell::new(vec![]),
+                rcdom: RcDom::default(),
+            }
+        }
+
+        fn content_elements(&self) -> Vec<(String, u64)> { // cloned snapshot of every recorded element
+            self.elements.borrow().clone()
+        }
+    }
+
+    impl TreeSink for ByteCapturingDOM { // forwards to `rcdom`; see `create_element` / `set_current_byte`
+        type Handle = Handle;
+        type Output = Self; // the wrapper itself is the parse output, so tests can inspect it
+
+        type ElemName<'a> = ExpandedName<'a>;
+
+        fn finish(self) -> Self { // hands back the whole wrapper, including the recorded elements
+            self
+        }
+
+        fn parse_error(&self, msg: Cow<'static, str>) {
+            self.rcdom.parse_error(msg);
+        }
+
+        fn get_document(&self) -> Handle {
+            self.rcdom.get_document()
+        }
+
+        fn elem_name<'a>(&'a self, target: &'a Handle) -> ExpandedName<'a> {
+            self.rcdom.elem_name(target)
+        }
+
+        fn create_element(
+            &self,
+            name: QualName,
+            attrs: Vec<Attribute>,
+            flags: ElementFlags,
+        ) -> Handle {
+            self.elements // record (local name, last offset seen) before delegating creation
+                .borrow_mut()
+                .push((name.local.to_string(), self.current_byte.get()));
+            self.rcdom.create_element(name, attrs, flags)
+        }
+
+        fn create_comment(&self, text: StrTendril) -> Handle {
+            self.rcdom.create_comment(text)
+        }
+
+        fn create_pi(&self, target: StrTendril, content: StrTendril) -> Handle {
+            self.rcdom.create_pi(target, content)
+        }
+
+        fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append(parent, child)
+        }
+
+        fn append_based_on_parent_node(
+            &self,
+            element: &Handle,
+            prev_element: &Handle,
+            child: NodeOrText<Handle>,
+        ) {
+            self.rcdom
+                .append_based_on_parent_node(element, prev_element, child)
+        }
+
+        fn append_doctype_to_document(
+            &self,
+            name: StrTendril,
+            public_id: StrTendril,
+            system_id: StrTendril,
+        ) {
+            self.rcdom
+                .append_doctype_to_document(name, public_id, system_id);
+        }
+
+        fn get_template_contents(&self, target: &Handle) -> Handle {
+            self.rcdom.get_template_contents(target)
+        }
+
+        fn same_node(&self, x: &Handle, y: &Handle) -> bool {
+            self.rcdom.same_node(x, y)
+        }
+
+        fn set_quirks_mode(&self, mode: QuirksMode) {
+            self.rcdom.set_quirks_mode(mode)
+        }
+
+        fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
+            self.rcdom.append_before_sibling(sibling, child)
+        }
+
+        fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
+            self.rcdom.add_attrs_if_missing(target, attrs);
+        }
+
+        fn remove_from_parent(&self, target: &Handle) {
+            self.rcdom.remove_from_parent(target);
+        }
+
+        fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
+            self.rcdom.reparent_children(node, new_parent);
+        }
+
+        fn set_current_byte(&self, byte_offset: u64) { // not forwarded: stored here for `create_element`
+            self.current_byte.set(byte_offset);
+        }
+    }
+
+    fn parse(input: &str) -> ByteCapturingDOM { // parse `input` with default options; returns the sink
+        let sink = ByteCapturingDOM::new();
+        driver::parse_document(sink, Default::default()).one(StrTendril::from(input))
+    }
+
+    #[test]
+    fn element_byte_offsets_match_source_positions() { // sibling elements, no explicit html/head/body
+        let result = parse("<p>hello</p><div>world</div>");
+        let elems = result.content_elements();
+
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, p and div, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0)); // implied elements are expected at offset 0
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("p".to_string(), 0)); // `<p>` opens the input
+        assert_eq!(elems[4], ("div".to_string(), 12)); // "<p>hello</p>" is 3 + 5 + 4 = 12 bytes
+    }
+
+    #[test]
+    fn nested_element_byte_offset() { // a child element reports the offset of its own opening tag
+        let result = parse("<div><span>x</span></div>");
+        let elems = result.content_elements();
+
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, div and span, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0)); // implied elements are expected at offset 0
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("div".to_string(), 0)); // `<div>` opens the input
+        assert_eq!(elems[4], ("span".to_string(), 5)); // "<div>" is 5 bytes
+    }
+
+    #[test]
+    fn explicit_html_head_body_offsets() { // explicit html/head/body tags count towards the offsets
+        let result = parse("<html><head></head><body><p>hi</p></body></html>");
+        let elems = result.content_elements();
+
+        assert_eq!(
+            elems.len(),
+            4,
+            "expected html, head, body, p, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0)); // `<html>` opens the input
+        assert_eq!(elems[1], ("head".to_string(), 6)); // after "<html>" (6 bytes)
+        assert_eq!(elems[2], ("body".to_string(), 19)); // 6 + "<head></head>" (13 bytes)
+        assert_eq!(elems[3], ("p".to_string(), 25)); // 19 + "<body>" (6 bytes)
+    }
+
+    #[test]
+    /// `<span>` must start at byte 12: é is 2 bytes in UTF-8, so neither the char offset 11 nor 13.
+    fn multibyte_content_does_not_shift_subsequent_offsets() {
+        let result = parse("<p>café</p><span>next</span>");
+        let elems = result.content_elements();
+
+        assert_eq!(
+            elems.len(),
+            5,
+            "expected html, head, body, p and span, got: {:?}",
+            elems
+        );
+        assert_eq!(elems[0], ("html".to_string(), 0)); // implied elements are expected at offset 0
+        assert_eq!(elems[1], ("head".to_string(), 0));
+        assert_eq!(elems[2], ("body".to_string(), 0));
+        assert_eq!(elems[3], ("p".to_string(), 0)); // `<p>` opens the input
+        assert_eq!(elems[4], ("span".to_string(), 12)); // 3 ("<p>") + 5 ("café") + 4 ("</p>") bytes
+    }
+}
diff --git a/rcdom/tests/xhtml-self-closing-integration.rs b/rcdom/tests/xhtml-self-closing-integration.rs
new file mode 100644
index 00000000..ae3a5d4a
--- /dev/null
+++ b/rcdom/tests/xhtml-self-closing-integration.rs
@@ -0,0 +1,108 @@
+//! Integration tests for the `xhtml-self-closing` feature.
+//!
+//! EPUB content is XHTML and uses self-closing syntax on RCDATA/RAWTEXT
+//! elements such as `<title/>` and `<style/>`. Without `xhtml-self-closing`,
+//! html5ever treats these as opening tags and enters raw-text mode, consuming
+//! the rest of the document.
+
+#[cfg(feature = "xhtml-self-closing")]
+mod xhtml_self_closing {
+    use html5ever::driver;
+    use html5ever::tendril::stream::TendrilSink;
+    use html5ever::tendril::StrTendril;
+    use markup5ever_rcdom::{NodeData, RcDom};
+
+    fn parse(input: &str) -> RcDom { // parse `input` into a fresh `RcDom` with default options
+        driver::parse_document(RcDom::default(), Default::default()).one(StrTendril::from(input))
+    }
+
+    /// Walk the tree rooted at `node` and collect all element local names, parents before children.
+    fn element_names(node: &markup5ever_rcdom::Handle) -> Vec<String> {
+        let mut names = Vec::new();
+        collect_names(node, &mut names); // recursive helper fills `names`
+        names
+    }
+
+    fn collect_names(node: &markup5ever_rcdom::Handle, out: &mut Vec<String>) { // recursive walk
+        if let NodeData::Element { ref name, .. } = node.data {
+            out.push(name.local.to_string()); // only element nodes contribute a name
+        }
+        for child in node.children.borrow().iter() {
+            collect_names(child, out); // descend into every child, whatever the node kind
+        }
+    }
+
+    /// Return the direct text content of the first element named `tag`, or `None` if there is none.
+    fn text_of(dom: &RcDom, tag: &str) -> Option<String> {
+        find_text(&dom.document, tag) // search starts at the document node
+    }
+
+    fn find_text(node: &markup5ever_rcdom::Handle, tag: &str) -> Option<String> { // depth-first search
+        if let NodeData::Element { ref name, .. } = node.data {
+            if name.local.as_ref() == tag { // match on the local name only
+                let mut text = String::new();
+                for child in node.children.borrow().iter() {
+                    if let NodeData::Text { ref contents } = child.data {
+                        text.push_str(&contents.borrow()); // direct text children only, no descent
+                    }
+                }
+                return Some(text); // a match with no text children yields Some("")
+            }
+        }
+        for child in node.children.borrow().iter() {
+            if let Some(t) = find_text(child, tag) {
+                return Some(t); // first match in document order wins
+            }
+        }
+        None
+    }
+
+    #[test]
+    fn self_closing_title_does_not_swallow_body() { // `<title/>` must not swallow what follows it
+        let dom = parse("<html><head><title/></head><body><p>visible</p></body></html>");
+        let names = element_names(&dom.document);
+
+        assert!(
+            names.contains(&"body".to_string()),
+            "body element should be present; got: {:?}",
+            names
+        );
+        assert!(
+            names.contains(&"p".to_string()),
+            "p element inside body should be present; got: {:?}",
+            names
+        );
+
+        let text = text_of(&dom, "p"); // the paragraph must also keep its own text
+        assert_eq!(
+            text.as_deref(),
+            Some("visible"),
+            "<p> text should be 'visible', got: {:?}",
+            text
+        );
+    }
+
+    #[test]
+    fn self_closing_style_does_not_swallow_body() { // same check for `<style/>`
+        let dom = parse("<html><head><style/></head><body><p>content</p></body></html>");
+        let names = element_names(&dom.document);
+
+        assert!(
+            names.contains(&"p".to_string()), // `<p>` follows `<style/>`, so it must still be parsed
+            "p element should not be swallowed by <style/>; got: {:?}",
+            names
+        );
+    }
+
+    #[test]
+    fn normal_closed_title_still_captures_rcdata_text() { // regression guard for `<title>…</title>`
+        let dom = parse("<html><head><title>My Book</title></head><body></body></html>");
+        let text = text_of(&dom, "title"); // the title text must still land inside `<title>`
+        assert_eq!(
+            text.as_deref(),
+            Some("My Book"),
+            "title text should be 'My Book', got: {:?}",
+            text
+        );
+    }
+}