Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,9 @@ libtest-mimic = "0.8.1"
rand = "0.9"
serde_json = "1.0"
typed-arena = "2.0.2"

# Redirect crates.io tendril/web_atoms to the local path crates so that
# markup5ever gets the same crate instance as rcdom and xml5ever.
[patch.crates-io]
tendril = { path = "tendril" }
web_atoms = { path = "web_atoms" }
28 changes: 17 additions & 11 deletions html5ever/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,31 @@ documentation = "https://docs.rs/html5ever"
categories = [ "parser-implementations", "web-programming" ]
keywords = ["html", "html5", "parser", "parsing"]
readme = "../README.md"
version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
edition.workspace = true
rust-version.workspace = true
version = "0.39.0"
license = "MIT OR Apache-2.0"
authors = [ "The html5ever Project Developers" ]
repository = "https://github.com/servo/html5ever"
edition = "2021"
rust-version = "1.71.0"
Comment on lines +8 to +13

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You'll need to bump the version if we publish this, but don't modify the other fields please.


[features]
trace_tokenizer = []
serde = ["markup5ever/serde"]
# Surfaces byte-accurate source positions; see markup5ever for the full description.
source-positions = ["markup5ever/source-positions"]
# Honour the XML/XHTML self-closing syntax (`<title/>`, `<style/>`, …) on
# RCDATA and RAWTEXT elements. Without this, html5ever treats `<title/>`
# as opening an RCDATA region that swallows the rest of the document.
xhtml-self-closing = []

[dependencies]
markup5ever = { workspace = true }
memchr = { workspace = true }
log = { workspace = true }
markup5ever = { version = "0.39", path = "../markup5ever" }
memchr = "2.8.0"
log = "0.4"

[dev-dependencies]
criterion = { workspace = true }
typed-arena = { workspace = true }
criterion = "0.8"
typed-arena = "2.0.2"

[[bench]]
name = "html5ever"
Expand Down
20 changes: 18 additions & 2 deletions html5ever/src/tokenizer/char_ref/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,11 @@ impl CharRefTokenizer {
unconsume.push_char(c)
}

#[cfg(feature = "source-positions")]
let unconsume_len = unconsume.len() as u64;
input.push_front(unconsume);
#[cfg(feature = "source-positions")]
input.retreat_bytes_consumed(unconsume_len);
tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
Status::Done(CharRef::EMPTY)
}
Expand Down Expand Up @@ -292,7 +296,12 @@ impl CharRefTokenizer {
}

fn unconsume_name(&mut self, input: &BufferQueue) {
input.push_front(self.name_buf_opt.take().unwrap());
let name_buf = self.name_buf_opt.take().unwrap();
#[cfg(feature = "source-positions")]
let name_buf_len = name_buf.len() as u64;
input.push_front(name_buf);
#[cfg(feature = "source-positions")]
input.retreat_bytes_consumed(name_buf_len);
}

fn finish_named<Sink: TokenSink>(
Expand Down Expand Up @@ -367,7 +376,12 @@ impl CharRefTokenizer {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
} else {
input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
let unconsumed = StrTendril::from_slice(&self.name_buf()[name_len..]);
#[cfg(feature = "source-positions")]
let unconsumed_len = unconsumed.len() as u64;
input.push_front(unconsumed);
#[cfg(feature = "source-positions")]
input.retreat_bytes_consumed(unconsumed_len);
tokenizer.ignore_lf.set(false);
Status::Done(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
Expand Down Expand Up @@ -419,6 +433,8 @@ impl CharRefTokenizer {
},
State::Octothorpe => {
input.push_front(StrTendril::from_slice("#"));
#[cfg(feature = "source-positions")]
input.retreat_bytes_consumed(1);
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
Status::Done(CharRef::EMPTY)
},
Expand Down
7 changes: 7 additions & 0 deletions html5ever/src/tokenizer/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ pub trait TokenSink {
/// Signal that tokenization reached the end of the document.
fn end(&self) {}

/// Called just before each token is dispatched to [`process_token`],
/// with the number of UTF-8 bytes consumed from the input so far.
///
/// The default implementation is a no-op.
#[cfg(feature = "source-positions")]
fn set_current_byte(&self, _byte_offset: u64) {}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would strongly prefer passing the byte position as an offset to process_token instead of adding a new method that will be called right before.

You can add something like a struct SourcePosition which would contain the line number and (optionally) the byte offset.


/// Used in the [markup declaration open state]. By default, this always
/// returns false and thus all CDATA sections are tokenized as bogus
/// comments.
Expand Down
Loading
Loading