From 5f6206aba0cbfe27f3993e3ece7e1da07f2e7eb9 Mon Sep 17 00:00:00 2001 From: Jakub Onderka Date: Mon, 8 Jun 2026 21:00:11 +0200 Subject: [PATCH] feat(stdlib): LRU cache for parse_user_agent function --- Cargo.lock | 11 +++ Cargo.toml | 2 + src/stdlib/parse_user_agent.rs | 151 ++++++++++++++++++++++----------- 3 files changed, 113 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 670f4372f4..38e93aba34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1370,6 +1370,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" dependencies = [ "allocator-api2", + "foldhash 0.2.0", +] + +[[package]] +name = "hashlink" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5081f264ed7adee96ea4b4778b6bb9da0a7228b084587aa3bd3ff05da7c5a3b" +dependencies = [ + "hashbrown 0.17.1", ] [[package]] @@ -4296,6 +4306,7 @@ dependencies = [ "flate2", "getrandom 0.3.4", "grok", + "hashlink", "hex", "hmac", "hostname", diff --git a/Cargo.toml b/Cargo.toml index e022516966..cec963d7a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -151,6 +151,7 @@ stdlib-base = [ "dep:dns-lookup", "dep:flate2", "dep:grok", + "dep:hashlink", "dep:hostname", "dep:iana-time-zone", "dep:idna", @@ -301,6 +302,7 @@ reqwest-middleware = { version = "0.4", default-features = false, optional = tru reqwest-retry = { version = "0.8", default-features = false, optional = true } dns-lookup = { version = "3", optional = true } domain = { version = "0.12.0", optional = true, features = ["resolv-sync", "serde"] } +hashlink = { version = "0.12", optional = true } hostname = { version = "0.4", optional = true } grok = { version = "2.4", optional = true } onig = { version = "6", default-features = false, optional = true } diff --git a/src/stdlib/parse_user_agent.rs b/src/stdlib/parse_user_agent.rs index 94648963f7..965b7bc771 100644 --- a/src/stdlib/parse_user_agent.rs +++ b/src/stdlib/parse_user_agent.rs @@ -1,12 +1,8 @@ use crate::compiler::function::EnumVariant; use crate::compiler::prelude::*; -use std::{ - borrow::Cow, - collections::BTreeMap, - fmt, - str::FromStr, - sync::{Arc, LazyLock}, -}; +use crate::value::value::simdutf_bytes_utf8_lossy; +use std::sync::{Arc, Mutex}; +use std::{borrow::Cow, collections::BTreeMap, fmt, str::FromStr, sync::LazyLock}; use woothee::parser::Parser as WootheeParser; static UA_EXTRACTOR: LazyLock = LazyLock::new(|| { @@ -16,7 +12,7 @@ static UA_EXTRACTOR: LazyLock = LazyLock::new(|| { static DEFAULT_MODE: Value = Value::Bytes(Bytes::from_static("fast".as_bytes())); -static MODE_ENUM: &[EnumVariant] = &[ +const MODE_ENUM: &[EnumVariant] = &[ EnumVariant { value: "fast", description: "Fastest mode but most unreliable. Uses parser from project [Woothee](https://github.com/woothee/woothee).", @@ -48,6 +44,11 @@ const PARAMETERS: &[Parameter] = &[ ) .default(&DEFAULT_MODE) .enum_variants(MODE_ENUM), + Parameter::optional( + "cache", + kind::INTEGER, + "Defines how many parsed agents can be stored in cache.", + ), ]; #[derive(Clone, Copy, Debug)] @@ -195,61 +196,101 @@ impl Function for ParseUserAgent { .map(|s| Mode::from_str(&s).expect("validated enum")) .expect("mode not bytes"); - let parser = match mode { - Mode::Fast => { - let parser = WootheeParser::new(); + let cache = arguments + .optional_literal("cache", state)? + .map(|c| { + c.clone() + .try_integer() + .map_err(|_| function::Error::InvalidArgument { + keyword: "cache", + value: c, + error: "must be integer literal", + }) + }) + .transpose()?; + + Ok( + if let Some(cache_size) = cache + && cache_size > 0 + { + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let cache = Arc::new(Mutex::new(hashlink::LruCache::new(cache_size as usize))); + ParseUserAgentWithCacheFn { value, mode, cache }.as_expr() + } else { + ParseUserAgentFn { value, mode }.as_expr() + }, + ) + } +} - Arc::new(move |s: &str| parser.parse_user_agent(s).partial_schema()) as Arc<_> - } - Mode::Reliable => { - let fast = WootheeParser::new(); - let slow = &UA_EXTRACTOR; - - Arc::new(move |s: &str| { - let ua = fast.parse_user_agent(s); - let ua = if ua.browser.family.is_none() || ua.os.family.is_none() { - let better_ua = slow.parse_user_agent(s); - better_ua.or(ua) - } else { - ua - }; - ua.partial_schema() - }) as Arc<_> +fn parse_user_agent(bytes: &Bytes, mode: Mode) -> Value { + let string = simdutf_bytes_utf8_lossy(bytes); + match mode { + Mode::Fast => WootheeParser::new() + .parse_user_agent(&string) + .partial_schema(), + Mode::Reliable => { + let ua = WootheeParser::new().parse_user_agent(&string); + if ua.browser.family.is_none() || ua.os.family.is_none() { + let better_ua = UA_EXTRACTOR.parse_user_agent(&string); + better_ua.or(ua) + } else { + ua } - Mode::Enriched => { - let fast = WootheeParser::new(); - let slow = &UA_EXTRACTOR; - - Arc::new(move |s: &str| { - slow.parse_user_agent(s) - .or(fast.parse_user_agent(s)) - .full_schema() - }) as Arc<_> - } - }; - - Ok(ParseUserAgentFn { - value, - mode, - parser, + .partial_schema() } - .as_expr()) + Mode::Enriched => UA_EXTRACTOR + .parse_user_agent(&string) + .or(WootheeParser::new().parse_user_agent(&string)) + .full_schema(), } } -#[derive(Clone)] +#[derive(Clone, Debug)] struct ParseUserAgentFn { value: Box, mode: Mode, - parser: Arc Value + Send + Sync>, } impl FunctionExpression for ParseUserAgentFn { fn resolve(&self, ctx: &mut Context) -> Resolved { let value = self.value.resolve(ctx)?; - let string = value.try_bytes_utf8_lossy()?; + let bytes = value.try_bytes()?; + Ok(parse_user_agent(&bytes, self.mode)) + } + + fn type_def(&self, _: &state::TypeState) -> TypeDef { + self.mode.type_def() + } +} + +#[derive(Clone)] +struct ParseUserAgentWithCacheFn { + value: Box, + mode: Mode, + cache: Arc>>, +} + +impl FunctionExpression for ParseUserAgentWithCacheFn { + fn resolve(&self, ctx: &mut Context) -> Resolved { + let value = self.value.resolve(ctx)?; + let bytes = value.try_bytes()?; + + if bytes.len() > 512 { + // Do not cache unusually big user agents + return Ok(parse_user_agent(&bytes, self.mode)); + } - Ok((self.parser)(&string)) + if let Some(value) = self.cache.lock().unwrap().get(&bytes) { + return Ok(value.clone()); + } + + let value = parse_user_agent(&bytes, self.mode); + + let cloned = value.clone(); + self.cache.lock().unwrap().insert(bytes, cloned); + + Ok(value) } fn type_def(&self, _: &state::TypeState) -> TypeDef { @@ -257,12 +298,14 @@ impl FunctionExpression for ParseUserAgentFn { } } -impl fmt::Debug for ParseUserAgentFn { +impl fmt::Debug for ParseUserAgentWithCacheFn { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "ParseUserAgentFn{{ value: {:?}, mode: {:?}}}", - self.value, self.mode + "ParseUserAgentWithCacheFn{{ value: {:?}, mode: {:?}, cache: {}}}", + self.value, + self.mode, + self.cache.lock().unwrap().capacity() ) } } @@ -665,6 +708,12 @@ mod tests { tdef: Mode::Fast.type_def(), } + parses_with_cache { + args: func_args![ value: "Mozilla/4.0 (compatible; MSIE 7.66; Windows NT 5.1; SV1)", cache: 10 ], + want: Ok(value!({ browser: { family: "Internet Explorer", version: "7.66" }, device: { category: "pc" }, os: { family: "Windows XP", version: "NT 5.1" } })), + tdef: Mode::Fast.type_def(), + } + unknown_user_agent { args: func_args![ value: "w3m/0.3", mode: "enriched"], want: Ok(value!({ browser: { family: null, major: null, minor: null, patch: null, version: null }, device: { brand: null, category: null, family: null, model: null }, os: { family: null, major: null, minor: null, patch: null, patch_minor: null, version: null } })),