diff --git a/Cargo.lock b/Cargo.lock
index 9574b656..5f2f2fc8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7,6 +7,7 @@ name = "adblock"
 version = "0.11.1"
 dependencies = [
  "addr",
+ "arrayvec",
  "base64",
  "bitflags",
  "criterion",
@@ -105,6 +106,12 @@ version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
 
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 2a615ad0..b9fa55e5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -39,6 +39,7 @@ rustc-hash = { version = "1.1.0", default-features = false }
 memchr = "2.4"
 base64 = "0.22"
 rmp-serde = "0.15"
+arrayvec = "0.7"
 cssparser = { version = "0.34", optional = true }
 selectors = { version = "0.26", optional = true }
 precomputed-hash = "0.1"
diff --git a/src/filters/fb_network_builder.rs b/src/filters/fb_network_builder.rs
index e569a3fb..ebbeb940 100644
--- a/src/filters/fb_network_builder.rs
+++ b/src/filters/fb_network_builder.rs
@@ -7,6 +7,7 @@ use flatbuffers::WIPOffset;
 use crate::filters::fb_builder::EngineFlatBuilder;
 use crate::filters::network::{FilterTokens, NetworkFilter};
 use crate::filters::token_selector::TokenSelector;
+use crate::utils::TokensBuffer;
 
 use crate::filters::network::NetworkFilterMaskHelper;
 use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder;
@@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
         let mut optimizable = HashMap::<Hash, Vec<NetworkFilter>>::new();
         let mut token_frequencies = TokenSelector::new(rule_list.filters.len());
+        let mut tokens_buffer = TokensBuffer::default();
 
         {
             for network_filter in rule_list.filters {
@@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
                     }
                 };
 
-                let multi_tokens = network_filter.get_tokens_optimized();
+                let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer);
                 match multi_tokens {
                     FilterTokens::Empty => {
                         // No tokens, add to fallback bucket (token 0)
@@ -171,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
                         }
                     }
                     FilterTokens::Other(tokens) => {
                        let best_token = token_frequencies.select_least_used_token(tokens);
-                        let best_token = token_frequencies.select_least_used_token(&tokens);
+                        let best_token = token_frequencies.select_least_used_token(tokens);
                         token_frequencies.record_usage(best_token);
                         store_filter(best_token);
                     }
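Note on the builder change above: the scratch buffer is hoisted out of the per-filter loop, so tokenization no longer allocates once per filter. A minimal self-contained sketch of the reuse pattern, not the crate's actual API: `collect_tokens` is a hypothetical stand-in for `get_tokens_optimized`, and the hash is faked with `word.len()`.

    use arrayvec::ArrayVec;

    type Hash = u64;
    type TokensBuffer = ArrayVec<Hash, 128>;

    // The callee clears the buffer itself, so call sites just thread it through.
    fn collect_tokens<'a>(input: &str, buf: &'a mut TokensBuffer) -> &'a [Hash] {
        buf.clear();
        for word in input.split_whitespace() {
            if !buf.is_full() {
                buf.push(word.len() as Hash); // stand-in for utils::fast_hash(word)
            }
        }
        buf.as_slice()
    }

    fn main() {
        // One fixed-capacity buffer for the whole loop, as in the builder.
        let mut tokens_buffer = TokensBuffer::default();
        for filter in ["||example.com^ banner", "/ads/* script"] {
            let tokens = collect_tokens(filter, &mut tokens_buffer);
            println!("{filter}: {} tokens", tokens.len());
        }
    }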
diff --git a/src/filters/network.rs b/src/filters/network.rs
index 043b7825..01b73baf 100644
--- a/src/filters/network.rs
+++ b/src/filters/network.rs
@@ -15,9 +15,7 @@ use crate::filters::abstract_network::{
 use crate::lists::ParseOptions;
 use crate::regex_manager::RegexManager;
 use crate::request;
-use crate::utils::{self, Hash};
-
-pub(crate) const TOKENS_BUFFER_SIZE: usize = 200;
+use crate::utils::{self, Hash, TokensBuffer};
 
 /// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns.
 static VALID_PARAM: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap());
@@ -312,10 +310,10 @@ pub enum FilterPart {
 }
 
 #[derive(Debug, PartialEq)]
-pub enum FilterTokens {
+pub enum FilterTokens<'a> {
     Empty,
-    OptDomains(Vec<Hash>),
-    Other(Vec<Hash>),
+    OptDomains(&'a [Hash]),
+    Other(&'a [Hash]),
 }
 
 pub struct FilterPartIterator<'a> {
@@ -885,17 +883,21 @@ impl NetworkFilter {
     #[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")]
     pub fn get_tokens(&self) -> Vec<Vec<Hash>> {
-        match self.get_tokens_optimized() {
+        let mut tokens_buffer = TokensBuffer::default();
+        match self.get_tokens_optimized(&mut tokens_buffer) {
             FilterTokens::OptDomains(domains) => {
-                domains.into_iter().map(|domain| vec![domain]).collect()
+                domains.iter().map(|domain| vec![*domain]).collect()
             }
-            FilterTokens::Other(tokens) => vec![tokens],
+            FilterTokens::Other(tokens) => vec![tokens.to_vec()],
             FilterTokens::Empty => vec![],
         }
     }
 
-    pub fn get_tokens_optimized(&self) -> FilterTokens {
-        let mut tokens: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+    pub fn get_tokens_optimized<'a>(
+        &'a self,
+        tokens_buffer: &'a mut TokensBuffer,
+    ) -> FilterTokens<'a> {
+        tokens_buffer.clear();
 
         // If there is only one domain and no domain negation, we also use this
         // domain as a token.
@@ -905,7 +907,7 @@ impl NetworkFilter {
         {
             if let Some(domains) = self.opt_domains.as_ref() {
                 if let Some(domain) = domains.first() {
-                    tokens.push(*domain)
+                    tokens_buffer.push(*domain);
                 }
             }
         }
@@ -918,7 +920,7 @@ impl NetworkFilter {
                     (self.is_plain() || self.is_regex()) && !self.is_right_anchor();
                 let skip_first_token = self.is_right_anchor();
 
-                utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens);
+                utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer);
             }
         }
         FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request
@@ -928,45 +930,43 @@ impl NetworkFilter {
         // Append tokens from hostname, if any
         if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) {
             if let Some(hostname) = self.hostname.as_ref() {
-                utils::tokenize_to(hostname, &mut tokens);
+                utils::tokenize_to(hostname, tokens_buffer);
             }
         } else if let Some(hostname) = self.hostname.as_ref() {
             // Find last dot to tokenize the prefix
             let last_dot_pos = hostname.rfind('.');
             if let Some(last_dot_pos) = last_dot_pos {
-                utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens);
+                utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer);
             }
         }
 
-        if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
+        if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
             if let Some(removeparam) = &self.modifier_option {
                 if VALID_PARAM.is_match(removeparam) {
-                    utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens);
+                    utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer);
                 }
             }
         }
 
         // If we got no tokens for the filter/hostname part, then we will dispatch
         // this filter in multiple buckets based on the domains option.
-        if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() {
+        if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none()
+        {
             if let Some(opt_domains) = self.opt_domains.as_ref() {
                 if !opt_domains.is_empty() {
-                    return FilterTokens::OptDomains(opt_domains.clone());
+                    return FilterTokens::OptDomains(opt_domains);
                }
            }
             FilterTokens::Empty
         } else {
             // Add optional token for protocol
             if self.for_http() && !self.for_https() {
-                tokens.push(utils::fast_hash("http"));
+                tokens_buffer.push(utils::fast_hash("http"));
             } else if self.for_https() && !self.for_http() {
-                tokens.push(utils::fast_hash("https"));
+                tokens_buffer.push(utils::fast_hash("https"));
             }
 
-            // Remake a vector to drop extra capacity.
-            let mut t = Vec::with_capacity(tokens.len());
-            t.extend(tokens);
-            FilterTokens::Other(t)
+            FilterTokens::Other(tokens_buffer.as_slice())
         }
     }
 }
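The `FilterTokens<'a>` change above ties the returned tokens to the caller's buffer instead of handing out owned `Vec`s. One consequence worth spelling out: the result borrows `tokens_buffer` via the `&'a mut` argument, so it must be consumed before the buffer (or the filter, which shares `'a`) can be touched again. A reduced sketch of that contract with stand-in types; the real method also takes `&'a self`, omitted here for brevity.

    use arrayvec::ArrayVec;

    type Hash = u64;
    type TokensBuffer = ArrayVec<Hash, 128>;

    #[derive(Debug, PartialEq)]
    enum FilterTokens<'a> {
        Empty,
        Other(&'a [Hash]),
    }

    fn get_tokens_optimized<'a>(buf: &'a mut TokensBuffer, input: &[Hash]) -> FilterTokens<'a> {
        buf.clear(); // the callee resets the buffer; callers never do
        buf.extend(input.iter().copied());
        if buf.is_empty() {
            FilterTokens::Empty
        } else {
            FilterTokens::Other(buf.as_slice())
        }
    }

    fn main() {
        let mut buf = TokensBuffer::default();
        // The returned enum keeps `buf` (mutably) borrowed for its lifetime...
        assert_eq!(get_tokens_optimized(&mut buf, &[42]), FilterTokens::Other(&[42]));
        // ...so the buffer can only be reused once the previous result is dropped.
        assert_eq!(get_tokens_optimized(&mut buf, &[]), FilterTokens::Empty);
    }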
diff --git a/src/request.rs b/src/request.rs
index 5d853cb4..fd5409f1 100644
--- a/src/request.rs
+++ b/src/request.rs
@@ -239,11 +239,11 @@ impl Request {
     }
 
     fn calculate_tokens(url_lower_cased: &str) -> Vec<utils::Hash> {
-        let mut tokens = vec![];
+        let mut tokens = utils::TokensBuffer::default();
         utils::tokenize_pooled(url_lower_cased, &mut tokens);
         // Add zero token as a fallback to wildcard rule bucket
         tokens.push(0);
-        tokens
+        tokens.into_iter().collect()
     }
 
     #[cfg(test)]
diff --git a/src/utils.rs b/src/utils.rs
index b9cbe995..056f1dcf 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -6,6 +6,8 @@ use seahash::hash;
 #[cfg(target_pointer_width = "32")]
 use seahash::reference::hash;
 
+pub use arrayvec::ArrayVec;
+
 pub type Hash = u64;
 
 // A smaller version of Hash that is used in serialized format.
@@ -27,16 +29,14 @@ fn is_allowed_filter(ch: char) -> bool {
     ch.is_alphanumeric() || ch == '%'
 }
 
-pub(crate) const TOKENS_BUFFER_SIZE: usize = 128;
-pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1;
-const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED;
+pub type TokensBuffer = ArrayVec<Hash, 128>;
 
 fn fast_tokenizer_no_regex(
     pattern: &str,
     is_allowed_code: &dyn Fn(char) -> bool,
     skip_first_token: bool,
     skip_last_token: bool,
-    tokens_buffer: &mut Vec<Hash>,
+    tokens_buffer: &mut TokensBuffer,
 ) {
     // let mut tokens_buffer_index = 0;
     let mut inside: bool = false;
@@ -44,8 +44,8 @@ fn fast_tokenizer_no_regex(
     let mut preceding_ch: Option<char> = None; // Used to check if a '*' is not just before a token
 
     for (i, c) in pattern.char_indices() {
-        if tokens_buffer.len() >= TOKENS_MAX {
-            return;
+        if tokens_buffer.capacity() - tokens_buffer.len() <= 1 {
+            return; // reserve one free slot for the zero token
         }
         if is_allowed_code(c) {
             if !inside {
@@ -75,17 +75,17 @@ fn fast_tokenizer_no_regex(
     }
 }
 
-pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
+pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) {
     fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
 }
 
 pub fn tokenize(pattern: &str) -> Vec<Hash> {
-    let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+    let mut tokens_buffer = TokensBuffer::default();
     tokenize_to(pattern, &mut tokens_buffer);
-    tokens_buffer
+    tokens_buffer.into_iter().collect()
 }
 
-pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
+pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) {
     fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
 }
 
@@ -95,21 +95,21 @@ pub(crate) fn tokenize_filter(
     skip_first_token: bool,
     skip_last_token: bool,
 ) -> Vec<Hash> {
-    let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
+    let mut tokens_buffer = TokensBuffer::default();
     tokenize_filter_to(
         pattern,
         skip_first_token,
         skip_last_token,
         &mut tokens_buffer,
     );
-    tokens_buffer
+    tokens_buffer.into_iter().collect()
 }
 
 pub(crate) fn tokenize_filter_to(
     pattern: &str,
     skip_first_token: bool,
     skip_last_token: bool,
-    tokens_buffer: &mut Vec<Hash>,
+    tokens_buffer: &mut TokensBuffer,
 ) {
     fast_tokenizer_no_regex(
         pattern,
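The capacity arithmetic in `fast_tokenizer_no_regex` is the subtle bit of the utils.rs change: `ArrayVec::push` panics once the fixed capacity is reached, and `Request::calculate_tokens` unconditionally pushes a trailing `0` (the wildcard-bucket token) after tokenization. Stopping one slot short of capacity makes that final push infallible. A self-contained sketch of the invariant, assuming the 128-entry buffer from the alias above:

    use arrayvec::ArrayVec;

    type Hash = u64;
    type TokensBuffer = ArrayVec<Hash, 128>;

    // Mirrors the tokenizer's bail-out: stop one short of capacity.
    fn fill_capped(buf: &mut TokensBuffer, candidates: impl IntoIterator<Item = Hash>) {
        for hash in candidates {
            if buf.capacity() - buf.len() <= 1 {
                return; // leave one free slot for the zero token
            }
            buf.push(hash);
        }
    }

    fn main() {
        let mut buf = TokensBuffer::default();
        fill_capped(&mut buf, 0..1_000); // far more candidates than capacity
        assert_eq!(buf.len(), 127);      // one slot deliberately left free
        buf.push(0);                     // the fallback token always fits...
        assert!(buf.is_full());          // ...and tops the buffer off exactly
    }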
diff --git a/tests/matching.rs b/tests/matching.rs
index d4f26934..e8649c07 100644
--- a/tests/matching.rs
+++ b/tests/matching.rs
@@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() {
     let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests);
     let msg = "The number of blocked/passed requests has changed. ".to_string()
         + "If this is expected, update the expected values in the test.";
-    assert_eq!((blocked, passes), (106860, 136085), "{msg}");
+    assert_eq!((blocked, passes), (106861, 136084), "{msg}");
 }
diff --git a/tests/unit/filters/network.rs b/tests/unit/filters/network.rs
index b22e0a52..6d636831 100644
--- a/tests/unit/filters/network.rs
+++ b/tests/unit/filters/network.rs
@@ -1191,12 +1191,10 @@ mod parse_tests {
     fn test_simple_pattern_tokenization() {
         let rule = "||some.primewire.c*/sw$script,1p";
         let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap();
+        let mut tokens_buffer = utils::TokensBuffer::default();
         assert_eq!(
-            filter.get_tokens_optimized(),
-            FilterTokens::Other(vec![
-                utils::fast_hash("some"),
-                utils::fast_hash("primewire")
-            ])
+            filter.get_tokens_optimized(&mut tokens_buffer),
+            FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")])
         );
     }
 }
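A side effect visible in the updated unit test: with borrowed variants, the expected value can be written as a plain slice literal instead of building `Vec`s. The derived `PartialEq` on `FilterTokens<'a>` compares the pointed-to slices element-wise, so the assertion is like-for-like. A reduced illustration with a stand-in enum:

    type Hash = u64;

    #[derive(Debug, PartialEq)]
    enum FilterTokens<'a> {
        Empty,
        Other(&'a [Hash]),
    }

    fn main() {
        let buf = vec![1u64, 2]; // pretend this is the reused TokensBuffer
        let got = FilterTokens::Other(&buf);
        // `&[Hash; 2]` coerces to `&[Hash]`, and the derive compares contents:
        assert_eq!(got, FilterTokens::Other(&[1, 2]));
        assert_ne!(got, FilterTokens::Empty);
    }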