From 6d24c8bf064ac3dcba540b4502a75489eb3cb52f Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Mon, 3 Nov 2025 23:08:56 +0400 Subject: [PATCH 01/11] [perf] use StackVector to save memory & CPU --- src/filters/network.rs | 13 +++------- src/flatbuffers/unsafe_tools.rs | 44 +++++++++++++++++++++++++++++++++ src/request.rs | 4 +-- src/utils.rs | 25 +++++++++---------- 4 files changed, 61 insertions(+), 25 deletions(-) diff --git a/src/filters/network.rs b/src/filters/network.rs index 043b7825..391efa97 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -15,9 +15,7 @@ use crate::filters::abstract_network::{ use crate::lists::ParseOptions; use crate::regex_manager::RegexManager; use crate::request; -use crate::utils::{self, Hash}; - -pub(crate) const TOKENS_BUFFER_SIZE: usize = 200; +use crate::utils::{self, Hash, TokensBuffer}; /// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns. static VALID_PARAM: Lazy = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap()); @@ -895,7 +893,7 @@ impl NetworkFilter { } pub fn get_tokens_optimized(&self) -> FilterTokens { - let mut tokens: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + let mut tokens = TokensBuffer::default(); // If there is only one domain and no domain negation, we also use this // domain as a token. @@ -905,7 +903,7 @@ impl NetworkFilter { { if let Some(domains) = self.opt_domains.as_ref() { if let Some(domain) = domains.first() { - tokens.push(*domain) + tokens.push(*domain); } } } @@ -963,10 +961,7 @@ impl NetworkFilter { tokens.push(utils::fast_hash("https")); } - // Remake a vector to drop extra capacity. - let mut t = Vec::with_capacity(tokens.len()); - t.extend(tokens); - FilterTokens::Other(t) + FilterTokens::Other(tokens.into_vec()) } } } diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index 7d3973c9..9ff709b9 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -1,6 +1,7 @@ //! Unsafe utility functions for working with flatbuffers and other low-level operations. use crate::filters::flatbuffer_generated::fb; +use std::mem::MaybeUninit; // Minimum alignment for the beginning of the flatbuffer data. const MIN_ALIGNMENT: usize = 8; @@ -97,3 +98,46 @@ impl VerifiedFlatbufferMemory { &self.raw_data[self.start..] } } + +/// A simple stack-allocated vector. +/// It is used to avoid allocations when the vector is small. +pub(crate) struct StackVector { + data: [MaybeUninit; MAX_SIZE], + size: usize, +} + +impl Default for StackVector +where + T: Default + Copy, +{ + fn default() -> Self { + Self { + data: [MaybeUninit::uninit(); MAX_SIZE], + size: 0, + } + } +} + +impl StackVector { + pub fn push(&mut self, value: T) -> bool { + if self.size < MAX_SIZE { + self.data[self.size] = MaybeUninit::new(value); + self.size += 1; + true + } else { + false + } + } + + pub fn is_empty(&self) -> bool { + self.size == 0 + } + + pub fn into_vec(self) -> Vec { + let mut v = Vec::with_capacity(self.size); + for i in 0..self.size { + v.push(unsafe { self.data[i].assume_init_read() }); + } + v + } +} diff --git a/src/request.rs b/src/request.rs index 5d853cb4..ce2b96eb 100644 --- a/src/request.rs +++ b/src/request.rs @@ -239,11 +239,11 @@ impl Request { } fn calculate_tokens(url_lower_cased: &str) -> Vec { - let mut tokens = vec![]; + let mut tokens = utils::TokensBuffer::default(); utils::tokenize_pooled(url_lower_cased, &mut tokens); // Add zero token as a fallback to wildcard rule bucket tokens.push(0); - tokens + tokens.into_vec() } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index b9cbe995..a7352789 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,6 +6,8 @@ use seahash::hash; #[cfg(target_pointer_width = "32")] use seahash::reference::hash; +use crate::flatbuffers::unsafe_tools::StackVector; + pub type Hash = u64; // A smaller version of Hash that is used in serialized format. @@ -27,16 +29,14 @@ fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } -pub(crate) const TOKENS_BUFFER_SIZE: usize = 128; -pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1; -const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED; +pub(crate) type TokensBuffer = StackVector; fn fast_tokenizer_no_regex( pattern: &str, is_allowed_code: &dyn Fn(char) -> bool, skip_first_token: bool, skip_last_token: bool, - tokens_buffer: &mut Vec, + tokens_buffer: &mut TokensBuffer, ) { // let mut tokens_buffer_index = 0; let mut inside: bool = false; @@ -44,9 +44,6 @@ fn fast_tokenizer_no_regex( let mut preceding_ch: Option = None; // Used to check if a '*' is not just before a token for (i, c) in pattern.char_indices() { - if tokens_buffer.len() >= TOKENS_MAX { - return; - } if is_allowed_code(c) { if !inside { inside = true; @@ -75,17 +72,17 @@ fn fast_tokenizer_no_regex( } } -pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec) { +pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) { fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer); } pub fn tokenize(pattern: &str) -> Vec { - let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + let mut tokens_buffer = TokensBuffer::default(); tokenize_to(pattern, &mut tokens_buffer); - tokens_buffer + tokens_buffer.into_vec() } -pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec) { +pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) { fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer); } @@ -95,21 +92,21 @@ pub(crate) fn tokenize_filter( skip_first_token: bool, skip_last_token: bool, ) -> Vec { - let mut tokens_buffer: Vec = Vec::with_capacity(TOKENS_BUFFER_SIZE); + let mut tokens_buffer = TokensBuffer::default(); tokenize_filter_to( pattern, skip_first_token, skip_last_token, &mut tokens_buffer, ); - tokens_buffer + tokens_buffer.into_vec() } pub(crate) fn tokenize_filter_to( pattern: &str, skip_first_token: bool, skip_last_token: bool, - tokens_buffer: &mut Vec, + tokens_buffer: &mut TokensBuffer, ) { fast_tokenizer_no_regex( pattern, From a48509d9b2f343836b5f709833cf63accc87b9a6 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 14:16:08 +0400 Subject: [PATCH 02/11] Change get_tokens_optimized() --- src/filters/fb_network_builder.rs | 4 +++- src/filters/network.rs | 39 ++++++++++++++++--------------- src/flatbuffers/unsafe_tools.rs | 10 +++++++- src/utils.rs | 2 +- tests/unit/filters/network.rs | 5 ++-- 5 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/filters/fb_network_builder.rs b/src/filters/fb_network_builder.rs index e569a3fb..b186dac3 100644 --- a/src/filters/fb_network_builder.rs +++ b/src/filters/fb_network_builder.rs @@ -7,6 +7,7 @@ use flatbuffers::WIPOffset; use crate::filters::fb_builder::EngineFlatBuilder; use crate::filters::network::{FilterTokens, NetworkFilter}; use crate::filters::token_selector::TokenSelector; +use crate::utils::TokensBuffer; use crate::filters::network::NetworkFilterMaskHelper; use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder; @@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { let mut optimizable = HashMap::>::new(); let mut token_frequencies = TokenSelector::new(rule_list.filters.len()); + let mut tokens_buffer = TokensBuffer::default(); { for network_filter in rule_list.filters { @@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { } }; - let multi_tokens = network_filter.get_tokens_optimized(); + let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer); match multi_tokens { FilterTokens::Empty => { // No tokens, add to fallback bucket (token 0) diff --git a/src/filters/network.rs b/src/filters/network.rs index 391efa97..a8cde5b6 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -310,10 +310,10 @@ pub enum FilterPart { } #[derive(Debug, PartialEq)] -pub enum FilterTokens { +pub enum FilterTokens<'a> { Empty, - OptDomains(Vec), - Other(Vec), + OptDomains(&'a [Hash]), + Other(&'a [Hash]), } pub struct FilterPartIterator<'a> { @@ -883,17 +883,18 @@ impl NetworkFilter { #[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")] pub fn get_tokens(&self) -> Vec> { - match self.get_tokens_optimized() { + let mut tokens_buffer = TokensBuffer::default(); + match self.get_tokens_optimized(&mut tokens_buffer) { FilterTokens::OptDomains(domains) => { - domains.into_iter().map(|domain| vec![domain]).collect() + domains.into_iter().map(|domain| vec![*domain]).collect() } - FilterTokens::Other(tokens) => vec![tokens], + FilterTokens::Other(tokens) => vec![tokens.to_vec()], FilterTokens::Empty => vec![], } } - pub fn get_tokens_optimized(&self) -> FilterTokens { - let mut tokens = TokensBuffer::default(); + pub fn get_tokens_optimized<'a>(&'a self, tokens_buffer: &'a mut TokensBuffer) -> FilterTokens<'a> { + tokens_buffer.clear(); // If there is only one domain and no domain negation, we also use this // domain as a token. @@ -903,7 +904,7 @@ impl NetworkFilter { { if let Some(domains) = self.opt_domains.as_ref() { if let Some(domain) = domains.first() { - tokens.push(*domain); + tokens_buffer.push(*domain); } } } @@ -916,7 +917,7 @@ impl NetworkFilter { (self.is_plain() || self.is_regex()) && !self.is_right_anchor(); let skip_first_token = self.is_right_anchor(); - utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens); + utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer); } } FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request @@ -926,42 +927,42 @@ impl NetworkFilter { // Append tokens from hostname, if any if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) { if let Some(hostname) = self.hostname.as_ref() { - utils::tokenize_to(hostname, &mut tokens); + utils::tokenize_to(hostname, tokens_buffer); } } else if let Some(hostname) = self.hostname.as_ref() { // Find last dot to tokenize the prefix let last_dot_pos = hostname.rfind('.'); if let Some(last_dot_pos) = last_dot_pos { - utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens); + utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer); } } - if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { + if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) { if let Some(removeparam) = &self.modifier_option { if VALID_PARAM.is_match(removeparam) { - utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens); + utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer); } } } // If we got no tokens for the filter/hostname part, then we will dispatch // this filter in multiple buckets based on the domains option. - if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() { + if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() { if let Some(opt_domains) = self.opt_domains.as_ref() { if !opt_domains.is_empty() { - return FilterTokens::OptDomains(opt_domains.clone()); + return FilterTokens::OptDomains(opt_domains); } } FilterTokens::Empty } else { // Add optional token for protocol if self.for_http() && !self.for_https() { - tokens.push(utils::fast_hash("http")); + tokens_buffer.push(utils::fast_hash("http")); } else if self.for_https() && !self.for_http() { - tokens.push(utils::fast_hash("https")); + tokens_buffer.push(utils::fast_hash("https")); } - FilterTokens::Other(tokens.into_vec()) + FilterTokens::Other(tokens_buffer.as_slice()) } } } diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index 9ff709b9..d6ea490c 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -101,7 +101,7 @@ impl VerifiedFlatbufferMemory { /// A simple stack-allocated vector. /// It is used to avoid allocations when the vector is small. -pub(crate) struct StackVector { +pub struct StackVector { data: [MaybeUninit; MAX_SIZE], size: usize, } @@ -133,6 +133,14 @@ impl StackVector { self.size == 0 } + pub fn clear(&mut self) { + self.size = 0; + } + + pub fn as_slice(&self) -> &[T] { + unsafe { std::slice::from_raw_parts(self.data.as_ptr() as *const T, self.size) } + } + pub fn into_vec(self) -> Vec { let mut v = Vec::with_capacity(self.size); for i in 0..self.size { diff --git a/src/utils.rs b/src/utils.rs index a7352789..1338816a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -29,7 +29,7 @@ fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } -pub(crate) type TokensBuffer = StackVector; +pub type TokensBuffer = StackVector; fn fast_tokenizer_no_regex( pattern: &str, diff --git a/tests/unit/filters/network.rs b/tests/unit/filters/network.rs index b22e0a52..042d1cb5 100644 --- a/tests/unit/filters/network.rs +++ b/tests/unit/filters/network.rs @@ -1191,9 +1191,10 @@ mod parse_tests { fn test_simple_pattern_tokenization() { let rule = "||some.primewire.c*/sw$script,1p"; let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap(); + let mut tokens_buffer = utils::TokensBuffer::default(); assert_eq!( - filter.get_tokens_optimized(), - FilterTokens::Other(vec![ + filter.get_tokens_optimized(&mut tokens_buffer), + FilterTokens::Other(&[ utils::fast_hash("some"), utils::fast_hash("primewire") ]) From 5c1e3bf37c49ee0a78d13a07465e963390ccec9e Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 14:17:00 +0400 Subject: [PATCH 03/11] cargo fmt --- src/filters/network.rs | 8 ++++++-- tests/unit/filters/network.rs | 5 +---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/filters/network.rs b/src/filters/network.rs index a8cde5b6..9fb719f8 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -893,7 +893,10 @@ impl NetworkFilter { } } - pub fn get_tokens_optimized<'a>(&'a self, tokens_buffer: &'a mut TokensBuffer) -> FilterTokens<'a> { + pub fn get_tokens_optimized<'a>( + &'a self, + tokens_buffer: &'a mut TokensBuffer, + ) -> FilterTokens<'a> { tokens_buffer.clear(); // If there is only one domain and no domain negation, we also use this @@ -947,7 +950,8 @@ impl NetworkFilter { // If we got no tokens for the filter/hostname part, then we will dispatch // this filter in multiple buckets based on the domains option. - if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() { + if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() + { if let Some(opt_domains) = self.opt_domains.as_ref() { if !opt_domains.is_empty() { return FilterTokens::OptDomains(opt_domains); diff --git a/tests/unit/filters/network.rs b/tests/unit/filters/network.rs index 042d1cb5..6d636831 100644 --- a/tests/unit/filters/network.rs +++ b/tests/unit/filters/network.rs @@ -1194,10 +1194,7 @@ mod parse_tests { let mut tokens_buffer = utils::TokensBuffer::default(); assert_eq!( filter.get_tokens_optimized(&mut tokens_buffer), - FilterTokens::Other(&[ - utils::fast_hash("some"), - utils::fast_hash("primewire") - ]) + FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")]) ); } } From 2db0ac208a8b330d9ded879fe9ffacb2c41df23f Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 14:22:19 +0400 Subject: [PATCH 04/11] Fix the test expectations --- tests/matching.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/matching.rs b/tests/matching.rs index d4f26934..e8649c07 100644 --- a/tests/matching.rs +++ b/tests/matching.rs @@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() { let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests); let msg = "The number of blocked/passed requests has changed. ".to_string() + "If this is expected, update the expected values in the test."; - assert_eq!((blocked, passes), (106860, 136085), "{msg}"); + assert_eq!((blocked, passes), (106861, 136084), "{msg}"); } From 906e550e183df88e468cccb87d74bea97068993c Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 14:25:21 +0400 Subject: [PATCH 05/11] Fix clippy --- src/filters/fb_network_builder.rs | 2 +- src/filters/network.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/filters/fb_network_builder.rs b/src/filters/fb_network_builder.rs index b186dac3..ebbeb940 100644 --- a/src/filters/fb_network_builder.rs +++ b/src/filters/fb_network_builder.rs @@ -173,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder { } } FilterTokens::Other(tokens) => { - let best_token = token_frequencies.select_least_used_token(&tokens); + let best_token = token_frequencies.select_least_used_token(tokens); token_frequencies.record_usage(best_token); store_filter(best_token); } diff --git a/src/filters/network.rs b/src/filters/network.rs index 9fb719f8..01b73baf 100644 --- a/src/filters/network.rs +++ b/src/filters/network.rs @@ -886,7 +886,7 @@ impl NetworkFilter { let mut tokens_buffer = TokensBuffer::default(); match self.get_tokens_optimized(&mut tokens_buffer) { FilterTokens::OptDomains(domains) => { - domains.into_iter().map(|domain| vec![*domain]).collect() + domains.iter().map(|domain| vec![*domain]).collect() } FilterTokens::Other(tokens) => vec![tokens.to_vec()], FilterTokens::Empty => vec![], From cba077f2940f3a1b150f6655536012bb2b88cad2 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 16:06:13 +0400 Subject: [PATCH 06/11] add drop() --- src/flatbuffers/unsafe_tools.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index d6ea490c..b454f62c 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -118,6 +118,14 @@ where } } +impl Drop for StackVector { + fn drop(&mut self) { + for i in 0..self.size { + unsafe { self.data[i].assume_init_drop() }; + } + } +} + impl StackVector { pub fn push(&mut self, value: T) -> bool { if self.size < MAX_SIZE { @@ -134,6 +142,9 @@ impl StackVector { } pub fn clear(&mut self) { + for i in 0..self.size { + unsafe { self.data[i].assume_init_drop() }; + } self.size = 0; } From c36bbe85b94c9f23885d6c7d59898e40ac39c08b Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 16:17:20 +0400 Subject: [PATCH 07/11] Remove unsafe{} --- src/flatbuffers/unsafe_tools.rs | 37 +++++++++++---------------------- src/utils.rs | 2 +- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index b454f62c..7441f560 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -1,7 +1,6 @@ //! Unsafe utility functions for working with flatbuffers and other low-level operations. use crate::filters::flatbuffer_generated::fb; -use std::mem::MaybeUninit; // Minimum alignment for the beginning of the flatbuffer data. const MIN_ALIGNMENT: usize = 8; @@ -99,37 +98,27 @@ impl VerifiedFlatbufferMemory { } } -/// A simple stack-allocated vector. -/// It is used to avoid allocations when the vector is small. +/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization. +/// All elements are initialized to T::default(), and we track the logical size separately. +/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency. pub struct StackVector { - data: [MaybeUninit; MAX_SIZE], + data: [T; MAX_SIZE], size: usize, } -impl Default for StackVector -where - T: Default + Copy, -{ +impl Default for StackVector { fn default() -> Self { Self { - data: [MaybeUninit::uninit(); MAX_SIZE], + data: [T::default(); MAX_SIZE], size: 0, } } } -impl Drop for StackVector { - fn drop(&mut self) { - for i in 0..self.size { - unsafe { self.data[i].assume_init_drop() }; - } - } -} - -impl StackVector { +impl StackVector { pub fn push(&mut self, value: T) -> bool { if self.size < MAX_SIZE { - self.data[self.size] = MaybeUninit::new(value); + self.data[self.size] = value; self.size += 1; true } else { @@ -142,21 +131,19 @@ impl StackVector { } pub fn clear(&mut self) { - for i in 0..self.size { - unsafe { self.data[i].assume_init_drop() }; - } self.size = 0; } pub fn as_slice(&self) -> &[T] { - unsafe { std::slice::from_raw_parts(self.data.as_ptr() as *const T, self.size) } + &self.data[..self.size] } - pub fn into_vec(self) -> Vec { + pub fn into_vec(mut self) -> Vec { let mut v = Vec::with_capacity(self.size); for i in 0..self.size { - v.push(unsafe { self.data[i].assume_init_read() }); + v.push(std::mem::take(&mut self.data[i])); } + self.size = 0; v } } diff --git a/src/utils.rs b/src/utils.rs index 1338816a..6918b21e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -29,7 +29,7 @@ fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } -pub type TokensBuffer = StackVector; +pub type TokensBuffer = StackVector; fn fast_tokenizer_no_regex( pattern: &str, From d6a346bcd1838b28f6aed9637cc00d0d8bc165e5 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 16:24:03 +0400 Subject: [PATCH 08/11] Restore logic to reserve a free slot for zero token --- src/flatbuffers/unsafe_tools.rs | 4 ++++ src/utils.rs | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index 7441f560..40522dc0 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -130,6 +130,10 @@ impl StackVector { self.size == 0 } + pub fn get_free_capacity(&self) -> usize { + MAX_SIZE - self.size + } + pub fn clear(&mut self) { self.size = 0; } diff --git a/src/utils.rs b/src/utils.rs index 6918b21e..8c7b95bb 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -44,6 +44,9 @@ fn fast_tokenizer_no_regex( let mut preceding_ch: Option = None; // Used to check if a '*' is not just before a token for (i, c) in pattern.char_indices() { + if tokens_buffer.get_free_capacity() <= 1 { + return; // reserve one free slot for the zero token + } if is_allowed_code(c) { if !inside { inside = true; From d3df508793302a11d52426dcc2b86ab665b34d52 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 20:21:35 +0400 Subject: [PATCH 09/11] Rename and move the struct --- src/flatbuffers/unsafe_tools.rs | 54 ------------------------------- src/utils.rs | 56 +++++++++++++++++++++++++++++++-- tests/unit/utils.rs | 42 +++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 56 deletions(-) diff --git a/src/flatbuffers/unsafe_tools.rs b/src/flatbuffers/unsafe_tools.rs index 40522dc0..7d3973c9 100644 --- a/src/flatbuffers/unsafe_tools.rs +++ b/src/flatbuffers/unsafe_tools.rs @@ -97,57 +97,3 @@ impl VerifiedFlatbufferMemory { &self.raw_data[self.start..] } } - -/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization. -/// All elements are initialized to T::default(), and we track the logical size separately. -/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency. -pub struct StackVector { - data: [T; MAX_SIZE], - size: usize, -} - -impl Default for StackVector { - fn default() -> Self { - Self { - data: [T::default(); MAX_SIZE], - size: 0, - } - } -} - -impl StackVector { - pub fn push(&mut self, value: T) -> bool { - if self.size < MAX_SIZE { - self.data[self.size] = value; - self.size += 1; - true - } else { - false - } - } - - pub fn is_empty(&self) -> bool { - self.size == 0 - } - - pub fn get_free_capacity(&self) -> usize { - MAX_SIZE - self.size - } - - pub fn clear(&mut self) { - self.size = 0; - } - - pub fn as_slice(&self) -> &[T] { - &self.data[..self.size] - } - - pub fn into_vec(mut self) -> Vec { - let mut v = Vec::with_capacity(self.size); - for i in 0..self.size { - v.push(std::mem::take(&mut self.data[i])); - } - self.size = 0; - v - } -} diff --git a/src/utils.rs b/src/utils.rs index 8c7b95bb..1c226b40 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,7 +6,59 @@ use seahash::hash; #[cfg(target_pointer_width = "32")] use seahash::reference::hash; -use crate::flatbuffers::unsafe_tools::StackVector; +/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization. +/// All elements are initialized to T::default(), and we track the logical size separately. +/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency. +pub struct ArrayVec { + data: [T; MAX_SIZE], + size: usize, +} + +impl Default for ArrayVec { + fn default() -> Self { + Self { + data: [T::default(); MAX_SIZE], + size: 0, + } + } +} + +impl ArrayVec { + pub fn push(&mut self, value: T) -> bool { + if self.size < MAX_SIZE { + self.data[self.size] = value; + self.size += 1; + true + } else { + false + } + } + + pub fn is_empty(&self) -> bool { + self.size == 0 + } + + pub fn get_free_capacity(&self) -> usize { + MAX_SIZE - self.size + } + + pub fn clear(&mut self) { + self.size = 0; + } + + pub fn as_slice(&self) -> &[T] { + &self.data[..self.size] + } + + pub fn into_vec(mut self) -> Vec { + let mut v = Vec::with_capacity(self.size); + for i in 0..self.size { + v.push(std::mem::take(&mut self.data[i])); + } + self.size = 0; + v + } +} pub type Hash = u64; @@ -29,7 +81,7 @@ fn is_allowed_filter(ch: char) -> bool { ch.is_alphanumeric() || ch == '%' } -pub type TokensBuffer = StackVector; +pub type TokensBuffer = ArrayVec; fn fast_tokenizer_no_regex( pattern: &str, diff --git a/tests/unit/utils.rs b/tests/unit/utils.rs index 1a28e644..3c4c569e 100644 --- a/tests/unit/utils.rs +++ b/tests/unit/utils.rs @@ -98,4 +98,46 @@ mod tests { assert!(!bin_lookup(&[1, 2, 3, 4, 42], 0)); assert!(!bin_lookup(&[1, 2, 3, 4, 42], 5)); } + + #[test] + fn test_array_vec_default_is_empty() { + let vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + assert!(vec.is_empty()); + assert_eq!(vec.as_slice(), &[] as &[u64]); + assert_eq!(vec.get_free_capacity(), 4); + } + + #[test] + fn test_array_vec_push_and_access() { + let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + assert!(vec.push(1)); + assert!(vec.push(2)); + assert!(vec.push(3)); + assert_eq!(vec.as_slice(), &[1, 2, 3]); + assert_eq!(vec.get_free_capacity(), 1); + assert!(!vec.is_empty()); + } + + #[test] + fn test_array_vec_push_beyond_capacity() { + let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + assert!(vec.push(1)); + assert!(vec.push(2)); + assert!(!vec.push(3)); // Should fail to push beyond capacity + assert_eq!(vec.as_slice(), &[1, 2]); + assert_eq!(vec.get_free_capacity(), 0); + } + + #[test] + fn test_array_vec_clear() { + let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + vec.push(1); + vec.push(2); + vec.push(3); + assert_eq!(vec.as_slice(), &[1, 2, 3]); + vec.clear(); + assert!(vec.is_empty()); + assert_eq!(vec.as_slice(), &[] as &[u64]); + assert_eq!(vec.get_free_capacity(), 4); + } } From ae17f9bb927295acefa88843cb090b3c9e36b3e7 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Wed, 12 Nov 2025 20:23:30 +0400 Subject: [PATCH 10/11] Cleanup in tests --- tests/unit/utils.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/utils.rs b/tests/unit/utils.rs index 3c4c569e..a8e4fed6 100644 --- a/tests/unit/utils.rs +++ b/tests/unit/utils.rs @@ -1,6 +1,7 @@ #[cfg(test)] mod tests { use super::super::*; + use crate::utils::ArrayVec; #[test] #[ignore] // won't match hard-coded values when using a different hash function @@ -101,7 +102,7 @@ mod tests { #[test] fn test_array_vec_default_is_empty() { - let vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + let vec: ArrayVec = Default::default(); assert!(vec.is_empty()); assert_eq!(vec.as_slice(), &[] as &[u64]); assert_eq!(vec.get_free_capacity(), 4); @@ -109,7 +110,7 @@ mod tests { #[test] fn test_array_vec_push_and_access() { - let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + let mut vec: ArrayVec = Default::default(); assert!(vec.push(1)); assert!(vec.push(2)); assert!(vec.push(3)); @@ -120,7 +121,7 @@ mod tests { #[test] fn test_array_vec_push_beyond_capacity() { - let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + let mut vec: ArrayVec = Default::default(); assert!(vec.push(1)); assert!(vec.push(2)); assert!(!vec.push(3)); // Should fail to push beyond capacity @@ -130,7 +131,7 @@ mod tests { #[test] fn test_array_vec_clear() { - let mut vec: crate::utils::ArrayVec = crate::utils::ArrayVec::default(); + let mut vec: ArrayVec = Default::default(); vec.push(1); vec.push(2); vec.push(3); From f8dd45adfb423bf4aec009729679302e56509026 Mon Sep 17 00:00:00 2001 From: Mikhail Atuchin Date: Fri, 14 Nov 2025 11:33:20 +0400 Subject: [PATCH 11/11] use arrayvec --- Cargo.lock | 7 ++++++ Cargo.toml | 1 + src/request.rs | 2 +- src/utils.rs | 60 +++------------------------------------------ tests/unit/utils.rs | 43 -------------------------------- 5 files changed, 13 insertions(+), 100 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9574b656..5f2f2fc8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7,6 +7,7 @@ name = "adblock" version = "0.11.1" dependencies = [ "addr", + "arrayvec", "base64", "bitflags", "criterion", @@ -105,6 +106,12 @@ version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "autocfg" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index 2a615ad0..b9fa55e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ rustc-hash = { version = "1.1.0", default-features = false } memchr = "2.4" base64 = "0.22" rmp-serde = "0.15" +arrayvec = "0.7" cssparser = { version = "0.34", optional = true } selectors = { version = "0.26", optional = true } precomputed-hash = "0.1" diff --git a/src/request.rs b/src/request.rs index ce2b96eb..fd5409f1 100644 --- a/src/request.rs +++ b/src/request.rs @@ -243,7 +243,7 @@ fn calculate_tokens(url_lower_cased: &str) -> Vec { utils::tokenize_pooled(url_lower_cased, &mut tokens); // Add zero token as a fallback to wildcard rule bucket tokens.push(0); - tokens.into_vec() + tokens.into_iter().collect() } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index 1c226b40..056f1dcf 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,59 +6,7 @@ use seahash::hash; #[cfg(target_pointer_width = "32")] use seahash::reference::hash; -/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization. -/// All elements are initialized to T::default(), and we track the logical size separately. -/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency. -pub struct ArrayVec { - data: [T; MAX_SIZE], - size: usize, -} - -impl Default for ArrayVec { - fn default() -> Self { - Self { - data: [T::default(); MAX_SIZE], - size: 0, - } - } -} - -impl ArrayVec { - pub fn push(&mut self, value: T) -> bool { - if self.size < MAX_SIZE { - self.data[self.size] = value; - self.size += 1; - true - } else { - false - } - } - - pub fn is_empty(&self) -> bool { - self.size == 0 - } - - pub fn get_free_capacity(&self) -> usize { - MAX_SIZE - self.size - } - - pub fn clear(&mut self) { - self.size = 0; - } - - pub fn as_slice(&self) -> &[T] { - &self.data[..self.size] - } - - pub fn into_vec(mut self) -> Vec { - let mut v = Vec::with_capacity(self.size); - for i in 0..self.size { - v.push(std::mem::take(&mut self.data[i])); - } - self.size = 0; - v - } -} +pub use arrayvec::ArrayVec; pub type Hash = u64; @@ -96,7 +44,7 @@ fn fast_tokenizer_no_regex( let mut preceding_ch: Option = None; // Used to check if a '*' is not just before a token for (i, c) in pattern.char_indices() { - if tokens_buffer.get_free_capacity() <= 1 { + if tokens_buffer.capacity() - tokens_buffer.len() <= 1 { return; // reserve one free slot for the zero token } if is_allowed_code(c) { @@ -134,7 +82,7 @@ pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) { pub fn tokenize(pattern: &str) -> Vec { let mut tokens_buffer = TokensBuffer::default(); tokenize_to(pattern, &mut tokens_buffer); - tokens_buffer.into_vec() + tokens_buffer.into_iter().collect() } pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) { @@ -154,7 +102,7 @@ pub(crate) fn tokenize_filter( skip_last_token, &mut tokens_buffer, ); - tokens_buffer.into_vec() + tokens_buffer.into_iter().collect() } pub(crate) fn tokenize_filter_to( diff --git a/tests/unit/utils.rs b/tests/unit/utils.rs index a8e4fed6..1a28e644 100644 --- a/tests/unit/utils.rs +++ b/tests/unit/utils.rs @@ -1,7 +1,6 @@ #[cfg(test)] mod tests { use super::super::*; - use crate::utils::ArrayVec; #[test] #[ignore] // won't match hard-coded values when using a different hash function @@ -99,46 +98,4 @@ mod tests { assert!(!bin_lookup(&[1, 2, 3, 4, 42], 0)); assert!(!bin_lookup(&[1, 2, 3, 4, 42], 5)); } - - #[test] - fn test_array_vec_default_is_empty() { - let vec: ArrayVec = Default::default(); - assert!(vec.is_empty()); - assert_eq!(vec.as_slice(), &[] as &[u64]); - assert_eq!(vec.get_free_capacity(), 4); - } - - #[test] - fn test_array_vec_push_and_access() { - let mut vec: ArrayVec = Default::default(); - assert!(vec.push(1)); - assert!(vec.push(2)); - assert!(vec.push(3)); - assert_eq!(vec.as_slice(), &[1, 2, 3]); - assert_eq!(vec.get_free_capacity(), 1); - assert!(!vec.is_empty()); - } - - #[test] - fn test_array_vec_push_beyond_capacity() { - let mut vec: ArrayVec = Default::default(); - assert!(vec.push(1)); - assert!(vec.push(2)); - assert!(!vec.push(3)); // Should fail to push beyond capacity - assert_eq!(vec.as_slice(), &[1, 2]); - assert_eq!(vec.get_free_capacity(), 0); - } - - #[test] - fn test_array_vec_clear() { - let mut vec: ArrayVec = Default::default(); - vec.push(1); - vec.push(2); - vec.push(3); - assert_eq!(vec.as_slice(), &[1, 2, 3]); - vec.clear(); - assert!(vec.is_empty()); - assert_eq!(vec.as_slice(), &[] as &[u64]); - assert_eq!(vec.get_free_capacity(), 4); - } }