6 changes: 4 additions & 2 deletions src/filters/fb_network_builder.rs
@@ -7,6 +7,7 @@ use flatbuffers::WIPOffset;
use crate::filters::fb_builder::EngineFlatBuilder;
use crate::filters::network::{FilterTokens, NetworkFilter};
use crate::filters::token_selector::TokenSelector;
use crate::utils::TokensBuffer;

use crate::filters::network::NetworkFilterMaskHelper;
use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder;
@@ -134,6 +135,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
let mut optimizable = HashMap::<ShortHash, Vec<NetworkFilter>>::new();

let mut token_frequencies = TokenSelector::new(rule_list.filters.len());
let mut tokens_buffer = TokensBuffer::default();

{
for network_filter in rule_list.filters {
@@ -157,7 +159,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
};

let multi_tokens = network_filter.get_tokens_optimized();
let multi_tokens = network_filter.get_tokens_optimized(&mut tokens_buffer);
match multi_tokens {
FilterTokens::Empty => {
// No tokens, add to fallback bucket (token 0)
@@ -171,7 +173,7 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
}
}
FilterTokens::Other(tokens) => {
let best_token = token_frequencies.select_least_used_token(&tokens);
let best_token = token_frequencies.select_least_used_token(tokens);
token_frequencies.record_usage(best_token);
store_filter(best_token);
}
50 changes: 25 additions & 25 deletions src/filters/network.rs
@@ -15,9 +15,7 @@ use crate::filters::abstract_network::{
use crate::lists::ParseOptions;
use crate::regex_manager::RegexManager;
use crate::request;
use crate::utils::{self, Hash};

pub(crate) const TOKENS_BUFFER_SIZE: usize = 200;
use crate::utils::{self, Hash, TokensBuffer};

/// For now, only support `$removeparam` with simple alphanumeric/dash/underscore patterns.
static VALID_PARAM: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_\-]+$").unwrap());
@@ -312,10 +310,10 @@ pub enum FilterPart {
}

#[derive(Debug, PartialEq)]
pub enum FilterTokens {
pub enum FilterTokens<'a> {
Empty,
OptDomains(Vec<Hash>),
Other(Vec<Hash>),
OptDomains(&'a [Hash]),
Other(&'a [Hash]),
}

pub struct FilterPartIterator<'a> {
@@ -885,17 +883,21 @@ impl NetworkFilter {

#[deprecated(since = "0.11.1", note = "use get_tokens_optimized instead")]
pub fn get_tokens(&self) -> Vec<Vec<Hash>> {
match self.get_tokens_optimized() {
let mut tokens_buffer = TokensBuffer::default();
match self.get_tokens_optimized(&mut tokens_buffer) {
FilterTokens::OptDomains(domains) => {
domains.into_iter().map(|domain| vec![domain]).collect()
domains.iter().map(|domain| vec![*domain]).collect()
}
FilterTokens::Other(tokens) => vec![tokens],
FilterTokens::Other(tokens) => vec![tokens.to_vec()],
FilterTokens::Empty => vec![],
}
}

pub fn get_tokens_optimized(&self) -> FilterTokens {
let mut tokens: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
pub fn get_tokens_optimized<'a>(
&'a self,
tokens_buffer: &'a mut TokensBuffer,
) -> FilterTokens<'a> {
tokens_buffer.clear();

// If there is only one domain and no domain negation, we also use this
// domain as a token.
@@ -905,7 +907,7 @@ impl NetworkFilter {
{
if let Some(domains) = self.opt_domains.as_ref() {
if let Some(domain) = domains.first() {
tokens.push(*domain)
tokens_buffer.push(*domain);
}
}
}
@@ -918,7 +920,7 @@ impl NetworkFilter {
(self.is_plain() || self.is_regex()) && !self.is_right_anchor();
let skip_first_token = self.is_right_anchor();

utils::tokenize_filter_to(f, skip_first_token, skip_last_token, &mut tokens);
utils::tokenize_filter_to(f, skip_first_token, skip_last_token, tokens_buffer);
}
}
FilterPart::AnyOf(_) => (), // across AnyOf set of filters no single token is guaranteed to match to a request
@@ -928,45 +930,43 @@ impl NetworkFilter {
// Append tokens from hostname, if any
if !self.mask.contains(NetworkFilterMask::IS_HOSTNAME_REGEX) {
if let Some(hostname) = self.hostname.as_ref() {
utils::tokenize_to(hostname, &mut tokens);
utils::tokenize_to(hostname, tokens_buffer);
}
} else if let Some(hostname) = self.hostname.as_ref() {
// Find last dot to tokenize the prefix
let last_dot_pos = hostname.rfind('.');
if let Some(last_dot_pos) = last_dot_pos {
utils::tokenize_to(&hostname[..last_dot_pos], &mut tokens);
utils::tokenize_to(&hostname[..last_dot_pos], tokens_buffer);
}
}

if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
if tokens_buffer.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {
if let Some(removeparam) = &self.modifier_option {
if VALID_PARAM.is_match(removeparam) {
utils::tokenize_to(&removeparam.to_ascii_lowercase(), &mut tokens);
utils::tokenize_to(&removeparam.to_ascii_lowercase(), tokens_buffer);
}
}
}

// If we got no tokens for the filter/hostname part, then we will dispatch
// this filter in multiple buckets based on the domains option.
if tokens.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none() {
if tokens_buffer.is_empty() && self.opt_domains.is_some() && self.opt_not_domains.is_none()
{
if let Some(opt_domains) = self.opt_domains.as_ref() {
if !opt_domains.is_empty() {
return FilterTokens::OptDomains(opt_domains.clone());
return FilterTokens::OptDomains(opt_domains);
}
}
FilterTokens::Empty
} else {
// Add optional token for protocol
if self.for_http() && !self.for_https() {
tokens.push(utils::fast_hash("http"));
tokens_buffer.push(utils::fast_hash("http"));
} else if self.for_https() && !self.for_http() {
tokens.push(utils::fast_hash("https"));
tokens_buffer.push(utils::fast_hash("https"));
}

// Remake a vector to drop extra capacity.
let mut t = Vec::with_capacity(tokens.len());
t.extend(tokens);
FilterTokens::Other(t)
FilterTokens::Other(tokens_buffer.as_slice())
}
}
}
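With this change, FilterTokens borrows its token slices instead of owning Vecs, so a caller allocates one TokensBuffer and reuses it across filters, as the builder diff above does. A minimal illustrative sketch of such a caller follows; it is not taken from the PR and assumes only the signatures shown above (the bucket_filters helper name is invented):

use crate::filters::network::{FilterTokens, NetworkFilter};
use crate::utils::TokensBuffer;

// Illustrative helper: reuse one stack buffer so get_tokens_optimized never
// allocates per filter. The returned FilterTokens borrows from the buffer,
// so each result must be consumed before the next call.
fn bucket_filters(filters: &[NetworkFilter]) {
    let mut tokens_buffer = TokensBuffer::default();
    for filter in filters {
        match filter.get_tokens_optimized(&mut tokens_buffer) {
            FilterTokens::OptDomains(domains) => {
                // One bucket per $domain= entry; this slice borrows the filter itself.
                for _domain in domains {}
            }
            FilterTokens::Other(tokens) => {
                // Slice into tokens_buffer, valid only until the next call.
                let _first = tokens.first();
            }
            FilterTokens::Empty => {
                // No tokens: the filter goes to the wildcard (token 0) bucket.
            }
        }
    }
}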
4 changes: 2 additions & 2 deletions src/request.rs
@@ -239,11 +239,11 @@ impl Request {
}

fn calculate_tokens(url_lower_cased: &str) -> Vec<utils::Hash> {
let mut tokens = vec![];
let mut tokens = utils::TokensBuffer::default();
utils::tokenize_pooled(url_lower_cased, &mut tokens);
// Add zero token as a fallback to wildcard rule bucket
tokens.push(0);
tokens
tokens.into_vec()
}

#[cfg(test)]
78 changes: 65 additions & 13 deletions src/utils.rs
@@ -6,6 +6,60 @@ use seahash::hash;
#[cfg(target_pointer_width = "32")]
use seahash::reference::hash;

/// A stack-allocated vector that uses [T; MAX_SIZE] with Default initialization.
/// All elements are initialized to T::default(), and we track the logical size separately.
/// Note: a future impl can switch to using MaybeUninit with unsafe code for better efficiency.
pub struct ArrayVec<T, const MAX_SIZE: usize> {
data: [T; MAX_SIZE],
size: usize,
}
A collaborator commented on lines +9 to +15:
Let's use the existing arrayvec crate for this. That implementation has already been optimized and battle-tested across the Rust ecosystem (along with the similar ArrayString) for a very long time now.

arrayvec 0.7.6 is already in the dependency tree for brave-core.
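
As a rough sketch of what that swap might look like (assuming the arrayvec 0.7 const-generic API; this is not code from the PR), the alias and the handful of operations used here map over fairly directly:

use arrayvec::ArrayVec;

use crate::utils::Hash;

// Hypothetical drop-in for the hand-rolled type below.
pub type TokensBuffer = ArrayVec<Hash, 256>;

fn sketch(buffer: &mut TokensBuffer) {
    // arrayvec's push panics when full; try_push returns an Err instead,
    // which mirrors the bool returned by the custom ArrayVec::push.
    let _ = buffer.try_push(0);
    let _free = buffer.remaining_capacity(); // analogue of get_free_capacity()
    let _slice: &[Hash] = buffer.as_slice();
    let _vec: Vec<Hash> = buffer.as_slice().to_vec(); // analogue of into_vec()
}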


impl<T: Default + Copy, const MAX_SIZE: usize> Default for ArrayVec<T, MAX_SIZE> {
fn default() -> Self {
Self {
data: [T::default(); MAX_SIZE],
size: 0,
}
}
}

impl<T: Default, const MAX_SIZE: usize> ArrayVec<T, MAX_SIZE> {
pub fn push(&mut self, value: T) -> bool {
if self.size < MAX_SIZE {
self.data[self.size] = value;
self.size += 1;
true
} else {
false
}
}

pub fn is_empty(&self) -> bool {
self.size == 0
}

pub fn get_free_capacity(&self) -> usize {
MAX_SIZE - self.size
}

pub fn clear(&mut self) {
self.size = 0;
}

pub fn as_slice(&self) -> &[T] {
&self.data[..self.size]
}

pub fn into_vec(mut self) -> Vec<T> {
let mut v = Vec::with_capacity(self.size);
for i in 0..self.size {
v.push(std::mem::take(&mut self.data[i]));
}
self.size = 0;
v
}
}

pub type Hash = u64;

// A smaller version of Hash that is used in serialized format.
@@ -27,25 +81,23 @@ fn is_allowed_filter(ch: char) -> bool {
ch.is_alphanumeric() || ch == '%'
}

pub(crate) const TOKENS_BUFFER_SIZE: usize = 128;
pub(crate) const TOKENS_BUFFER_RESERVED: usize = 1;
const TOKENS_MAX: usize = TOKENS_BUFFER_SIZE - TOKENS_BUFFER_RESERVED;
pub type TokensBuffer = ArrayVec<Hash, 256>;

fn fast_tokenizer_no_regex(
pattern: &str,
is_allowed_code: &dyn Fn(char) -> bool,
skip_first_token: bool,
skip_last_token: bool,
tokens_buffer: &mut Vec<Hash>,
tokens_buffer: &mut TokensBuffer,
) {
// let mut tokens_buffer_index = 0;
let mut inside: bool = false;
let mut start = 0;
let mut preceding_ch: Option<char> = None; // Used to check if a '*' is not just before a token

for (i, c) in pattern.char_indices() {
if tokens_buffer.len() >= TOKENS_MAX {
return;
if tokens_buffer.get_free_capacity() <= 1 {
return; // reserve one free slot for the zero token
}
if is_allowed_code(c) {
if !inside {
@@ -75,17 +127,17 @@ fn fast_tokenizer_no_regex(
}
}

pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
pub(crate) fn tokenize_pooled(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

pub fn tokenize(pattern: &str) -> Vec<Hash> {
let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
let mut tokens_buffer = TokensBuffer::default();
tokenize_to(pattern, &mut tokens_buffer);
tokens_buffer
tokens_buffer.into_vec()
}

pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut Vec<Hash>) {
pub(crate) fn tokenize_to(pattern: &str, tokens_buffer: &mut TokensBuffer) {
fast_tokenizer_no_regex(pattern, &is_allowed_filter, false, false, tokens_buffer);
}

@@ -95,21 +147,21 @@ pub(crate) fn tokenize_filter(
skip_first_token: bool,
skip_last_token: bool,
) -> Vec<Hash> {
let mut tokens_buffer: Vec<Hash> = Vec::with_capacity(TOKENS_BUFFER_SIZE);
let mut tokens_buffer = TokensBuffer::default();
tokenize_filter_to(
pattern,
skip_first_token,
skip_last_token,
&mut tokens_buffer,
);
tokens_buffer
tokens_buffer.into_vec()
}

pub(crate) fn tokenize_filter_to(
pattern: &str,
skip_first_token: bool,
skip_last_token: bool,
tokens_buffer: &mut Vec<Hash>,
tokens_buffer: &mut TokensBuffer,
) {
fast_tokenizer_no_regex(
pattern,
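One consequence of the get_free_capacity() <= 1 guard above: when a pattern would produce more tokens than the 256-slot buffer can hold, tokenization stops early instead of reallocating, and the zero fallback token that Request::calculate_tokens appends still fits. A hypothetical crate-internal check, not part of the PR's tests, illustrating that behavior:

// Hypothetical: an overlong URL hits the capacity guard, the tokenizer
// returns early, and the fallback zero token still fits afterwards.
let long_url = "a1/".repeat(400); // would yield far more than 256 tokens
let mut tokens = crate::utils::TokensBuffer::default();
crate::utils::tokenize_pooled(&long_url, &mut tokens);
assert!(tokens.get_free_capacity() >= 1);
assert!(tokens.push(0));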
2 changes: 1 addition & 1 deletion tests/matching.rs
@@ -199,5 +199,5 @@ fn check_rule_matching_browserlike() {
let (blocked, passes) = bench_rule_matching_browserlike(&engine, &requests);
let msg = "The number of blocked/passed requests has changed. ".to_string()
+ "If this is expected, update the expected values in the test.";
assert_eq!((blocked, passes), (106860, 136085), "{msg}");
assert_eq!((blocked, passes), (106861, 136084), "{msg}");
Collaborator: Why is this expected?

Collaborator (author): I mentioned in the PR description: we hit the token limit for one rule. If you take master and increase the limits, you will get the same behavior.

Collaborator: nvm, the reason is in the description.

}
8 changes: 3 additions & 5 deletions tests/unit/filters/network.rs
@@ -1191,12 +1191,10 @@ mod parse_tests {
fn test_simple_pattern_tokenization() {
let rule = "||some.primewire.c*/sw$script,1p";
let filter = NetworkFilter::parse(rule, true, ParseOptions::default()).unwrap();
let mut tokens_buffer = utils::TokensBuffer::default();
assert_eq!(
filter.get_tokens_optimized(),
FilterTokens::Other(vec![
utils::fast_hash("some"),
utils::fast_hash("primewire")
])
filter.get_tokens_optimized(&mut tokens_buffer),
FilterTokens::Other(&[utils::fast_hash("some"), utils::fast_hash("primewire")])
);
}
}
42 changes: 42 additions & 0 deletions tests/unit/utils.rs
@@ -98,4 +98,46 @@ mod tests {
assert!(!bin_lookup(&[1, 2, 3, 4, 42], 0));
assert!(!bin_lookup(&[1, 2, 3, 4, 42], 5));
}

#[test]
fn test_array_vec_default_is_empty() {
let vec: crate::utils::ArrayVec<u64, 4> = crate::utils::ArrayVec::default();
assert!(vec.is_empty());
assert_eq!(vec.as_slice(), &[] as &[u64]);
assert_eq!(vec.get_free_capacity(), 4);
}

#[test]
fn test_array_vec_push_and_access() {
let mut vec: crate::utils::ArrayVec<u64, 4> = crate::utils::ArrayVec::default();
assert!(vec.push(1));
assert!(vec.push(2));
assert!(vec.push(3));
assert_eq!(vec.as_slice(), &[1, 2, 3]);
assert_eq!(vec.get_free_capacity(), 1);
assert!(!vec.is_empty());
}

#[test]
fn test_array_vec_push_beyond_capacity() {
let mut vec: crate::utils::ArrayVec<u64, 2> = crate::utils::ArrayVec::default();
assert!(vec.push(1));
assert!(vec.push(2));
assert!(!vec.push(3)); // Should fail to push beyond capacity
assert_eq!(vec.as_slice(), &[1, 2]);
assert_eq!(vec.get_free_capacity(), 0);
}

#[test]
fn test_array_vec_clear() {
let mut vec: crate::utils::ArrayVec<u64, 4> = crate::utils::ArrayVec::default();
vec.push(1);
vec.push(2);
vec.push(3);
assert_eq!(vec.as_slice(), &[1, 2, 3]);
vec.clear();
assert!(vec.is_empty());
assert_eq!(vec.as_slice(), &[] as &[u64]);
assert_eq!(vec.get_free_capacity(), 4);
}
}