
Commit 3feffc9
[perf] Replace token_histogram with TokenSelector (#563)

This PR replaces the old token_histogram, which was slow and needlessly memory-hungry, with TokenSelector. The new implementation:

* processes filters one by one, which is faster, uses less memory, and unblocks the next optimizations;
* hardcodes the known-problematic tokens;
* cuts build time by about 10%;
* selects tokens of slightly lower quality, though the resulting loss in matching performance is only about 3-4%.
1 parent e9299eb commit 3feffc9
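
The selection strategy, in miniature: instead of building a frequency histogram of every token up front, each filter is indexed under whichever of its candidate tokens currently holds the fewest filters, and tokens known to appear in almost every URL start with a huge penalty count so they are chosen only as a last resort. Below is a minimal, self-contained sketch of that idea (an illustration only, not the crate's actual API; the real implementation lands in src/filters/token_selector.rs further down):

use std::collections::HashMap;

fn main() {
    let mut usage: HashMap<&str, usize> = HashMap::new();
    // Pre-penalize tokens that appear in almost every URL.
    for common in ["https", "http", "www", "com"] {
        usage.insert(common, usize::MAX / 2);
    }

    // Index each filter under its currently least-used token.
    let filters = [vec!["com", "tracker"], vec!["com", "ads"], vec!["com"]];
    for tokens in &filters {
        let best = tokens
            .iter()
            .copied()
            .min_by_key(|t| usage.get(*t).copied().unwrap_or(0))
            .unwrap();
        *usage.entry(best).or_insert(0) += 1; // record the choice
        println!("filter {:?} -> bucket {:?}", tokens, best);
    }
    // Only the last filter, which has no alternative, falls back to
    // the ubiquitous "com" bucket.
}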

7 files changed: +183, -109 lines

src/filters/fb_network_builder.rs
Lines changed: 14 additions & 37 deletions

@@ -6,11 +6,11 @@ use flatbuffers::WIPOffset;
 
 use crate::filters::fb_builder::EngineFlatBuilder;
 use crate::filters::network::{FilterTokens, NetworkFilter};
+use crate::filters::token_selector::TokenSelector;
 
 use crate::filters::network::NetworkFilterMaskHelper;
 use crate::flatbuffers::containers::flat_multimap::FlatMultiMapBuilder;
 use crate::flatbuffers::containers::flat_serialize::{FlatBuilder, FlatSerialize, WIPFlatVec};
-use crate::network_filter_list::token_histogram;
 use crate::optimizer;
 use crate::utils::{to_short_hash, Hash, ShortHash};
 
@@ -133,21 +133,10 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
 
         let mut optimizable = HashMap::<ShortHash, Vec<NetworkFilter>>::new();
 
-        // Compute tokens for all filters
-        let filter_tokens: Vec<_> = rule_list
-            .filters
-            .into_iter()
-            .map(|filter| {
-                let tokens = filter.get_tokens_optimized();
-                (filter, tokens)
-            })
-            .collect();
-
-        // compute the tokens' frequency histogram
-        let (total_number_of_tokens, tokens_histogram) = token_histogram(&filter_tokens);
+        let mut token_frequencies = TokenSelector::new(rule_list.filters.len());
 
         {
-            for (network_filter, multi_tokens) in filter_tokens.into_iter() {
+            for network_filter in rule_list.filters {
                 let flat_filter = if !rule_list.optimize
                     || !optimizer::is_filter_optimizable_by_patterns(&network_filter)
                 {
@@ -156,46 +145,34 @@ impl<'a> FlatSerialize<'a, EngineFlatBuilder<'a>> for NetworkFilterListBuilder {
                     None
                 };
 
-                let mut store_filter = |token: ShortHash| {
+                let mut store_filter = |token: Hash| {
+                    let short_token = to_short_hash(token);
                     if let Some(flat_filter) = flat_filter {
-                        filter_map_builder.insert(token, flat_filter);
+                        filter_map_builder.insert(short_token, flat_filter);
                     } else {
                         optimizable
-                            .entry(token)
+                            .entry(short_token)
                             .or_default()
                             .push(network_filter.clone());
                     }
                 };
 
+                let multi_tokens = network_filter.get_tokens_optimized();
                 match multi_tokens {
                     FilterTokens::Empty => {
-                        // No tokens, skip this filter
+                        // No tokens, add to fallback bucket (token 0)
+                        store_filter(0);
                     }
                     FilterTokens::OptDomains(opt_domains) => {
                         // For OptDomains, each domain is treated as a separate token group
                         for &token in opt_domains.iter() {
-                            store_filter(to_short_hash(token));
+                            store_filter(token);
+                            token_frequencies.record_usage(token);
                         }
                     }
                     FilterTokens::Other(tokens) => {
-                        // For Other tokens, find the best token from the group
-                        let mut best_token: ShortHash = 0;
-                        let mut min_count = total_number_of_tokens + 1;
-                        for &token in tokens.iter() {
-                            let token = to_short_hash(token);
-                            match tokens_histogram.get(&token) {
-                                None => {
-                                    min_count = 0;
-                                    best_token = token
-                                }
-                                Some(&count) if count < min_count => {
-                                    min_count = count;
-                                    best_token = token
-                                }
-                                _ => {}
-                            }
-                        }
-
+                        let best_token = token_frequencies.select_least_used_token(&tokens);
+                        token_frequencies.record_usage(best_token);
                         store_filter(best_token);
                     }
                 }

src/filters/mod.rs
Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ pub(crate) mod fb_network;
 pub(crate) mod fb_network_builder;
 pub(crate) mod filter_data_context;
 pub mod network;
+pub(crate) mod token_selector;
 
 #[allow(unknown_lints)]
 #[allow(

src/filters/token_selector.rs
Lines changed: 117 additions & 0 deletions

@@ -0,0 +1,117 @@
+//! Token selector for optimizing filter storage by choosing least-used tokens
+
+use crate::utils::{to_short_hash, Hash, ShortHash};
+use seahash::SeaHasher;
+use std::collections::HashMap;
+use std::hash::BuildHasherDefault;
+
+// Two groups of tokens that better be avoided during token selection.
+// The list was built by logging each token that triggered .matches() call
+// for some network filter in the benchmark.
+const WORST_TOKENS: [&str; 4] = ["https", "http", "www", "com"];
+const BAD_TOKENS: [&str; 36] = [
+    "uk",
+    "net",
+    "org",
+    "io",
+    "de",
+    "fr",
+    "es",
+    "it",
+    "nl",
+    "se",
+    "ru",
+    "pl",
+    "co",
+    "js",
+    "css",
+    "img",
+    "jpg",
+    "html",
+    "png",
+    "cdn",
+    "static",
+    "images",
+    "api",
+    "wp",
+    "ad",
+    "ads",
+    "content",
+    "doubleclick",
+    "analytics",
+    "assets",
+    "id",
+    "min",
+    "amazon",
+    "google",
+    "googlesyndication",
+    "googleapis",
+];
+
+const WORST_TOKEN_USAGE: usize = usize::MAX / 2;
+const BAD_TOKEN_USAGE: usize = usize::MAX / 4;
+
+/// Selects the optimal token for filter storage by tracking usage frequencies.
+/// Tokens that are used less frequently are preferred for better efficiency.
+pub(crate) struct TokenSelector {
+    usage: HashMap<ShortHash, usize, BuildHasherDefault<SeaHasher>>,
+}
+
+impl TokenSelector {
+    /// Creates a new TokenSelector with pre-populated commonly-used tokens.
+    pub fn new(capacity: usize) -> Self {
+        let mut usage = HashMap::with_capacity_and_hasher(
+            capacity + WORST_TOKENS.len() + BAD_TOKENS.len(),
+            BuildHasherDefault::<SeaHasher>::default(),
+        );
+        let mut store_token = |token: &str, count: usize| {
+            usage.insert(to_short_hash(crate::utils::fast_hash(token)), count);
+        };
+
+        for token in WORST_TOKENS {
+            store_token(token, WORST_TOKEN_USAGE);
+        }
+
+        for token in BAD_TOKENS {
+            store_token(token, BAD_TOKEN_USAGE);
+        }
+
+        Self { usage }
+    }
+
+    /// Selects the least-used token from the provided list of tokens.
+    /// Returns the token with the lowest current usage count.
+    pub fn select_least_used_token(&self, tokens: &[Hash]) -> Hash {
+        let mut best_token = 0;
+        let mut best_token_used = usize::MAX;
+        for &token in tokens {
+            if token == 0 {
+                // 0 is already used as a fallback token.
+                continue;
+            }
+
+            match self.usage.get(&to_short_hash(token)) {
+                Some(&count) => {
+                    if count < best_token_used {
+                        best_token_used = count;
+                        best_token = token;
+                    }
+                }
+                None => {
+                    // Token never seen before - this is optimal
+                    return token;
+                }
+            }
+        }
+        best_token
+    }
+
+    /// Records that a token has been used, incrementing its usage count.
+    pub fn record_usage(&mut self, token: Hash) {
+        *self.usage.entry(to_short_hash(token)).or_insert(0) += 1;
+    }
+}
+
+#[cfg(test)]
+#[path = "../../tests/unit/filters/token_selector.rs"]
+mod unit_tests;
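
The two sentinel counts above give a strict three-tier preference: a token absent from both lists beats any BAD token (its real usage counter would have to reach usize::MAX / 4 to change that), and any BAD token beats any WORST token. A brief sketch of the observable behavior, assuming crate-internal access to TokenSelector and crate::utils::fast_hash (as the unit tests further down have):

let mut selector = TokenSelector::new(0);
let rare = fast_hash("very_rare_token");
let bad = fast_hash("css"); // pre-seeded with BAD_TOKEN_USAGE
let worst = fast_hash("www"); // pre-seeded with WORST_TOKEN_USAGE

selector.record_usage(rare); // a regular token, even after being used...
assert_eq!(selector.select_least_used_token(&[worst, bad, rare]), rare);
// ...and a "bad" token still outranks a "worst" one:
assert_eq!(selector.select_least_used_token(&[worst, bad]), bad);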

src/network_filter_list.rs
Lines changed: 3 additions & 27 deletions

@@ -1,20 +1,20 @@
 //! Holds the implementation of [NetworkFilterList] and related functionality.
 
-use std::{collections::HashMap, collections::HashSet, fmt};
+use std::{collections::HashSet, fmt};
 
 use flatbuffers::ForwardsUOffset;
 
 use crate::filters::fb_network::FlatNetworkFilter;
 use crate::filters::filter_data_context::FilterDataContext;
 use crate::filters::flatbuffer_generated::fb;
 use crate::filters::network::{
-    FilterTokens, NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable,
+    NetworkFilter, NetworkFilterMask, NetworkFilterMaskHelper, NetworkMatchable,
 };
 use crate::flatbuffers::containers::flat_multimap::FlatMultiMapView;
 use crate::flatbuffers::unsafe_tools::fb_vector_to_slice;
 use crate::regex_manager::RegexManager;
 use crate::request::Request;
-use crate::utils::{fast_hash, to_short_hash, ShortHash};
+use crate::utils::{to_short_hash, ShortHash};
 
 /// Holds relevant information from a single matching network filter rule as a result of querying a
 /// [NetworkFilterList] for a given request.
@@ -151,27 +151,3 @@ impl NetworkFilterList<'_> {
         filters
     }
 }
-
-pub(crate) fn token_histogram<T>(
-    filter_tokens: &[(T, FilterTokens)],
-) -> (u32, HashMap<ShortHash, u32>) {
-    let mut tokens_histogram: HashMap<ShortHash, u32> = HashMap::new();
-    let mut number_of_tokens = 0;
-    for (_, tokens) in filter_tokens.iter() {
-        match tokens {
-            FilterTokens::Other(tokens) | FilterTokens::OptDomains(tokens) => {
-                for t in tokens {
-                    *tokens_histogram.entry(to_short_hash(*t)).or_insert(0) += 1;
-                    number_of_tokens += 1;
-                }
-            }
-            FilterTokens::Empty => {}
-        }
-    }
-
-    for bad_token in ["http", "https", "www", "com"].iter() {
-        tokens_histogram.insert(to_short_hash(fast_hash(bad_token)), number_of_tokens);
-    }
-
-    (number_of_tokens, tokens_histogram)
-}

tests/unit/engine.rs
Lines changed: 2 additions & 2 deletions

@@ -237,9 +237,9 @@ mod tests {
             );
         }
         let expected_hash: u64 = if cfg!(feature = "css-validation") {
-            15545091389304905433
+            13277824246832611772
        } else {
-            543362704487480180
+            12001568478200869587
        };
 
         assert_eq!(hash(&data), expected_hash, "{HASH_MISMATCH_MSG}");
tests/unit/filters/token_selector.rs
Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+#[cfg(test)]
+mod token_selector_tests {
+    use super::super::*;
+    use crate::utils::fast_hash;
+
+    #[test]
+    fn token_priority() {
+        let selector = TokenSelector::new(0);
+        let regular = fast_hash("rare_token");
+        let worst = fast_hash("https");
+        let bad = fast_hash("assets");
+
+        assert_eq!(selector.select_least_used_token(&[]), 0);
+        assert_eq!(selector.select_least_used_token(&[0, 0]), 0);
+
+        // a regular token is always better
+        assert_eq!(
+            selector.select_least_used_token(&[regular, worst, bad]),
+            regular
+        );
+        assert_eq!(
+            selector.select_least_used_token(&[worst, bad, regular]),
+            regular
+        );
+
+        // a bad token is always better than a worst token
+        assert_eq!(selector.select_least_used_token(&[worst, bad]), bad);
+        assert_eq!(selector.select_least_used_token(&[bad, worst]), bad);
+    }
+
+    #[test]
+    fn test_select_least_used_token_with_usage() {
+        let mut selector = TokenSelector::new(0);
+        let token1 = fast_hash("token1");
+        let token2 = fast_hash("token2");
+
+        assert_eq!(selector.select_least_used_token(&[token1, token2]), token1);
+
+        selector.record_usage(token1);
+        selector.record_usage(token1);
+        selector.record_usage(token2);
+
+        // token2 should be selected as it has lower usage
+        assert_eq!(selector.select_least_used_token(&[token1, token2]), token2);
+    }
+}

tests/unit/network_filter_list.rs
Lines changed: 0 additions & 43 deletions

@@ -26,49 +26,6 @@ mod tests {
         );
     }
 
-    #[test]
-    fn token_histogram_works() {
-        // handle the case of just 1 token
-        {
-            let tokens = vec![(0, vec![vec![111]])];
-            let (total_tokens, histogram) = token_histogram(&tokens);
-            assert_eq!(total_tokens, 1);
-            assert_eq!(histogram.get(&111), Some(&1));
-            // include bad tokens
-            assert_eq!(histogram.get(&fast_hash("http")), Some(&1));
-            assert_eq!(histogram.get(&fast_hash("www")), Some(&1));
-        }
-
-        // handle the case of repeating tokens
-        {
-            let tokens = vec![(0, vec![vec![111]]), (1, vec![vec![111]])];
-            let (total_tokens, histogram) = token_histogram(&tokens);
-            assert_eq!(total_tokens, 2);
-            assert_eq!(histogram.get(&111), Some(&2));
-            // include bad tokens
-            assert_eq!(histogram.get(&fast_hash("http")), Some(&2));
-            assert_eq!(histogram.get(&fast_hash("www")), Some(&2));
-        }
-
-        // handle the different token set sizes
-        {
-            let tokens = vec![
-                (0, vec![vec![111, 123, 132]]),
-                (1, vec![vec![111], vec![123], vec![132]]),
-                (2, vec![vec![111, 123], vec![132]]),
-                (3, vec![vec![111, 111], vec![111]]),
-            ];
-            let (total_tokens, histogram) = token_histogram(&tokens);
-            assert_eq!(total_tokens, 12);
-            assert_eq!(histogram.get(&111), Some(&6));
-            assert_eq!(histogram.get(&123), Some(&3));
-            assert_eq!(histogram.get(&132), Some(&3));
-            // include bad tokens
-            assert_eq!(histogram.get(&fast_hash("http")), Some(&12));
-            assert_eq!(histogram.get(&fast_hash("www")), Some(&12));
-        }
-    }
-
     #[test]
     fn network_filter_list_new_works() {
         {
