Merged
36 commits
7cf18aa
Add benchmark for deserializing large added vocab
ArthurZucker May 27, 2025
47c2e9f
revert dumb stuff, isolate changes
ArthurZucker May 27, 2025
a8f6a71
try to only normalize once
ArthurZucker May 27, 2025
35a9427
Merge branch 'main' into codex/optimize-addedvocabulary-deserializati…
ArthurZucker May 27, 2025
8ba1d20
small improvement?
ArthurZucker May 27, 2025
a71555c
Merge branch 'codex/optimize-addedvocabulary-deserialization-and-add-…
ArthurZucker May 27, 2025
6714ceb
some updates
ArthurZucker May 27, 2025
e07ecfc
nit
ArthurZucker May 27, 2025
8849d71
fmt
ArthurZucker May 27, 2025
5da668a
normalized string are a fucking waste of time when you just want to a…
ArthurZucker May 27, 2025
8e7ce86
more attempts
ArthurZucker May 27, 2025
948eead
works
ArthurZucker May 27, 2025
43cef92
let's fucking go, parity
ArthurZucker May 27, 2025
ae8a7b4
update
ArthurZucker May 27, 2025
44beeb7
hahahhahaha
ArthurZucker May 27, 2025
e7f8954
revert changes that are not actually even needed
ArthurZucker May 27, 2025
8d49849
add a python test!
ArthurZucker May 27, 2025
d8f07fa
use normalizer before come on
ArthurZucker May 27, 2025
f6df603
nit
ArthurZucker May 27, 2025
96a9563
Merge branch 'main' of github.com:huggingface/tokenizers into codex/o…
ArthurZucker Nov 27, 2025
bd671d1
update to a more concrete usecase
ArthurZucker Nov 27, 2025
236f8ce
fix build
ArthurZucker Nov 27, 2025
2f12a63
style
ArthurZucker Nov 27, 2025
ae4b990
reduce sample size
ArthurZucker Nov 27, 2025
e20d5c7
--allow unmaintained
ArthurZucker Nov 27, 2025
8423fc8
clippy happy
ArthurZucker Nov 27, 2025
a6b0a4d
up
ArthurZucker Nov 27, 2025
1bf0820
Merge branch 'main' of github.com:huggingface/tokenizers into codex/o…
ArthurZucker Nov 27, 2025
7c69aea
up
ArthurZucker Nov 27, 2025
756c014
derive impl
ArthurZucker Nov 27, 2025
669f78d
revert unrelated
ArthurZucker Nov 27, 2025
c998b42
fmt
ArthurZucker Nov 27, 2025
80de975
ignore
ArthurZucker Nov 27, 2025
3c24367
Merge branch 'main' into codex/optimize-addedvocabulary-deserializati…
ArthurZucker Nov 27, 2025
ed86d8b
remove stupid file
ArthurZucker Nov 27, 2025
5f9db87
Merge branch 'codex/optimize-addedvocabulary-deserialization-and-add-…
ArthurZucker Nov 27, 2025
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -108,7 +108,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: audit
args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119 --ignore RUSTSEC-2024-0436

- name: Install
working-directory: ./bindings/python
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
@@ -94,7 +94,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: audit
args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119
Member:
Could be an opportunity to do some cleaning within our deps, but I don't know the context that much.

Collaborator (Author):
Yeah, just unmaintained...


# Verify that Readme.md is up to date.
- name: Make sure, Readme generated from lib.rs matches actual Readme
5 changes: 5 additions & 0 deletions tokenizers/Cargo.toml
@@ -40,6 +40,11 @@ harness = false
name = "llama3_benchmark"
harness = false

[[bench]]
name = "added_vocab_deserialize"
required-features = ["http"]
harness = false

[dependencies]
rand = "0.9"
onig = { version = "6.5.1", default-features = false, optional = true }
86 changes: 86 additions & 0 deletions tokenizers/benches/added_vocab_deserialize.rs
@@ -0,0 +1,86 @@
#[macro_use]
extern crate criterion;
use criterion::Criterion;
use std::hint::black_box;
use std::str::FromStr;
use tokenizers::{normalizers::*, AddedToken, Normalizer, Tokenizer};

fn serialized_tokenizer<N: Normalizer + Into<NormalizerWrapper>>(
size: i64,
normalizer: Option<N>,
special_tokens: bool,
) -> String {
let mut tokenizer = Tokenizer::from_pretrained("t5-small", None).unwrap();

if let Some(norm) = normalizer {
tokenizer.with_normalizer(Some(norm));
}

let tokens: Vec<_> = (0..size)
.map(|i| AddedToken::from(format!("tok{i}"), special_tokens))
.collect();
tokenizer.add_tokens(&tokens);

serde_json::to_string(&tokenizer).unwrap()
}

#[allow(clippy::type_complexity)]
fn bench_deserialize(c: &mut Criterion) {
let normalizers: Vec<(&str, Option<fn() -> NormalizerWrapper>)> = vec![
("none", None),
("byte_level", Some(|| ByteLevel.into())),
("lowercase", Some(|| Lowercase.into())),
("nfc", Some(|| NFC.into())),
("nfd", Some(|| NFD.into())),
("nfkc", Some(|| NFKC.into())),
("nfkd", Some(|| NFKD.into())),
("nmt", Some(|| Nmt.into())),
("strip", Some(|| Strip::new(true, true).into())),
("replace", Some(|| Replace::new("a", "b").unwrap().into())),
("prepend", Some(|| Prepend::new("pre_".to_string()).into())),
("bert", Some(|| BertNormalizer::default().into())),
];

for &size in &[100_000, 400_000] {
for (norm_name, maybe_factory) in &normalizers {
let label = format!(
"special tokens deserialize_added_vocab_{}_norm_{}",
size, norm_name
);

let json = match maybe_factory {
Some(factory) => serialized_tokenizer(size, Some(factory()), true),
None => serialized_tokenizer::<NormalizerWrapper>(size, None, true),
};
c.bench_function(&label, |b| {
b.iter(|| {
let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
black_box(tok);
})
});

let label = format!(
"non special deserialize_added_vocab_{}_norm_{}",
size, norm_name
);

let json = match maybe_factory {
Some(factory) => serialized_tokenizer(size, Some(factory()), false),
None => serialized_tokenizer::<NormalizerWrapper>(size, None, false),
};
c.bench_function(&label, |b| {
b.iter(|| {
let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
black_box(tok);
})
});
}
}
}

criterion_group! {
name = benches;
config = Criterion::default().significance_level(0.1).sample_size(10);
targets = bench_deserialize
}
criterion_main!(benches);
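Given the `[[bench]]` entry added to `Cargo.toml` above (with `required-features = ["http"]`, presumably because `Tokenizer::from_pretrained` needs the `http` feature), this benchmark should be runnable from the `tokenizers/` crate directory with something like `cargo bench --bench added_vocab_deserialize --features http`.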
38 changes: 22 additions & 16 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -272,30 +272,35 @@ impl AddedVocabulary {
}
}

// Then we delegate to `add_tokens`, that will take care of refreshing added tokens too.
let mut ignored = 0;

let mut existing: AHashSet<AddedToken> =
self.added_tokens_map_r.values().cloned().collect();
let mut next_id = self.added_tokens_map_r.keys().copied().max().map_or(
model.get_vocab_size() as u32,
|max| {
if max >= model.get_vocab_size() as u32 || model.get_vocab_size() == 0 {
max + 1
} else {
model.get_vocab_size() as u32
}
},
);

for token in tokens {
if token.content.is_empty() || self.added_tokens_map_r.values().any(|val| val == token)
{
if token.content.is_empty() || existing.contains(token) {
ignored += 1;
continue;
}
// If a token is already part of the vocabulary, we mark it as added

let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) {
new_id
} else {
self.added_tokens_map.values().cloned().max().map_or(
model.get_vocab_size() as u32,
|max| {
if (max >= model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
max + 1
} else {
model.get_vocab_size() as u32
}
},
)
let id = next_id;
next_id += 1;
id
};
// Make sure we modify the previous entry

*self
.added_tokens_map
.entry(token.content.clone())
@@ -308,6 +313,7 @@ impl AddedVocabulary {
if !self.special_tokens_set.contains(&token.content) {
self.added_tokens.push(token.clone());
}
existing.insert(token.clone());
}

self.refresh_added_tokens(model, normalizer);
@@ -317,7 +323,7 @@ impl AddedVocabulary {
}

/// Reconstruct our internal RegexSet when new tokens are added to the vocabulary.
///
/// # TODO @ArthurZucker we should probably make this async? rebuilding the regex takes a long time.
/// We keep two different RegexSet, one that will take care of matching against the
/// non-normalized string, and one matching against the normalized one.
fn refresh_added_tokens<N: Normalizer>(&mut self, model: &impl Model, normalizer: Option<&N>) {
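The heart of the PR is the change above: instead of doing a linear duplicate check over `added_tokens_map_r` and a `max()` scan over the existing ids for every incoming token (roughly O(n²) when a tokenizer with a large added vocabulary is deserialized), it builds an `AHashSet` of existing tokens and computes `next_id` once, then bumps a counter. The following is a minimal standalone sketch of that pattern, not the crate's actual code: `Token`, the plain std `HashMap`/`HashSet`, and the two free functions are simplified stand-ins for `AddedToken`, `added_tokens_map_r`, and `add_tokens`.

```rust
use std::collections::{HashMap, HashSet};

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Token {
    content: String,
}

// Old path (simplified): every incoming token triggers a linear scan of the
// reverse map to detect duplicates, plus a max() scan to find the next free id.
// That is O(n) work per token, i.e. O(n^2) for a large serialized added vocab.
fn add_tokens_quadratic(map_r: &mut HashMap<u32, Token>, tokens: &[Token], vocab_size: u32) {
    for token in tokens {
        if token.content.is_empty() || map_r.values().any(|t| t == token) {
            continue;
        }
        let next_id = map_r.keys().copied().max().map_or(vocab_size, |max| {
            if max >= vocab_size || vocab_size == 0 {
                max + 1
            } else {
                vocab_size
            }
        });
        map_r.insert(next_id, token.clone());
    }
}

// New path (simplified): hash the existing tokens once, compute the starting id
// once, then do O(1) work per token and just bump a counter.
fn add_tokens_linear(map_r: &mut HashMap<u32, Token>, tokens: &[Token], vocab_size: u32) {
    let mut existing: HashSet<Token> = map_r.values().cloned().collect();
    let mut next_id = map_r.keys().copied().max().map_or(vocab_size, |max| {
        if max >= vocab_size || vocab_size == 0 {
            max + 1
        } else {
            vocab_size
        }
    });
    for token in tokens {
        if token.content.is_empty() || existing.contains(token) {
            continue;
        }
        map_r.insert(next_id, token.clone());
        existing.insert(token.clone());
        next_id += 1;
    }
}

fn main() {
    let tokens: Vec<Token> = (0..5)
        .map(|i| Token { content: format!("tok{i}") })
        .collect();

    let mut fast = HashMap::new();
    add_tokens_linear(&mut fast, &tokens, 32_000);

    let mut slow = HashMap::new();
    add_tokens_quadratic(&mut slow, &tokens, 32_000);

    // Both strategies assign the same ids; only the cost per token differs.
    assert_eq!(fast, slow);
    println!("assigned ids: {:?}", fast.keys().collect::<Vec<_>>());
}
```

With 100k–400k added tokens (the sizes used in the benchmark), this turns two linear scans per token into a single hash lookup, which is the path the `added_vocab_deserialize` benchmark exercises.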
9 changes: 2 additions & 7 deletions tokenizers/src/utils/truncation.rs
@@ -49,19 +49,14 @@ pub enum TruncationError {
SequenceTooShort,
}

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, Default)]
pub enum TruncationStrategy {
#[default]
LongestFirst,
OnlyFirst,
OnlySecond,
}

impl Default for TruncationStrategy {
fn default() -> Self {
Self::LongestFirst
}
}

impl std::convert::AsRef<str> for TruncationStrategy {
fn as_ref(&self) -> &str {
match self {
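Unrelated cleanup in `truncation.rs`: `TruncationStrategy` now derives `Default` using the `#[default]` variant attribute (stable since Rust 1.62) instead of the hand-written `impl Default`. A standalone sketch of that pattern, with the serde derives dropped so it compiles on its own:

```rust
// The derive expands to an `impl Default` that returns the `#[default]` variant,
// which is why the manual impl below the enum could be deleted.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
enum TruncationStrategy {
    #[default]
    LongestFirst,
    OnlyFirst,
    OnlySecond,
}

fn main() {
    assert_eq!(TruncationStrategy::default(), TruncationStrategy::LongestFirst);
}
```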