diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index ef8bf4414..0a224d365 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -108,7 +108,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119

       - name: Install
         working-directory: ./bindings/python
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index f281d80a8..ef133814f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -94,7 +94,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
          command: audit
-          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119

       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 11df2a2aa..a46e71d5c 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -40,6 +40,11 @@ harness = false
 name = "llama3_benchmark"
 harness = false
 
+[[bench]]
+name = "added_vocab_deserialize"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 rand = "0.9"
 onig = { version = "6.5.1", default-features = false, optional = true }
diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs
new file mode 100644
index 000000000..4b79b4305
--- /dev/null
+++ b/tokenizers/benches/added_vocab_deserialize.rs
@@ -0,0 +1,86 @@
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+use std::hint::black_box;
+use std::str::FromStr;
+use tokenizers::{normalizers::*, AddedToken, Normalizer, Tokenizer};
+
+fn serialized_tokenizer<N: Normalizer + Into<NormalizerWrapper>>(
+    size: i64,
+    normalizer: Option<N>,
+    special_tokens: bool,
+) -> String {
+    let mut tokenizer = Tokenizer::from_pretrained("t5-small", None).unwrap();
+
+    if let Some(norm) = normalizer {
+        tokenizer.with_normalizer(Some(norm));
+    }
+
+    let tokens: Vec<_> = (0..size)
+        .map(|i| AddedToken::from(format!("tok{i}"), special_tokens))
+        .collect();
+    tokenizer.add_tokens(&tokens);
+
+    serde_json::to_string(&tokenizer).unwrap()
+}
+
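+// Measure `Tokenizer::from_str` on serialized tokenizers with large added
+// vocabularies: two sizes, every normalizer variant, and both special and
+// non-special added tokens.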
+#[allow(clippy::type_complexity)]
+fn bench_deserialize(c: &mut Criterion) {
+    let normalizers: Vec<(&str, Option<fn() -> NormalizerWrapper>)> = vec![
+        ("none", None),
+        ("byte_level", Some(|| ByteLevel.into())),
+        ("lowercase", Some(|| Lowercase.into())),
+        ("nfc", Some(|| NFC.into())),
+        ("nfd", Some(|| NFD.into())),
+        ("nfkc", Some(|| NFKC.into())),
+        ("nfkd", Some(|| NFKD.into())),
+        ("nmt", Some(|| Nmt.into())),
+        ("strip", Some(|| Strip::new(true, true).into())),
+        ("replace", Some(|| Replace::new("a", "b").unwrap().into())),
+        ("prepend", Some(|| Prepend::new("pre_".to_string()).into())),
+        ("bert", Some(|| BertNormalizer::default().into())),
+    ];
+
+    for &size in &[100_000, 400_000] {
+        for (norm_name, maybe_factory) in &normalizers {
+            let label = format!(
+                "special tokens deserialize_added_vocab_{}_norm_{}",
+                size, norm_name
+            );
+
+            let json = match maybe_factory {
+                Some(factory) => serialized_tokenizer(size, Some(factory()), true),
+                None => serialized_tokenizer::<NormalizerWrapper>(size, None, true),
+            };
+            c.bench_function(&label, |b| {
+                b.iter(|| {
+                    let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
+                    black_box(tok);
+                })
+            });
+
+            let label = format!(
+                "non special deserialize_added_vocab_{}_norm_{}",
+                size, norm_name
+            );
+
+            let json = match maybe_factory {
+                Some(factory) => serialized_tokenizer(size, Some(factory()), false),
+                None => serialized_tokenizer::<NormalizerWrapper>(size, None, false),
+            };
+            c.bench_function(&label, |b| {
+                b.iter(|| {
+                    let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
+                    black_box(tok);
+                })
+            });
+        }
+    }
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default().significance_level(0.1).sample_size(10);
+    targets = bench_deserialize
+}
+criterion_main!(benches);
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 5fa80ae6c..ff1a592a6 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -272,30 +272,35 @@ impl AddedVocabulary {
             }
         }
 
-        // Then we delegate to `add_tokens`, that will take care of refreshing added tokens too.
         let mut ignored = 0;
+
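+        // Precompute the set of already-added tokens and the next free id so that
+        // each token below is checked and assigned in O(1), instead of rescanning
+        // the whole added-tokens map on every iteration.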
+        let mut existing: AHashSet<AddedToken> =
+            self.added_tokens_map_r.values().cloned().collect();
+        let mut next_id = self.added_tokens_map_r.keys().copied().max().map_or(
+            model.get_vocab_size() as u32,
+            |max| {
+                if max >= model.get_vocab_size() as u32 || model.get_vocab_size() == 0 {
+                    max + 1
+                } else {
+                    model.get_vocab_size() as u32
+                }
+            },
+        );
+
         for token in tokens {
-            if token.content.is_empty() || self.added_tokens_map_r.values().any(|val| val == token)
-            {
+            if token.content.is_empty() || existing.contains(token) {
                 ignored += 1;
                 continue;
             }
-            // If a token is already part of the vocabulary, we mark it as added
+
             let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) {
                 new_id
             } else {
-                self.added_tokens_map.values().cloned().max().map_or(
-                    model.get_vocab_size() as u32,
-                    |max| {
-                        if (max >= model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
-                            max + 1
-                        } else {
-                            model.get_vocab_size() as u32
-                        }
-                    },
-                )
+                let id = next_id;
+                next_id += 1;
+                id
             };
-            // Make sure we modify the previous entry
+
             *self
                 .added_tokens_map
                 .entry(token.content.clone())
@@ -308,6 +313,7 @@
             if !self.special_tokens_set.contains(&token.content) {
                 self.added_tokens.push(token.clone());
             }
+            existing.insert(token.clone());
         }
 
         self.refresh_added_tokens(model, normalizer);
@@ -317,7 +323,7 @@
     }
 
     /// Reconstruct our internal RegexSet when new tokens are added to the vocabulary.
-    ///
+    /// # TODO @ArthurZucker we should probably make this async? rebuilding the regex takes a long time.
     /// We keep two different RegexSet, one that will take care of matching against the
     /// non-normalized string, and one matching against the normalized one.
     fn refresh_added_tokens<N: Normalizer>(&mut self, model: &impl Model, normalizer: Option<&N>) {
diff --git a/tokenizers/src/utils/truncation.rs b/tokenizers/src/utils/truncation.rs
index e9b392d2e..62c4c3bf0 100644
--- a/tokenizers/src/utils/truncation.rs
+++ b/tokenizers/src/utils/truncation.rs
@@ -49,19 +49,14 @@ pub enum TruncationError {
     SequenceTooShort,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, Default)]
 pub enum TruncationStrategy {
+    #[default]
     LongestFirst,
     OnlyFirst,
     OnlySecond,
 }
 
-impl Default for TruncationStrategy {
-    fn default() -> Self {
-        Self::LongestFirst
-    }
-}
-
 impl std::convert::AsRef<str> for TruncationStrategy {
     fn as_ref(&self) -> &str {
         match self {