
Commit d6a4acc

Update serialization (#1891)
* Add benchmark for deserializing large added vocab
* revert dumb stuff, isolate changes
* try to only normalize once
* small improvement?
* some updates
* nit
* fmt
* normalized string are a fucking waste of time when you just want to add tokens to the vocab man....
* more attempts
* works
* let's fucking go, parity
* update
* hahahhahaha
* revert changes that are not actually even needed
* add a python test!
* use normalizer before come on
* nit
* update to a more concrete usecase
* fix build
* style
* reduce sample size
* --allow unmaintained
* clippy happy
* up
* up
* derive impl
* revert unrelated
* fmt
* ignore
* remove stupid file
1 parent 47e4ffe commit d6a4acc

6 files changed: +117 -25 lines


.github/workflows/python.yml

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119 --ignore RUSTSEC-2024-0436
 
       - name: Install
         working-directory: ./bindings/python

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119
 
       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
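
In both workflows the cargo audit step now additionally ignores RUSTSEC-2025-0119. Note that the python.yml args also end up listing --ignore RUSTSEC-2024-0436 twice; repeating the same ignore flag is presumably harmless, just redundant.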

tokenizers/Cargo.toml

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,11 @@ harness = false
 name = "llama3_benchmark"
 harness = false
 
+[[bench]]
+name = "added_vocab_deserialize"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 rand = "0.9"
 onig = { version = "6.5.1", default-features = false, optional = true }
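
A note on the new [[bench]] target: required-features = ["http"] means cargo only builds it when the http feature is enabled, presumably because the benchmark below fetches "t5-small" through Tokenizer::from_pretrained. Running it locally should therefore look something like cargo bench --bench added_vocab_deserialize --features http.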

tokenizers/benches/added_vocab_deserialize.rs

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
#[macro_use]
extern crate criterion;
use criterion::Criterion;
use std::hint::black_box;
use std::str::FromStr;
use tokenizers::{normalizers::*, AddedToken, Normalizer, Tokenizer};

fn serialized_tokenizer<N: Normalizer + Into<NormalizerWrapper>>(
    size: i64,
    normalizer: Option<N>,
    special_tokens: bool,
) -> String {
    let mut tokenizer = Tokenizer::from_pretrained("t5-small", None).unwrap();

    if let Some(norm) = normalizer {
        tokenizer.with_normalizer(Some(norm));
    }

    let tokens: Vec<_> = (0..size)
        .map(|i| AddedToken::from(format!("tok{i}"), special_tokens))
        .collect();
    tokenizer.add_tokens(&tokens);

    serde_json::to_string(&tokenizer).unwrap()
}

#[allow(clippy::type_complexity)]
fn bench_deserialize(c: &mut Criterion) {
    let normalizers: Vec<(&str, Option<fn() -> NormalizerWrapper>)> = vec![
        ("none", None),
        ("byte_level", Some(|| ByteLevel.into())),
        ("lowercase", Some(|| Lowercase.into())),
        ("nfc", Some(|| NFC.into())),
        ("nfd", Some(|| NFD.into())),
        ("nfkc", Some(|| NFKC.into())),
        ("nfkd", Some(|| NFKD.into())),
        ("nmt", Some(|| Nmt.into())),
        ("strip", Some(|| Strip::new(true, true).into())),
        ("replace", Some(|| Replace::new("a", "b").unwrap().into())),
        ("prepend", Some(|| Prepend::new("pre_".to_string()).into())),
        ("bert", Some(|| BertNormalizer::default().into())),
    ];

    for &size in &[100_000, 400_000] {
        for (norm_name, maybe_factory) in &normalizers {
            let label = format!(
                "special tokens deserialize_added_vocab_{}_norm_{}",
                size, norm_name
            );

            let json = match maybe_factory {
                Some(factory) => serialized_tokenizer(size, Some(factory()), true),
                None => serialized_tokenizer::<NormalizerWrapper>(size, None, true),
            };
            c.bench_function(&label, |b| {
                b.iter(|| {
                    let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
                    black_box(tok);
                })
            });

            let label = format!(
                "non special deserialize_added_vocab_{}_norm_{}",
                size, norm_name
            );

            let json = match maybe_factory {
                Some(factory) => serialized_tokenizer(size, Some(factory()), false),
                None => serialized_tokenizer::<NormalizerWrapper>(size, None, false),
            };
            c.bench_function(&label, |b| {
                b.iter(|| {
                    let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap());
                    black_box(tok);
                })
            });
        }
    }
}

criterion_group! {
    name = benches;
    config = Criterion::default().significance_level(0.1).sample_size(10);
    targets = bench_deserialize
}
criterion_main!(benches);
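
The Criterion configuration at the bottom (sample_size(10) with a 0.1 significance level) corresponds to the "reduce sample size" step in the commit message: every sample deserializes a tokenizer carrying 100,000 or 400,000 added tokens, so Criterion's default sample count would presumably make the run impractically slow.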

tokenizers/src/tokenizer/added_vocabulary.rs

Lines changed: 22 additions & 16 deletions
@@ -272,30 +272,35 @@ impl AddedVocabulary {
             }
         }
 
-        // Then we delegate to `add_tokens`, that will take care of refreshing added tokens too.
         let mut ignored = 0;
+
+        let mut existing: AHashSet<AddedToken> =
+            self.added_tokens_map_r.values().cloned().collect();
+        let mut next_id = self.added_tokens_map_r.keys().copied().max().map_or(
+            model.get_vocab_size() as u32,
+            |max| {
+                if max >= model.get_vocab_size() as u32 || model.get_vocab_size() == 0 {
+                    max + 1
+                } else {
+                    model.get_vocab_size() as u32
+                }
+            },
+        );
+
         for token in tokens {
-            if token.content.is_empty() || self.added_tokens_map_r.values().any(|val| val == token)
-            {
+            if token.content.is_empty() || existing.contains(token) {
                 ignored += 1;
                 continue;
             }
-            // If a token is already part of the vocabulary, we mark it as added
+
             let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) {
                 new_id
             } else {
-                self.added_tokens_map.values().cloned().max().map_or(
-                    model.get_vocab_size() as u32,
-                    |max| {
-                        if (max >= model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
-                            max + 1
-                        } else {
-                            model.get_vocab_size() as u32
-                        }
-                    },
-                )
+                let id = next_id;
+                next_id += 1;
+                id
             };
-            // Make sure we modify the previous entry
+
             *self
                 .added_tokens_map
                 .entry(token.content.clone())

@@ -308,6 +313,7 @@ impl AddedVocabulary {
             if !self.special_tokens_set.contains(&token.content) {
                 self.added_tokens.push(token.clone());
             }
+            existing.insert(token.clone());
         }
 
         self.refresh_added_tokens(model, normalizer);

@@ -317,7 +323,7 @@ impl AddedVocabulary {
     }
 
     /// Reconstruct our internal RegexSet when new tokens are added to the vocabulary.
-    ///
+    /// # TODO @ArthurZucker we should probably make this async? rebuilding the regex takes a long time.
     /// We keep two different RegexSet, one that will take care of matching against the
     /// non-normalized string, and one matching against the normalized one.
     fn refresh_added_tokens<N: Normalizer>(&mut self, model: &impl Model, normalizer: Option<&N>) {
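
The reworked loop above is essentially a complexity fix: the duplicate check moves from a linear scan over added_tokens_map_r to a prebuilt AHashSet, and the next id comes from a running counter instead of recomputing the maximum of the map for every token. Below is a minimal standalone sketch of that pattern, using plain std types and hypothetical helper names rather than the crate's code, to show why it matters once hundreds of thousands of tokens are added:

    use std::collections::HashSet;

    // Old shape: every token rescans all existing entries (the previous
    // `values().any(...)` and `values().max()` calls), O(n^2) overall.
    fn add_all_rescan(existing: &mut Vec<String>, new: &[String]) {
        for t in new {
            if !existing.iter().any(|e| e == t) {
                existing.push(t.clone());
            }
        }
    }

    // New shape: one hash lookup per token plus a running id counter,
    // mirroring the `existing: AHashSet` and `next_id` variables above, O(n) overall.
    fn add_all_hashed(existing: &mut HashSet<String>, new: &[String], next_id: &mut u32) {
        for t in new {
            if existing.insert(t.clone()) {
                *next_id += 1;
            }
        }
    }

    fn main() {
        let new: Vec<String> = (0..100_000).map(|i| format!("tok{i}")).collect();

        let mut vec_existing = Vec::new();
        add_all_rescan(&mut vec_existing, &new[..1_000]); // quadratic, so keep the input small

        let mut set_existing = HashSet::new();
        let mut next_id = 0u32;
        add_all_hashed(&mut set_existing, &new, &mut next_id); // linear, handles the full list easily
        assert_eq!(next_id, 100_000);
    }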

tokenizers/src/utils/truncation.rs

Lines changed: 2 additions & 7 deletions
@@ -49,19 +49,14 @@ pub enum TruncationError {
     SequenceTooShort,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq, Default)]
 pub enum TruncationStrategy {
+    #[default]
     LongestFirst,
     OnlyFirst,
     OnlySecond,
 }
 
-impl Default for TruncationStrategy {
-    fn default() -> Self {
-        Self::LongestFirst
-    }
-}
-
 impl std::convert::AsRef<str> for TruncationStrategy {
     fn as_ref(&self) -> &str {
         match self {
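
The truncation change is a small cleanup rather than a behavior change: since Rust 1.62, #[derive(Default)] works on enums when exactly one unit variant carries #[default], which makes the hand-written impl Default redundant. A minimal standalone illustration of the pattern (a hypothetical Strategy enum, not the crate's type):

    // `#[default]` selects the variant returned by `Default::default()`.
    #[derive(Debug, Default, PartialEq)]
    enum Strategy {
        #[default]
        LongestFirst,
        OnlyFirst,
        OnlySecond,
    }

    fn main() {
        assert_eq!(Strategy::default(), Strategy::LongestFirst);
    }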
