
Commit f6ca318

Workaround for tokens that start with Unicode BOM (#236)
* Workaround for tokens that start with Unicode BOM

  The BOM is swallowed by JSONSerialization.jsonObject() because it thinks it signals the encoding in use. This workaround duplicates BOM sequences that follow a quote character.

  Fixes #116
  Fixes #88
  Fixes ml-explore/mlx-swift-examples#50

* Format 🙄
1 parent ec0ae14 commit f6ca318
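
To make the failure mode concrete, here is a minimal sketch of the before/after behavior, assuming the Data and JSONSerialization extensions introduced by this commit are in scope. The sample key mirrors the Gemma tokenizer entries referenced in the linked issues.

import Foundation

// A JSON object whose only key starts with U+FEFF (0xEF 0xBB 0xBF in UTF-8),
// mirroring the Gemma tokenizer entries mentioned in the linked issues.
let json = Data("{\"\u{FEFF}#\": 122661}".utf8)

do {
    // Per the commit message, JSONSerialization.jsonObject drops the BOM that follows
    // the opening quote, so the decoded key comes back as "#" rather than "\u{FEFF}#".
    let swallowed = try JSONSerialization.jsonObject(with: json) as? [String: Int]
    print(swallowed?.keys.first?.unicodeScalars.map { $0.value } ?? [])

    // The helper added in this commit duplicates the BOM before parsing, so one copy
    // survives and the decoded key keeps its leading U+FEFF.
    let preserved = try JSONSerialization.bomPreservingJsonObject(with: json) as? [String: Int]
    print(preserved?.keys.first?.unicodeScalars.map { $0.value } ?? [])
} catch {
    print("JSON parsing failed: \(error)")
}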

File tree

3 files changed: +61 -16 lines changed


Sources/Hub/BOMDoubling.swift

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
//
//  BOMDoubling.swift
//  swift-transformers
//
//  Created by Pedro Cuenca on 20250912
//

import Foundation

extension Data {
    /// Workaround for https://github.com/huggingface/swift-transformers/issues/116
    /// Duplicate a BOM sequence that follows a quote. The first BOM is swallowed by JSONSerialization.jsonObject
    /// because it thinks it marks the encoding.
    var duplicatingBOMsAfterQuotes: Data {
        withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
            let src = raw.bindMemory(to: UInt8.self)
            var out = [UInt8]()
            // We expect very few matches (only 6 for Gemma)
            out.reserveCapacity(src.count + 1000)

            var i = 0
            while i < src.count {
                let b = src[i]
                out.append(b)

                // Check for \u{feff} BOM (observed in Gemma tokenizers), which is encoded as 0xef 0xbb 0xbf.
                // We may need more combinations.
                if b == 0x22, i + 3 < src.count,
                   src[i + 1] == 0xEF, src[i + 2] == 0xBB, src[i + 3] == 0xBF
                {
                    // Duplicate BOM
                    out.append(0xEF); out.append(0xBB); out.append(0xBF)
                    out.append(0xEF); out.append(0xBB); out.append(0xBF)
                    i += 4
                } else {
                    i += 1
                }
            }
            return Data(out)
        }
    }
}

extension JSONSerialization {
    class func bomPreservingJsonObject(with data: Data, options: JSONSerialization.ReadingOptions = []) throws -> Any {
        try JSONSerialization.jsonObject(with: data.duplicatingBOMsAfterQuotes, options: options)
    }
}
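
As a byte-level sanity check, here is a small hypothetical snippet (assuming the Data extension above is available) showing that exactly one extra BOM is inserted after the opening quote:

import Foundation

// UTF-8 bytes for the JSON fragment {"\u{FEFF}#":1} — the BOM (EF BB BF) directly follows a quote.
let input = Data([0x7B, 0x22, 0xEF, 0xBB, 0xBF, 0x23, 0x22, 0x3A, 0x31, 0x7D])
let doubled = input.duplicatingBOMsAfterQuotes

// Only the BOM after the opening quote is duplicated, so the output grows by exactly 3 bytes;
// JSONSerialization can then swallow one copy while the other survives in the parsed key.
assert(doubled.count == input.count + 3)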

Sources/Hub/HubApi.swift

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ public extension HubApi {
     /// `fileURL` is a complete local file path for the given model
     func configuration(fileURL: URL) throws -> Config {
         let data = try Data(contentsOf: fileURL)
-        let parsed = try JSONSerialization.jsonObject(with: data, options: [])
+        let parsed = try JSONSerialization.bomPreservingJsonObject(with: data)
         guard let dictionary = parsed as? [NSString: Any] else { throw Hub.HubClientError.parse }
         return Config(dictionary)
     }
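
A hypothetical call site for the patched path, assuming HubApi can be constructed with its default initializer and that a tokenizer.json has already been downloaded locally (the path below is illustrative):

import Foundation
import Hub

// Hypothetical usage: parse a locally downloaded tokenizer.json through the patched path
// so that vocabulary keys beginning with U+FEFF survive parsing.
let hub = HubApi()
do {
    let config = try hub.configuration(fileURL: URL(fileURLWithPath: "/tmp/tokenizer.json"))
    print(config)
} catch {
    print("Failed to load config: \(error)")
}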

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 12 additions & 15 deletions
@@ -76,34 +76,31 @@ class GemmaTokenizerTests: TokenizerTests {
     override class var hubModelName: String? { "pcuenq/gemma-tokenizer" }
     override class var encodedSamplesFilename: String? { "gemma_encoded" }
     override class var unknownTokenId: Int? { 3 }
+}
 
-    func testUnicodeEdgeCase() async {
-        guard let tester = Self._tester else {
+class GemmaUnicodeTests: XCTestCase {
+    func testGemmaUnicode() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? PreTrainedTokenizer else {
             XCTFail()
             return
         }
 
         // These are two different characters
         let cases = ["a\u{300}" /* 0x61 0x300 */, "à" /* 0xe0 */ ]
         let expected = [217138, 1305]
-
-        // These are different characters
         for (s, expected) in zip(cases, expected) {
-            let encoded = await tester.tokenizer?.encode(text: " " + s)
+            let encoded = tokenizer.encode(text: " " + s)
             XCTAssertEqual(encoded, [2, expected])
         }
-    }
-}
 
-class GemmaUnicodeTests: XCTestCase {
-    func testGemmaVocab() async throws {
-        guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? PreTrainedTokenizer else {
-            XCTFail()
-            return
-        }
+        // Keys that start with BOM sequence
+        // https://github.com/huggingface/swift-transformers/issues/88
+        // https://github.com/ml-explore/mlx-swift-examples/issues/50#issuecomment-2046592213
+        XCTAssertEqual(tokenizer.convertIdToToken(122661), "\u{feff}#")
+        XCTAssertEqual(tokenizer.convertIdToToken(235345), "#")
 
-        // FIXME: This should be 256_000, I believe
-        XCTAssertEqual((tokenizer.model as? BPETokenizer)?.vocabCount, 255994)
+        // Verifies all expected entries are parsed
+        XCTAssertEqual((tokenizer.model as? BPETokenizer)?.vocabCount, 256_000)
     }
 }
