
Commit f6ca318

Workaround for tokens that start with Unicode BOM (#236)
* Workaround for tokens that start with Unicode BOM

  The BOM is swallowed by JSONSerialization.jsonObject() because it thinks it signals the encoding in use. This workaround duplicates BOM sequences that follow a quote character.

  Fixes #116
  Fixes #88
  Fixes ml-explore/mlx-swift-examples#50

* Format 🙄
1 parent ec0ae14 commit f6ca318
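
To make the failure mode concrete, here is a minimal sketch of the before/after behavior, assuming the Data and JSONSerialization extensions introduced by this commit are in scope. The sample key mirrors the Gemma tokenizer entries referenced in the linked issues.

import Foundation

// A JSON object whose only key starts with U+FEFF (0xEF 0xBB 0xBF in UTF-8),
// mirroring the Gemma tokenizer entries mentioned in the linked issues.
let json = Data("{\"\u{FEFF}#\": 122661}".utf8)

do {
    // Per the commit message, JSONSerialization.jsonObject drops the BOM that follows
    // the opening quote, so the decoded key comes back as "#" rather than "\u{FEFF}#".
    let swallowed = try JSONSerialization.jsonObject(with: json) as? [String: Int]
    print(swallowed?.keys.first?.unicodeScalars.map { $0.value } ?? [])

    // The helper added in this commit duplicates the BOM before parsing, so one copy
    // survives and the decoded key keeps its leading U+FEFF.
    let preserved = try JSONSerialization.bomPreservingJsonObject(with: json) as? [String: Int]
    print(preserved?.keys.first?.unicodeScalars.map { $0.value } ?? [])
} catch {
    print("JSON parsing failed: \(error)")
}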

File tree

3 files changed: +61 -16 lines changed


Sources/Hub/BOMDoubling.swift

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
//
//  BOMDoubling.swift
//  swift-transformers
//
//  Created by Pedro Cuenca on 20250912
//

import Foundation

extension Data {
    /// Workaround for https://github.com/huggingface/swift-transformers/issues/116
    /// Duplicate a BOM sequence that follows a quote. The first BOM is swallowed by JSONSerialization.jsonObject
    /// because it thinks it marks the encoding.
    var duplicatingBOMsAfterQuotes: Data {
        withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
            let src = raw.bindMemory(to: UInt8.self)
            var out = [UInt8]()
            // We expect very few matches (only 6 for Gemma)
            out.reserveCapacity(src.count + 1000)

            var i = 0
            while i < src.count {
                let b = src[i]
                out.append(b)

                // Check for \u{feff} BOM (observed in Gemma tokenizers), which is encoded as 0xef 0xbb 0xbf.
                // We may need more combinations.
                if b == 0x22, i + 3 < src.count,
                   src[i + 1] == 0xEF, src[i + 2] == 0xBB, src[i + 3] == 0xBF
                {
                    // Duplicate BOM
                    out.append(0xEF); out.append(0xBB); out.append(0xBF)
                    out.append(0xEF); out.append(0xBB); out.append(0xBF)
                    i += 4
                } else {
                    i += 1
                }
            }
            return Data(out)
        }
    }
}

extension JSONSerialization {
    class func bomPreservingJsonObject(with data: Data, options: JSONSerialization.ReadingOptions = []) throws -> Any {
        try JSONSerialization.jsonObject(with: data.duplicatingBOMsAfterQuotes, options: options)
    }
}
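
As a byte-level sanity check, here is a small hypothetical snippet (assuming the Data extension above is available) showing that exactly one extra BOM is inserted after the opening quote:

import Foundation

// UTF-8 bytes for the JSON fragment {"\u{FEFF}#":1} — the BOM (EF BB BF) directly follows a quote.
let input = Data([0x7B, 0x22, 0xEF, 0xBB, 0xBF, 0x23, 0x22, 0x3A, 0x31, 0x7D])
let doubled = input.duplicatingBOMsAfterQuotes

// Only the BOM after the opening quote is duplicated, so the output grows by exactly 3 bytes;
// JSONSerialization can then swallow one copy while the other survives in the parsed key.
assert(doubled.count == input.count + 3)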

Sources/Hub/HubApi.swift

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ public extension HubApi {
     /// `fileURL` is a complete local file path for the given model
     func configuration(fileURL: URL) throws -> Config {
         let data = try Data(contentsOf: fileURL)
-        let parsed = try JSONSerialization.jsonObject(with: data, options: [])
+        let parsed = try JSONSerialization.bomPreservingJsonObject(with: data)
         guard let dictionary = parsed as? [NSString: Any] else { throw Hub.HubClientError.parse }
         return Config(dictionary)
     }
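
A hypothetical call site for the patched path, assuming HubApi can be constructed with its default initializer and that a tokenizer.json has already been downloaded locally (the path below is illustrative):

import Foundation
import Hub

// Hypothetical usage: parse a locally downloaded tokenizer.json through the patched path
// so that vocabulary keys beginning with U+FEFF survive parsing.
let hub = HubApi()
do {
    let config = try hub.configuration(fileURL: URL(fileURLWithPath: "/tmp/tokenizer.json"))
    print(config)
} catch {
    print("Failed to load config: \(error)")
}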

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 12 additions & 15 deletions
@@ -76,34 +76,31 @@ class GemmaTokenizerTests: TokenizerTests {
     override class var hubModelName: String? { "pcuenq/gemma-tokenizer" }
     override class var encodedSamplesFilename: String? { "gemma_encoded" }
     override class var unknownTokenId: Int? { 3 }
+}
 
-    func testUnicodeEdgeCase() async {
-        guard let tester = Self._tester else {
+class GemmaUnicodeTests: XCTestCase {
+    func testGemmaUnicode() async throws {
+        guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? PreTrainedTokenizer else {
             XCTFail()
             return
         }
 
         // These are two different characters
         let cases = ["a\u{300}" /* 0x61 0x300 */, "à" /* 0xe0 */ ]
         let expected = [217138, 1305]
-
-        // These are different characters
         for (s, expected) in zip(cases, expected) {
-            let encoded = await tester.tokenizer?.encode(text: " " + s)
+            let encoded = tokenizer.encode(text: " " + s)
             XCTAssertEqual(encoded, [2, expected])
         }
-    }
-}
 
-class GemmaUnicodeTests: XCTestCase {
-    func testGemmaVocab() async throws {
-        guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/gemma-tokenizer") as? PreTrainedTokenizer else {
-            XCTFail()
-            return
-        }
+        // Keys that start with BOM sequence
+        // https://github.com/huggingface/swift-transformers/issues/88
+        // https://github.com/ml-explore/mlx-swift-examples/issues/50#issuecomment-2046592213
+        XCTAssertEqual(tokenizer.convertIdToToken(122661), "\u{feff}#")
+        XCTAssertEqual(tokenizer.convertIdToToken(235345), "#")
 
-        // FIXME: This should be 256_000, I believe
-        XCTAssertEqual((tokenizer.model as? BPETokenizer)?.vocabCount, 255994)
+        // Verifies all expected entries are parsed
+        XCTAssertEqual((tokenizer.model as? BPETokenizer)?.vocabCount, 256_000)
     }
 }
