Skip to content

Commit 94ee0c1

Browse files
authored
Add XLM Roberta tokenizer (#233)
1 parent 5059cd4 commit 94ee0c1

File tree

2 files changed

+15
-0
lines changed

2 files changed

+15
-0
lines changed

Sources/Tokenizers/Tokenizer.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ struct TokenizerModel {
116116
"PreTrainedTokenizer": BPETokenizer.self,
117117
"Qwen2Tokenizer": BPETokenizer.self,
118118
"WhisperTokenizer": BPETokenizer.self,
119+
"XLMRobertaTokenizer": UnigramTokenizer.self,
119120
]
120121

121122
static func unknownToken(from tokenizerConfig: Config) -> String? {

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,20 @@ class PhiSimpleTests: XCTestCase {
120120
}
121121
}
122122

123+
/// Tests encoding with the tokenizer loaded for "intfloat/multilingual-e5-small".
/// NOTE(review): presumably this checkpoint resolves to the "XLMRobertaTokenizer"
/// entry registered in Tokenizer.swift by the same commit — confirm against the
/// model's tokenizer config.
class RobertaTokenizerTests: XCTestCase {
124+
/// https://github.com/huggingface/swift-transformers/issues/99
///
/// Loads the tokenizer via `AutoTokenizer.from(pretrained:)`, encodes one fixed
/// English query, and compares the resulting ids with a hard-coded list.
/// NOTE(review): `from(pretrained:)` presumably fetches tokenizer files from a
/// remote hub, so this test likely needs network access — confirm.
125+
func testRobertaXLMTokenizer() async throws {
126+
// The test needs the concrete `PreTrainedTokenizer` type; if the cast fails the
// test is marked failed and exits early.
guard let tokenizer = try await AutoTokenizer.from(pretrained: "intfloat/multilingual-e5-small") as? PreTrainedTokenizer else {
127+
// NOTE(review): XCTFail() carries no message, so a failure here does not say
// that the cast to PreTrainedTokenizer was the cause.
XCTFail()
128+
return
129+
}
130+
131+
let ids = tokenizer.encode(text: "query: how much protein should a female eat")
132+
// Expected ids for the query above.
// NOTE(review): presumably taken from a reference tokenizer implementation,
// with the leading 0 and trailing 2 being special tokens — verify the source.
let expected = [0, 41, 1294, 12, 3642, 5045, 21308, 5608, 10, 117776, 73203, 2]
133+
XCTAssertEqual(ids, expected)
134+
}
135+
}
136+
123137
class UnregisteredTokenizerTests: XCTestCase {
124138
func testNllbTokenizer() async throws {
125139
do {

0 commit comments

Comments
 (0)