Skip to content

Commit ec0ae14

Browse files
authored
Fix legacy behaviour (#234)
* Fix legacy behaviour. There was a case I had not considered 😢 Fixes #96
* Update Sources/Tokenizers/Tokenizer.swift
* No force unwrap
1 parent 94ee0c1 commit ec0ae14

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

Sources/Tokenizers/Tokenizer.swift

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,9 @@ public class PreTrainedTokenizer: Tokenizer {
279279
public var unknownTokenId: Int? { model.unknownTokenId }
280280
public var fuseUnknownTokens: Bool { model.fuseUnknownTokens }
281281

282-
private let addedTokens: Set<String>
283-
private let specialTokens: [String: Int]
284-
private let addedTokensRegex: NSRegularExpression?
282+
let addedTokens: Set<String>
283+
let specialTokens: [String: Int]
284+
let addedTokensRegex: NSRegularExpression?
285285

286286
private let preTokenizer: PreTokenizer?
287287
private let normalizer: Normalizer?
@@ -722,4 +722,18 @@ class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
722722
let updatedData = Config(configDictionary)
723723
try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData, strict: strict)
724724
}
725+
726+
/// If `isLegacy` is `false`, a prefix token is added unless the first token is special.
727+
/// https://github.com/huggingface/transformers/blob/e6dcf8abd6f65bb4b6dfc1831b20d9ba49ce00e2/src/transformers/models/t5/tokenization_t5.py#L374-L387
728+
override func tokenize(text: String) -> [String] {
729+
if isLegacy || text.isEmpty {
730+
return super.tokenize(text: text)
731+
}
732+
733+
let tokens = super.tokenize(text: sentencePieceUnderline + text.replacingOccurrences(of: sentencePieceUnderline, with: " "))
734+
if tokens.first == sentencePieceUnderline, let second = tokens.dropFirst().first, specialTokens[second] != nil {
735+
return Array(tokens[1...])
736+
}
737+
return tokens
738+
}
725739
}

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,17 @@ class PhiSimpleTests: XCTestCase {
118118
XCTAssertEqual(tokenizer.encode(text: "hello world"), [15339, 1917])
119119
XCTAssertEqual(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>"), [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
120120
}
121+
122+
/// https://github.com/huggingface/swift-transformers/issues/96
123+
func testLegacyLlamaBehaviour() async throws {
124+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Phi-3-mini-4k-instruct-4bit-no-q-embed") as? PreTrainedTokenizer else {
125+
XCTFail()
126+
return
127+
}
128+
129+
let inputIds = tokenizer(" Hi")
130+
XCTAssertEqual(inputIds, [1, 29871, 6324])
131+
}
121132
}
122133

123134
class RobertaTokenizerTests: XCTestCase {

0 commit comments

Comments
 (0)