Skip to content

Commit 9dad29e

Browse files
kashif and pcuenca authored
lets callers inject a prebuilt Tokenizer in the LanguageModel (#278)
* lets callers inject a prebuilt Tokenizer * add a tokenizerFolder argument * Update Examples/transformers-cli/Sources/transformers-cli/Transformers.swift Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * Update README.md Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * Update Sources/Models/LanguageModel.swift Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * Update Sources/Models/LanguageModel.swift Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * Update README.md Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * removed configuration * add tokenizerFromLocalFolder test * add fixtures * simplified test * fix names * Update Sources/Models/LanguageModel.swift Co-authored-by: Pedro Cuenca <pedro@huggingface.co> --------- Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
1 parent 479eeef commit 9dad29e

File tree

6 files changed

+187
-14
lines changed

6 files changed

+187
-14
lines changed

Examples/transformers-cli/Sources/transformers-cli/Transformers.swift

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ struct TransformersCLI: AsyncParsableCommand {
4949
@Option(help: "Repetition penalty to discourage repeating tokens (typical: 1.0-2.0, 1.0 = no penalty)")
5050
var repetitionPenalty: Float?
5151

52+
@Option(help: "Path to a local folder containing tokenizer_config.json and tokenizer.json")
53+
var tokenizerFolder: String?
54+
5255
func generate(
5356
model: LanguageModel,
5457
config: GenerationConfig,
@@ -104,7 +107,17 @@ struct TransformersCLI: AsyncParsableCommand {
104107
let url = URL(filePath: modelPath)
105108
let compiledURL = try compile(at: url)
106109
print("Loading model \(compiledURL)")
107-
let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
110+
let model: LanguageModel
111+
if let tokenizerFolder {
112+
let tokenizerURL = URL(filePath: tokenizerFolder, directoryHint: .isDirectory)
113+
model = try LanguageModel.loadCompiled(
114+
url: compiledURL,
115+
tokenizerFolder: tokenizerURL,
116+
computeUnits: computeUnits.asMLComputeUnits
117+
)
118+
} else {
119+
model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
120+
}
108121

109122
var config = model.defaultGenerationConfig
110123
config.doSample = doSample

README.md

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,36 @@ example converting and running Mistral 7B using CoreML [here](https://github.com
8888

8989
The [modernization of Core ML](https://github.com/huggingface/swift-transformers/pull/257) and corresponding examples were primarily contributed by @joshnewnham, @1duo, @alejandro-isaza, @aseemw. Thank you 🙏
9090

91+
### Offline CoreML tokenizers
92+
93+
When you bundle a compiled CoreML model and tokenizer files with your app, you can skip any network requests by injecting
94+
the tokenizer when constructing `LanguageModel`:
95+
96+
```swift
97+
let compiledURL: URL = ... // path to .mlmodelc
98+
let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json
99+
100+
// Construct the tokenizer from local files (inside an async context)
101+
let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
102+
let model = try LanguageModel.loadCompiled(
103+
url: compiledURL,
104+
tokenizer: tokenizer
105+
)
106+
```
107+
108+
Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint or are compatible with the model you use. For the
109+
Mistral example in `Examples/Mistral7B/`, you can fetch the tokenizer like this:
110+
111+
```bash
112+
huggingface-cli download \
113+
mistralai/Mistral-7B-Instruct-v0.3 \
114+
tokenizer.json tokenizer_config.json \
115+
--local-dir Examples/Mistral7B/local-tokenizer
116+
```
117+
118+
If the repo is gated, authenticate with `huggingface-cli login` first. Both initializers reuse the tokenizer
119+
you pass in and never reach out to the Hugging Face Hub.
120+
91121
## Usage via SwiftPM
92122

93123
To use `swift-transformers` with SwiftPM, you can add this to your `Package.swift`:
@@ -139,5 +169,3 @@ To format your code, run `swift format -i --recursive .`.
139169
## License
140170

141171
[Apache 2](LICENSE).
142-
143-

Sources/Models/LanguageModel.swift

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,26 @@ public class LanguageModel {
3333

3434
/// Creates a new language model instance from a CoreML model.
3535
///
36-
/// - Parameter model: The CoreML model to wrap
36+
/// - Parameters:
37+
/// - model: The CoreML model to wrap
38+
/// - tokenizer: Optional preconstructed tokenizer to reuse
3739
/// - Important: Triggers a fatal error if the model doesn't have the expected input shape information
38-
public required init(model: MLModel) {
40+
public required init(
41+
model: MLModel,
42+
tokenizer: Tokenizer? = nil
43+
) {
3944
self.model = model
45+
_tokenizer = tokenizer
4046
(minContextLength, maxContextLength) = Self.contextRange(from: model)
41-
configuration = LanguageModelConfigurationFromHub(modelName: modelName)
47+
if tokenizer == nil {
48+
self.configuration = LanguageModelConfigurationFromHub(modelName: modelName)
49+
} else {
50+
self.configuration = nil
51+
}
52+
}
53+
54+
public convenience required init(model: MLModel) {
55+
self.init(model: model, tokenizer: nil)
4256
}
4357

4458
public func resetState() async {}
@@ -142,15 +156,28 @@ public extension LanguageModel {
142156
/// - Parameters:
143157
/// - url: The URL of the compiled CoreML model file (.mlmodelc)
144158
/// - computeUnits: The compute units to use for model inference
159+
/// - tokenizer: Optional tokenizer instance to reuse
145160
/// - Returns: A configured `LanguageModel` instance
146161
/// - Throws: An error if the model cannot be loaded from the specified URL
147-
static func loadCompiled(url: URL, computeUnits: MLComputeUnits = .cpuAndGPU) throws -> LanguageModel {
162+
static func loadCompiled(
163+
url: URL,
164+
computeUnits: MLComputeUnits = .cpuAndGPU,
165+
tokenizer: Tokenizer? = nil
166+
) throws -> LanguageModel {
148167
let config = MLModelConfiguration()
149168
config.computeUnits = computeUnits
150169
let model = try MLModel(contentsOf: url, configuration: config)
151170
return switch kvCacheAvailability(for: model) {
152-
case .statefulKVCache: LanguageModelWithStatefulKVCache(model: model)
153-
default: LanguageModel(model: model)
171+
case .statefulKVCache:
172+
LanguageModelWithStatefulKVCache(
173+
model: model,
174+
tokenizer: tokenizer
175+
)
176+
default:
177+
LanguageModel(
178+
model: model,
179+
tokenizer: tokenizer
180+
)
154181
}
155182
}
156183
}
@@ -304,7 +331,8 @@ public extension LanguageModel {
304331
/// - Throws: An error if the configuration cannot be loaded
305332
var modelConfig: Config? {
306333
get async throws {
307-
try await configuration!.modelConfig
334+
guard let configuration else { return nil }
335+
return try await configuration.modelConfig
308336
}
309337
}
310338

@@ -314,7 +342,8 @@ public extension LanguageModel {
314342
/// - Throws: An error if the configuration cannot be loaded
315343
var tokenizerConfig: Config? {
316344
get async throws {
317-
try await configuration!.tokenizerConfig
345+
guard let configuration else { return nil }
346+
return try await configuration.tokenizerConfig
318347
}
319348
}
320349

@@ -324,7 +353,10 @@ public extension LanguageModel {
324353
/// - Throws: An error if the tokenizer data cannot be loaded
325354
var tokenizerData: Config {
326355
get async throws {
327-
try await configuration!.tokenizerData
356+
guard let configuration else {
357+
throw TokenizerError.missingConfig
358+
}
359+
return try await configuration.tokenizerData
328360
}
329361
}
330362

@@ -459,8 +491,11 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
459491

460492
var state: MLState?
461493

462-
public required init(model: MLModel) {
463-
super.init(model: model)
494+
public required init(
495+
model: MLModel,
496+
tokenizer: Tokenizer? = nil
497+
) {
498+
super.init(model: model, tokenizer: tokenizer)
464499
// To support pre-filling and extend, the input must support
465500
// flexible shapes.
466501
guard maxContextLength - minContextLength > 1 else {
@@ -531,11 +566,15 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
531566
public enum TokenizerError: LocalizedError {
532567
/// The tokenizer configuration file could not be found.
533568
case tokenizerConfigNotFound
569+
/// The language model configuration required to load tokenizer data is missing.
570+
case missingConfig
534571

535572
public var errorDescription: String? {
536573
switch self {
537574
case .tokenizerConfigNotFound:
538575
String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing")
576+
case .missingConfig:
577+
String(localized: "Language model configuration was not set, tokenizer assets could not be loaded.", comment: "Error when configuration needed for tokenizer data is missing")
539578
}
540579
}
541580
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"version": "1.0",
3+
"truncation": null,
4+
"padding": null,
5+
"added_tokens": [
6+
{
7+
"id": 0,
8+
"content": "<bos>"
9+
},
10+
{
11+
"id": 1,
12+
"content": "<pad>"
13+
},
14+
{
15+
"id": 2,
16+
"content": "<eos>"
17+
},
18+
{
19+
"id": 3,
20+
"content": "<unk>"
21+
}
22+
],
23+
"model": {
24+
"type": "BPE",
25+
"vocab": {
26+
"<bos>": 0,
27+
"<pad>": 1,
28+
"<eos>": 2,
29+
"<unk>": 3,
30+
"offline": 4,
31+
"path": 5,
32+
"_": 6
33+
},
34+
"merges": [
35+
"off line",
36+
"li ne",
37+
"pa th",
38+
"_ of",
39+
"_ pa"
40+
],
41+
"continuing_subword_prefix": "",
42+
"end_of_word_suffix": "",
43+
"unk_token": "<unk>"
44+
},
45+
"normalizer": {
46+
"type": "Lowercase"
47+
},
48+
"pre_tokenizer": {
49+
"type": "Whitespace"
50+
}
51+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"tokenizer_class": "GPT2Tokenizer",
3+
"bos_token": "<bos>",
4+
"eos_token": "<eos>",
5+
"unk_token": "<unk>",
6+
"pad_token": "<pad>",
7+
"model_max_length": 128,
8+
"do_lower_case": false
9+
}

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,39 @@ struct TokenizerTests {
177177
#expect(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>") == [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
178178
}
179179

180+
@Test
181+
func tokenizerFromLocalFolder() async throws {
182+
let bundle = Bundle.module
183+
guard
184+
let tokenizerConfigURL = bundle.url(
185+
forResource: "tokenizer_config",
186+
withExtension: "json"
187+
),
188+
bundle.url(
189+
forResource: "tokenizer",
190+
withExtension: "json"
191+
) != nil
192+
else {
193+
Issue.record("Missing offline tokenizer fixtures")
194+
return
195+
}
196+
197+
let configuration = LanguageModelConfigurationFromHub(modelFolder: tokenizerConfigURL.deletingLastPathComponent())
198+
199+
let tokenizerConfigOpt = try await configuration.tokenizerConfig
200+
#expect(tokenizerConfigOpt != nil)
201+
let tokenizerConfig = tokenizerConfigOpt!
202+
let tokenizerData = try await configuration.tokenizerData
203+
204+
let tokenizer = try AutoTokenizer.from(
205+
tokenizerConfig: tokenizerConfig,
206+
tokenizerData: tokenizerData
207+
)
208+
209+
let encoded = tokenizer.encode(text: "offline path")
210+
#expect(!encoded.isEmpty)
211+
}
212+
180213
/// https://github.com/huggingface/swift-transformers/issues/96
181214
@Test
182215
func legacyLlamaBehaviour() async throws {

0 commit comments

Comments (0)