You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: README.md
+30-2Lines changed: 30 additions & 2 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -88,6 +88,36 @@ example converting and running Mistral 7B using CoreML [here](https://github.com
88
88
89
89
The [modernization of Core ML](https://github.com/huggingface/swift-transformers/pull/257) and corresponding examples were primarily contributed by @joshnewnham, @1duo, @alejandro-isaza, @aseemw. Thank you 🙏
90
90
91
+
### Offline CoreML tokenizers
92
+
93
+
When you bundle a compiled CoreML model and tokenizer files with your app, you can skip any network requests by injecting
94
+
the tokenizer when constructing `LanguageModel`:
95
+
96
+
```swift
97
+
let compiledURL: URL = ... // path to .mlmodelc
98
+
let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json
99
+
100
+
// Construct the tokenizer from local files (inside an async context)
101
+
let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
102
+
let model = try LanguageModel.loadCompiled(
103
+
url: compiledURL,
104
+
tokenizer: tokenizer
105
+
)
106
+
```
107
+
108
+
Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint or are compatible with the model you use. For the
109
+
Mistral example in `Examples/Mistral7B/`, you can fetch the tokenizer like this:
110
+
111
+
```bash
112
+
huggingface-cli download \
113
+
mistralai/Mistral-7B-Instruct-v0.3 \
114
+
tokenizer.json tokenizer_config.json \
115
+
--local-dir Examples/Mistral7B/local-tokenizer
116
+
```
117
+
118
+
If the repo is gated, authenticate with `huggingface-cli login` first. Both entry points (`LanguageModel.loadCompiled(url:tokenizer:)` and `LanguageModel(model:tokenizer:)`) reuse the tokenizer
119
+
you pass in and never reach out to the Hugging Face Hub.
120
+
91
121
## Usage via SwiftPM
92
122
93
123
To use `swift-transformers` with SwiftPM, you can add this to your `Package.swift`:
@@ -139,5 +169,3 @@ To format your code, run `swift format -i --recursive .`.
@@ -304,7 +331,8 @@ public extension LanguageModel {
304
331
/// - Throws: An error if the configuration cannot be loaded
305
332
var modelConfig: Config? {
306
333
get async throws {
307
-
try await configuration!.modelConfig
334
+
guard let configuration else { return nil }
335
+
return try await configuration.modelConfig
308
336
}
309
337
}
310
338
@@ -314,7 +342,8 @@ public extension LanguageModel {
314
342
/// - Throws: An error if the configuration cannot be loaded
315
343
var tokenizerConfig: Config? {
316
344
get async throws {
317
-
try await configuration!.tokenizerConfig
345
+
guard let configuration else { return nil }
346
+
return try await configuration.tokenizerConfig
318
347
}
319
348
}
320
349
@@ -324,7 +353,10 @@ public extension LanguageModel {
324
353
/// - Throws: An error if the tokenizer data cannot be loaded
325
354
var tokenizerData: Config {
326
355
get async throws {
327
-
try await configuration!.tokenizerData
356
+
guard let configuration else {
357
+
throw TokenizerError.missingConfig
358
+
}
359
+
return try await configuration.tokenizerData
328
360
}
329
361
}
330
362
@@ -459,8 +491,11 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
459
491
460
492
var state: MLState?
461
493
462
-
public required init(model: MLModel) {
463
-
super.init(model: model)
494
+
public required init(
495
+
model: MLModel,
496
+
tokenizer: Tokenizer? = nil
497
+
) {
498
+
super.init(model: model, tokenizer: tokenizer)
464
499
// To support pre-filling and extend, the input must support
465
500
// flexible shapes.
466
501
guard maxContextLength - minContextLength > 1 else {
@@ -531,11 +566,15 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
531
566
public enum TokenizerError: LocalizedError {
532
567
/// The tokenizer configuration file could not be found.
533
568
case tokenizerConfigNotFound
569
+
/// The language model configuration required to load tokenizer data is missing.
570
+
case missingConfig
534
571
535
572
public var errorDescription: String? {
536
573
switch self {
537
574
case .tokenizerConfigNotFound:
538
575
String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing")
576
+
case .missingConfig:
577
+
String(localized: "Language model configuration was not set, tokenizer assets could not be loaded.", comment: "Error when configuration needed for tokenizer data is missing")
Copy file name to clipboardExpand all lines: Tests/TokenizersTests/TokenizerTests.swift
+33Lines changed: 33 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -177,6 +177,39 @@ struct TokenizerTests {
177
177
#expect(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>") == [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
0 commit comments