Skip to content

Commit 3f9bb8b

Browse files
authored
Fix Swift 6 concurrency errors (#264)
* Adopt Sendable protocol for tokenizer protocols and conforming types * Annotate @MainActor for tests using shared instance of tokenizer * Conform tokenizers to Sendable without @unchecked * Fix data race in PreTrainedTokenizer
1 parent b62cda1 commit 3f9bb8b

File tree

6 files changed

+37
-14
lines changed

6 files changed

+37
-14
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import Foundation
1010
import Hub
1111

1212
/// A pair of byte/token strings used in Byte-Pair Encoding (BPE) merge operations.
13-
struct BytePair: Hashable {
13+
struct BytePair: Hashable, Sendable {
1414
let a: String
1515
let b: String
1616
init(_ a: String, _ b: String) {
@@ -38,7 +38,7 @@ struct BytePair: Hashable {
3838
/// BPE tokenizers learn to merge the most frequently occurring pairs of characters
3939
/// or character sequences. This implementation supports various BPE-based models
4040
/// including GPT-2, RoBERTa, and other transformer models.
41-
class BPETokenizer: PreTrainedTokenizerModel {
41+
class BPETokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
4242
let bpeRanks: [BytePair: Int]
4343
private let tokensToIds: [NSString: Int]
4444
private let idsToTokens: [Int: NSString]

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import Hub
1414
/// This tokenizer performs basic tokenization (whitespace and punctuation splitting)
1515
/// followed by WordPiece subword tokenization, which is the approach used by BERT
1616
/// and related models.
17-
public class BertTokenizer {
17+
public final class BertTokenizer: Sendable {
1818
private let basicTokenizer: BasicTokenizer
1919
private let wordpieceTokenizer: WordpieceTokenizer
2020
private let maxLen = 512
@@ -27,16 +27,16 @@ public class BertTokenizer {
2727
private let ids_to_tokens: [Int: String]
2828

2929
/// The beginning-of-sequence token string, if defined.
30-
public var bosToken: String?
30+
public let bosToken: String?
3131

3232
/// The numeric ID of the beginning-of-sequence token, if defined.
33-
public var bosTokenId: Int?
33+
public let bosTokenId: Int?
3434

3535
/// The end-of-sequence token string, if defined.
36-
public var eosToken: String?
36+
public let eosToken: String?
3737

3838
/// The numeric ID of the end-of-sequence token, if defined.
39-
public var eosTokenId: Int?
39+
public let eosTokenId: Int?
4040

4141
/// Whether consecutive unknown tokens should be fused together.
4242
public let fuseUnknownTokens: Bool
@@ -225,7 +225,7 @@ extension BertTokenizer: PreTrainedTokenizerModel {
225225
}
226226
}
227227

228-
class BasicTokenizer {
228+
final class BasicTokenizer: Sendable {
229229
let doLowerCase: Bool
230230

231231
init(doLowerCase: Bool = true) {
@@ -291,7 +291,7 @@ private extension Character {
291291
}
292292
}
293293

294-
class WordpieceTokenizer {
294+
final class WordpieceTokenizer: Sendable {
295295
let unkToken = "[UNK]"
296296
private let maxInputCharsPerWord = 100
297297
private let vocab: [String: Int]

Sources/Tokenizers/Tokenizer.swift

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ public enum ChatTemplateArgument {
215215
///
216216
/// This is the main protocol that defines all tokenizer operations, including text processing,
217217
/// chat template application, and special token handling.
218-
public protocol Tokenizer {
218+
public protocol Tokenizer: Sendable {
219219
/// Tokenizes the input text into a sequence of tokens.
220220
///
221221
/// - Parameter text: The input text to tokenize
@@ -451,7 +451,7 @@ let specialTokenAttributes: [String] = [
451451
/// This class provides a complete tokenizer implementation that can be initialized from
452452
/// Hugging Face Hub configuration files and supports all standard tokenization operations
453453
/// including chat template application, normalization, pre-tokenization, and post-processing.
454-
public class PreTrainedTokenizer: Tokenizer {
454+
public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
455455
let model: TokenizingModel
456456

457457
public var bosToken: String? { model.bosToken }
@@ -477,6 +477,9 @@ public class PreTrainedTokenizer: Tokenizer {
477477
/// Cache for compiled Jinja templates keyed by their literal template string
478478
private var compiledChatTemplateCache: [String: Template] = [:]
479479

480+
/// Lock to protect the compiled chat template cache from concurrent access
481+
private let cacheLock = NSLock()
482+
480483
/// Initializes a tokenizer from Hugging Face configuration files.
481484
///
482485
/// - Parameters:
@@ -531,10 +534,26 @@ public class PreTrainedTokenizer: Tokenizer {
531534
}
532535

533536
private func compiledTemplate(for templateString: String) throws -> Template {
537+
// Fast path: check cache under lock
538+
cacheLock.lock()
534539
if let cached = compiledChatTemplateCache[templateString] {
540+
cacheLock.unlock()
535541
return cached
536542
}
543+
cacheLock.unlock()
544+
545+
// Compile template outside of lock to avoid holding lock during expensive operation
537546
let compiled = try Template(templateString)
547+
548+
// Insert into cache under lock (using double-checked locking pattern)
549+
cacheLock.lock()
550+
defer { cacheLock.unlock() }
551+
552+
// Check again in case another thread compiled the same template
553+
if let cached = compiledChatTemplateCache[templateString] {
554+
return cached
555+
}
556+
538557
compiledChatTemplateCache[templateString] = compiled
539558
return compiled
540559
}
@@ -907,7 +926,7 @@ public extension AutoTokenizer {
907926

908927
// MARK: - Tokenizer model classes
909928

910-
class T5Tokenizer: UnigramTokenizer {}
929+
class T5Tokenizer: UnigramTokenizer, @unchecked Sendable {}
911930

912931
// MARK: - PreTrainedTokenizer classes
913932

@@ -956,7 +975,7 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?)
956975
}
957976

958977
/// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
959-
class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
978+
class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
960979
let isLegacy: Bool
961980

962981
required init(tokenizerConfig: Config, tokenizerData: Config, strict: Bool = true) throws {

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import Hub
1414
/// Unigram tokenizers use a probabilistic approach where each token has a score,
1515
/// and the tokenization process finds the most probable segmentation of the input text.
1616
/// This is commonly used in models like T5 and XLM-RoBERTa.
17-
class UnigramTokenizer: PreTrainedTokenizerModel {
17+
class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
1818
/// A token with its associated score in the Unigram model.
1919
struct SentencePieceToken {
2020
var token: String

Tests/TokenizersTests/BertTokenizerTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ private enum Squad {
8383

8484
// MARK: -
8585

86+
@MainActor
8687
private let bertTokenizer: BertTokenizer = {
8788
let vocab = {
8889
let url = Bundle.module.url(forResource: "bert-vocab", withExtension: "txt")!
@@ -101,6 +102,7 @@ private let bertTokenizer: BertTokenizer = {
101102
// MARK: -
102103

103104
@Suite("BERT Tokenizer Tests")
105+
@MainActor
104106
struct BertTokenizerTests {
105107
@Test("Basic tokenizer correctly tokenizes text")
106108
func testBasicTokenizer() {

Tests/TokenizersTests/ChatTemplateTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ struct ChatTemplateTests {
1919
]
2020
]
2121

22+
@MainActor
2223
static let phiTokenizerTask = Task {
2324
try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
2425
}
@@ -27,6 +28,7 @@ struct ChatTemplateTests {
2728
try await phiTokenizerTask.value
2829
}
2930

31+
@MainActor
3032
static let tokenizerWithTemplateArrayTask = Task {
3133
try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
3234
}

0 commit comments

Comments
 (0)