Skip to content

Commit 5ba776a

Browse files
authored
Tests: reuse a couple of tokenizers (#221)
* Tests: reuse a couple of tokenizers
* I hate formatters
1 parent 5e4e6a9 commit 5ba776a

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

Tests/TokenizersTests/ChatTemplateTests.swift

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,24 @@ class ChatTemplateTests: XCTestCase {
1414
"content": "Describe the Swift programming language.",
1515
]]
1616

17+
/// Task that loads the tokenizer for `microsoft/Phi-3-mini-128k-instruct` via
/// `AutoTokenizer.from(pretrained:)`.
///
/// Held in a static property so that tests can await one shared result
/// (see `sharedPhiTokenizer()`) instead of each calling
/// `AutoTokenizer.from(pretrained:)` themselves.
// NOTE(review): if the load throws, every awaiter of this task presumably sees
// the same error and nothing retries — confirm that is acceptable for the suite.
static let phiTokenizerTask = Task {
18+
try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
19+
}
20+
21+
/// Returns the tokenizer produced by `phiTokenizerTask`, suspending until the
/// task's value is available.
///
/// - Returns: The `Tokenizer` the task resolved to.
/// - Throws: Whatever error `phiTokenizerTask` finished with.
static func sharedPhiTokenizer() async throws -> Tokenizer {
22+
try await phiTokenizerTask.value
23+
}
24+
25+
/// Task that loads the tokenizer for `mlx-community/Mistral-7B-Instruct-v0.3-4bit`
/// via `AutoTokenizer.from(pretrained:)`.
///
/// Held in a static property so that tests can await one shared result
/// (see `sharedTokenizerWithTemplateArray()`).
// NOTE(review): the name suggests this model's config stores its chat templates
// as an array — inferred from the identifier only; confirm against the model config.
static let tokenizerWithTemplateArrayTask = Task {
26+
try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
27+
}
28+
29+
/// Returns the tokenizer produced by `tokenizerWithTemplateArrayTask`,
/// suspending until the task's value is available.
///
/// - Returns: The `Tokenizer` the task resolved to.
/// - Throws: Whatever error `tokenizerWithTemplateArrayTask` finished with.
static func sharedTokenizerWithTemplateArray() async throws -> Tokenizer {
30+
try await tokenizerWithTemplateArrayTask.value
31+
}
32+
1733
func testTemplateFromConfig() async throws {
18-
let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
34+
let tokenizer = try await Self.sharedPhiTokenizer()
1935
let encoded = try tokenizer.applyChatTemplate(messages: messages)
2036
let encodedTarget = [32010, 4002, 29581, 278, 14156, 8720, 4086, 29889, 32007, 32001]
2137
let decoded = tokenizer.decode(tokens: encoded)
@@ -36,7 +52,7 @@ class ChatTemplateTests: XCTestCase {
3652
}
3753

3854
func testDefaultTemplateFromArrayInConfig() async throws {
39-
let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
55+
let tokenizer = try await Self.sharedTokenizerWithTemplateArray()
4056
let encoded = try tokenizer.applyChatTemplate(messages: messages)
4157
let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4]
4258
let decoded = tokenizer.decode(tokens: encoded)
@@ -46,7 +62,7 @@ class ChatTemplateTests: XCTestCase {
4662
}
4763

4864
func testTemplateFromArgumentWithEnum() async throws {
49-
let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
65+
let tokenizer = try await Self.sharedPhiTokenizer()
5066
// Purposely not using the correct template for this model to verify that the template from the config is not being used
5167
let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
5268
let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: .literal(mistral7BDefaultTemplate))
@@ -58,7 +74,7 @@ class ChatTemplateTests: XCTestCase {
5874
}
5975

6076
func testTemplateFromArgumentWithString() async throws {
61-
let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
77+
let tokenizer = try await Self.sharedPhiTokenizer()
6278
// Purposely not using the correct template for this model to verify that the template from the config is not being used
6379
let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
6480
let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate)
@@ -70,7 +86,7 @@ class ChatTemplateTests: XCTestCase {
7086
}
7187

7288
func testNamedTemplateFromArgument() async throws {
73-
let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
89+
let tokenizer = try await Self.sharedTokenizerWithTemplateArray()
7490
// Normally it is not necessary to specify the name `default`, but I'm not aware of models with lists of templates in the config that are not `default` or `tool_use`
7591
let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: .name("default"))
7692
let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4]

0 commit comments

Comments
 (0)