Skip to content

Commit 3e0d206

Browse files
committed
Update test to the new WordPiece tokenizer API
1 parent fdb64ee commit 3e0d206

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

test/bert/tokenizer/internal/tWordPieceTokenizer.m

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ function canSetUnknownToken(test)
3939
tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'UnknownToken',unk);
4040
test.verifyEqual(tok.Unk,unk)
4141
str = "blah";
42-
act_out = tok.tokenize(str);
42+
ustr = textanalytics.unicode.UTF32(str);
43+
act_out = tok.tokenize(ustr);
4344
exp_out = unk;
4445
test.verifyEqual(act_out,exp_out);
4546
end
@@ -50,7 +51,8 @@ function canSetMaxTokenLength(test)
5051
tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'MaxTokenLength',maxLen);
5152
test.verifyEqual(tok.MaxChar,maxLen);
5253
str = "foo";
53-
act_out = tok.tokenize(str);
54+
ustr = textanalytics.unicode.UTF32(str);
55+
act_out = tok.tokenize(ustr);
5456
exp_out = tok.Unk;
5557
test.verifyEqual(act_out,exp_out);
5658
end
@@ -59,7 +61,9 @@ function canTokenize(test)
5961
enc = wordEncoding(["foo","bar","##foo"]);
6062
tok = bert.tokenizer.internal.WordPieceTokenizer(enc);
6163
str = "foo bar foobar barba bafoobar barfoo";
62-
act_out = tok.tokenize(str);
64+
wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
65+
ustr = textanalytics.unicode.UTF32(wsTok.tokenize(str));
66+
act_out = tok.tokenize(ustr);
6367
exp_out = ["foo","bar",tok.Unk,tok.Unk,tok.Unk,"bar","##foo"];
6468
test.verifyEqual(act_out,exp_out);
6569
end

0 commit comments

Comments (0)