Skip to content

Commit 7fb91a7

Browse files
committed
more possible optimizations
1 parent cfe1f20 commit 7fb91a7

File tree

2 files changed

+8
-9
lines changed

2 files changed

+8
-9
lines changed

+bert/+tokenizer/+internal/FullTokenizer.m

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -85,9 +85,10 @@
8585
% tokens = tokenize(tokenizer,text) tokenizes the input
8686
% string text using the FullTokenizer specified by tokenizer.
8787
basicToks = this.Basic.tokenize(txt);
88+
basicToksUnicode = textanalytics.unicode.UTF32(basicToks);
8889
subToks = cell(numel(basicToks),1);
8990
for i = 1:numel(basicToks)
90-
subToks{i} = this.WordPiece.tokenize(basicToks{i});
91+
subToks{i} = this.WordPiece.tokenize(basicToksUnicode(i));
9192
end
9293
toks = cat(2,subToks{:});
9394
end

+bert/+tokenizer/+internal/WordPieceTokenizer.m

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -37,16 +37,15 @@
3737
this.Vocab = this.parseVocab(vocab);
3838
end
3939

40-
function tokens = tokenize(this,text)
40+
function tokens = tokenize(this,utext)
4141
arguments
4242
this
43-
text (1,1) string
43+
utext
4444
end
4545
tokens = string.empty();
46-
wsTokens = this.WhitespaceTokenizer.tokenize(text);
47-
wsTokensU = textanalytics.unicode.UTF32(wsTokens);
48-
for i = 1:numel(wsTokensU)
49-
token = wsTokensU(i);
46+
sub = textanalytics.unicode.UTF32();
47+
for i = 1:numel(utext)
48+
token = utext(i);
5049
if numel(token.Data)>this.MaxChar
5150
tokens = [tokens,this.Unk]; %#ok
5251
continue
@@ -57,8 +56,7 @@
5756
while start<(numel(token.Data)+1)
5857
finish = numel(token.Data);
5958
currentSub = [];
60-
while start<finish+1
61-
sub = textanalytics.unicode.UTF32();
59+
while start<finish+1
6260
sub.Data = token.Data(start:finish);
6361
if start>1
6462
sub.Data = [uint32('##'),sub.Data];

0 commit comments

Comments
 (0)