File tree Expand file tree Collapse file tree 2 files changed +8
-9
lines changed
+bert/+tokenizer/+internal Expand file tree Collapse file tree 2 files changed +8
-9
lines changed Original file line number Diff line number Diff line change 8585 % tokens = tokenize(tokenizer,text) tokenizes the input
8686 % string text using the FullTokenizer specified by tokenizer.
8787 basicToks = this .Basic .tokenize(txt );
88+ basicToksUnicode = textanalytics .unicode .UTF32(basicToks );
8889 subToks = cell(numel(basicToks ),1 );
8990 for i = 1 : numel(basicToks )
90- subToks{i } = this .WordPiece .tokenize(basicToks{ i } );
91+ subToks{i } = this .WordPiece .tokenize(basicToksUnicode( i ) );
9192 end
9293 toks = cat(2 ,subToks{: });
9394 end
Original file line number Diff line number Diff line change 3737 this.Vocab = this .parseVocab(vocab );
3838 end
3939
40- function tokens = tokenize(this ,text )
40+ function tokens = tokenize(this ,utext )
4141 arguments
4242 this
43- text ( 1 , 1 ) string
43+ utext
4444 end
4545 tokens = string .empty();
46- wsTokens = this .WhitespaceTokenizer .tokenize(text );
47- wsTokensU = textanalytics .unicode .UTF32(wsTokens );
48- for i = 1 : numel(wsTokensU )
49- token = wsTokensU(i );
46+ sub = textanalytics .unicode .UTF32();
47+ for i = 1 : numel(utext )
48+ token = utext(i );
5049 if numel(token .Data )>this .MaxChar
5150 tokens = [tokens ,this .Unk ]; % #ok
5251 continue
5756 while start <(numel(token .Data )+1 )
5857 finish = numel(token .Data );
5958 currentSub = [];
60- while start < finish + 1
61- sub = textanalytics .unicode .UTF32();
59+ while start < finish + 1
6260 sub.Data = token .Data(start : finish );
6361 if start > 1
6462 sub.Data = [uint32(' ##' ),sub .Data ];
You can’t perform that action at this time.
0 commit comments