@@ -196,7 +196,7 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                  include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
         super().__init__()
 
-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # RobertaEmbedding sets padding_idx to 1 and uses a rather unusual position computation, hence the -2
         self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
@@ -233,14 +233,14 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                 word = '<unk>'
             elif vocab.word_count[word] < min_freq:
                 word = '<unk>'
-            word_pieces = self.tokenzier.tokenize(word)
-            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
+            word_pieces = self.tokenizer.tokenize(word)
+            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
             word_to_wordpieces.append(word_pieces)
             word_pieces_lengths.append(len(word_pieces))
-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
         self._word_pad_index = vocab.padding_idx
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']  # needed when generating word_piece
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # needed when generating word_piece
         self.word_to_wordpieces = np.array(word_to_wordpieces)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
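
For reference, the word-to-wordpiece mapping built in the loop above can be reproduced standalone. A minimal sketch, assuming the HuggingFace transformers RobertaTokenizer behaves like the tokenizer bundled with the library (the word below is illustrative):

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

word = 'embedding'
word_pieces = tokenizer.tokenize(word)                    # BPE pieces; the exact split depends on the vocab
piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)  # ids stored per word in word_to_wordpieces

# special-token ids come from the tokenizer's vocab, as in __init__ above
cls_index = tokenizer.encoder['<s>']
sep_index = tokenizer.encoder['</s>']
pad_index = tokenizer.encoder['<pad>']
```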
@@ -352,20 +352,19 @@ def forward(self, words):
         return outputs
 
     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)
 
 
 class RobertaWordPieceEncoder(nn.Module):
     r"""
-    Loads a bert model; after loading, call the index_datasets method to generate the word_pieces column in the dataset.
+    Loads a roberta model; after loading, call the index_datasets method to generate the word_pieces column in the dataset.
 
     RobertaWordPieceEncoder supports automatic downloading of weights. Currently supported models:
         en: roberta-base
         en-large: roberta-large
 
     """
-
     def __init__(self, model_dir_or_name: str = 'en', layers: str = '-1', pooled_cls: bool = False,
                  word_dropout=0, dropout=0, requires_grad: bool = True, **kwargs):
         r"""
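
A minimal usage sketch for the encoder described in the docstring above, assuming fastNLP's DataSet API; the field name and sentences are illustrative:

```python
from fastNLP import DataSet
from fastNLP.embeddings import RobertaWordPieceEncoder

ds = DataSet({'raw_words': ['This is a demo sentence .', 'Another one .']})

encoder = RobertaWordPieceEncoder(model_dir_or_name='en', layers='-1', requires_grad=False)
# adds a word_pieces column (with <s>/</s> attached) to ds and marks it as input
encoder.index_datasets(ds, field_name='raw_words', add_cls_sep=True)
```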
@@ -417,11 +416,10 @@ def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_spa
 
     def forward(self, word_pieces, token_type_ids=None):
         r"""
-        Computes the bert embedding representation of words. The words passed in should already contain the [CLS] and [SEP] tags.
+        Computes the bert embedding representation of words. The words passed in should already contain the <s> and </s> tags.
 
         :param words: batch_size x max_len
-        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it is not needed):
-            tokens up to and including the first [SEP] get 0, tokens up to and including the second [SEP] get 1, then 0 again up to the third [SEP], and so on.
+        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it is not needed).
         :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
         """
         word_pieces = self.drop_word(word_pieces)
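
Continuing the usage sketch above, forward then takes the indexed pieces and returns one vector per word piece (the piece ids below are illustrative; 0 and 2 are the <s> and </s> ids in the RoBERTa vocab):

```python
import torch

word_pieces = torch.LongTensor([[0, 713, 16, 10, 1296, 479, 2]])  # batch_size x max_len
reps = encoder(word_pieces)  # batch_size x max_len x (768 * len(layers))
```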
@@ -484,7 +482,7 @@ class _WordPieceRobertaModel(nn.Module):
     def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool = False):
         super().__init__()
 
-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # check that encoder_layer_number is valid
         encoder_layer_number = len(self.encoder.encoder.layer)
@@ -504,25 +502,25 @@ def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=
                 assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                     f"a RoBERTa model with {encoder_layer_number} layers."
 
-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']  # needed when generating word_piece
-        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # needed when generating word_piece
+        self._wordpiece_unknown_index = self.tokenizer.encoder['<unk>']
         self.pooled_cls = pooled_cls
 
     def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True):
         r"""
-        Uses the bert tokenizer to generate a new word_pieces column, adds it to the datasets, and sets it as input. If the sequence does not
-        start and end with [CLS] and [SEP], they are added at the start and end, and the pad value of the word_pieces column is set to bert's pad value.
+        Uses the roberta tokenizer to generate a new word_pieces column, adds it to the datasets, and sets it as input. If the sequence does not
+        start and end with <s> and </s>, they are added at the start and end, and the pad value of the word_pieces column is set to roberta's pad value.
 
         :param datasets: DataSet objects
-        :param field_name: which column to index on
+        :param field_name: which column to index on; this column is usually a raw string
         :param bool add_cls_sep: whether to add the cls and sep indices at the start and end of the sentence
         :param bool add_prefix_space: whether to add a space at the beginning of the sentence; RoBERTa used True during pre-training
         :return:
         """
 
-        encode_func = partial(self.tokenzier.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)
+        encode_func = partial(self.tokenizer.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)
 
         for index, dataset in enumerate(datasets):
             try:
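
As a rough illustration of what the encode function built with partial above produces (shown with the HuggingFace transformers tokenizer, where add_prefix_space is set at construction time; the bundled tokenizer accepts it per call, as in the line above):

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', add_prefix_space=True)

ids = tokenizer.encode('This is a demo sentence .', add_special_tokens=True)
# with add_special_tokens=True the result starts with <s> (id 0) and ends with </s> (id 2),
# which is what add_cls_sep=True requests in index_datasets above
```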
@@ -555,5 +553,5 @@ def forward(self, word_pieces):
         return outputs
 
     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)