
Commit 0ca22b6

1. Fix typos in parts of the docs; 2. Add support for transformers embedding
1 parent 81bcb51 commit 0ca22b6

File tree

5 files changed: +559 −32 lines changed


fastNLP/embeddings/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,9 @@
     "RobertaEmbedding",
     "RobertaWordPieceEncoder",

+    "TransformersEmbedding",
+    "TransformersWordPieceEncoder",
+
     "GPT2Embedding",
     "GPT2WordPieceEncoder",

@@ -32,6 +35,7 @@
 from .elmo_embedding import ElmoEmbedding
 from .bert_embedding import BertEmbedding, BertWordPieceEncoder
 from .roberta_embedding import RobertaEmbedding, RobertaWordPieceEncoder
+from .transformers_embedding import TransformersEmbedding, TransformersWordPieceEncoder
 from .gpt2_embedding import GPT2WordPieceEncoder, GPT2Embedding
 from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding
 from .stack_embedding import StackEmbedding
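
The two new exports come from the added fastNLP/embeddings/transformers_embedding.py module, which is not shown in this excerpt. Below is a minimal usage sketch, assuming TransformersEmbedding follows the same word-level interface as BertEmbedding/RobertaEmbedding and wraps a HuggingFace transformers checkpoint; the constructor signature, the 'bert-base-uncased' checkpoint name, and the sample sentence are illustrative assumptions, not taken from this diff:

    import torch
    from fastNLP import Vocabulary
    from fastNLP.embeddings import TransformersEmbedding

    # Build a small word-level vocabulary; in practice this comes from your DataSet.
    vocab = Vocabulary()
    vocab.add_word_lst("this is an example sentence".split())

    # Assumed constructor, mirroring BertEmbedding(vocab, model_dir_or_name, layers);
    # check transformers_embedding.py for the actual signature.
    embed = TransformersEmbedding(vocab, model_dir_or_name='bert-base-uncased', layers='-1')

    words = torch.LongTensor([[vocab.to_index(w) for w in "this is an example".split()]])
    reps = embed(words)  # expected shape: batch_size x max_len x embedding_dim
    print(reps.shape)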

fastNLP/embeddings/roberta_embedding.py

Lines changed: 20 additions & 22 deletions
@@ -196,7 +196,7 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                  include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
         super().__init__()

-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # RobertaEmbedding sets padding_idx to 1 and computes positions in a rather peculiar way, hence the -2
         self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
@@ -233,14 +233,14 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                 word = '<unk>'
             elif vocab.word_count[word]<min_freq:
                 word = '<unk>'
-            word_pieces = self.tokenzier.tokenize(word)
-            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
+            word_pieces = self.tokenizer.tokenize(word)
+            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
             word_to_wordpieces.append(word_pieces)
             word_pieces_lengths.append(len(word_pieces))
-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
         self._word_pad_index = vocab.padding_idx
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>'] # needed for generating word_piece
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>'] # needed for generating word_piece
         self.word_to_wordpieces = np.array(word_to_wordpieces)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
@@ -352,20 +352,19 @@ def forward(self, words):
         return outputs

     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)


 class RobertaWordPieceEncoder(nn.Module):
     r"""
-    Load the bert model; after loading, call the index_dataset method to generate the word_pieces column in the dataset.
+    Load the roberta model; after loading, call the index_dataset method to generate the word_pieces column in the dataset.

     RobertaWordPieceEncoder supports automatic downloading of weights. Currently supported models:
         en: roberta-base
         en-large: roberta-large

     """
-
     def __init__(self, model_dir_or_name: str = 'en', layers: str = '-1', pooled_cls: bool = False,
                  word_dropout=0, dropout=0, requires_grad: bool = True, **kwargs):
         r"""
@@ -417,11 +416,10 @@ def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_spa
     def forward(self, word_pieces, token_type_ids=None):
         r"""
-        Compute the bert embedding representation of words. The words passed in should already contain the [CLS] and [SEP] tags.
+        Compute the bert embedding representation of words. The words passed in should already contain the <s> and </s> tags.

         :param words: batch_size x max_len
-        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it does not need to be provided);
-            everything up to and including the first [SEP] is 0, from there up to and including the second [SEP] is 1, from there up to and including the third [SEP] is 0, and so on.
+        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it does not need to be provided).
         :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
         """
         word_pieces = self.drop_word(word_pieces)
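
Taken together with the index_datasets docstring further below, the intended flow is: index the datasets once so that a word_pieces column exists, then feed batches of it to forward. A hypothetical end-to-end sketch; the DataSet contents and the 'raw_words' field name are made up for illustration:

    from fastNLP import DataSet
    from fastNLP.embeddings import RobertaWordPieceEncoder

    ds = DataSet({'raw_words': ['hello world', 'fastNLP supports roberta']})

    encoder = RobertaWordPieceEncoder(model_dir_or_name='en')  # 'en' auto-downloads roberta-base
    encoder.index_datasets(ds, field_name='raw_words', add_cls_sep=True, add_prefix_space=True)
    # ds now has a 'word_pieces' input field; a batch of it (batch_size x max_len)
    # passed to encoder(word_pieces) yields batch_size x max_len x (768*len(layers)) features.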
@@ -484,7 +482,7 @@ class _WordPieceRobertaModel(nn.Module):
     def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False):
         super().__init__()

-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # check that encoder_layer_number is valid
         encoder_layer_number = len(self.encoder.encoder.layer)
@@ -504,25 +502,25 @@ def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=
                 assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                     f"a RoBERTa model with {encoder_layer_number} layers."

-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>'] # needed for generating word_piece
-        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>'] # needed for generating word_piece
+        self._wordpiece_unknown_index = self.tokenizer.encoder['<unk>']
         self.pooled_cls = pooled_cls

     def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True):
         r"""
-        Use the bert tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the sequence does not
-        start and end with [CLS] and [SEP], [CLS] and [SEP] are added at the beginning and end, and the pad value of the word_pieces column is set to bert's pad value.
+        Use the roberta tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the sequence does not
+        start and end with <s> and </s>, <s> and </s> are added at the beginning and end, and the pad value of the word_pieces column is set to bert's pad value.

         :param datasets: DataSet objects
-        :param field_name: which column to index on
+        :param field_name: which column to index on; this column is usually raw_string
         :param bool add_cls_sep: whether to add the cls and sep indices at the beginning and end of the sentence
         :param bool add_prefix_space: whether to add a space at the beginning of the sentence; RoBERTa used True for this during pre-training
         :return:
         """

-        encode_func = partial(self.tokenzier.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)
+        encode_func = partial(self.tokenizer.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)

         for index, dataset in enumerate(datasets):
             try:
@@ -555,5 +553,5 @@ def forward(self, word_pieces):
         return outputs

     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)
