
Commit 9a9732a

Gosicflyyhcc authored and committed
Update vocabulary.py (#325)
strip should only remove '\n'; otherwise it also strips some special characters, which makes the length wrong when splitting.
token_type_id_rev (#329)
Fix the error that occurs when activation=lambda x: x (#330)
Co-authored-by: 路人咦 <1417954729@qq.com>
1. Fix some typos in the documentation;
2. Support transformers embeddings; update import
1 parent ba23045 commit 9a9732a

File tree

8 files changed: +565, -40 lines changed


fastNLP/core/vocabulary.py

Lines changed: 2 additions & 2 deletions

@@ -540,7 +540,7 @@ def load(filepath):
         vocab = Vocabulary()
         for line in f:
-            line = line.strip()
+            line = line.strip('\n')
             if line:
                 name, value = line.split()
                 if name in ('max_size', 'min_freq'):
@@ -557,7 +557,7 @@ def load(filepath):
         no_create_entry_counter = {}
         word2idx = {}
         for line in f:
-            line = line.strip()
+            line = line.strip('\n')
             if line:
                 parts = line.split('\t')
                 word, count, idx, no_create_entry = parts[0], int(parts[1]), int(parts[2]), int(parts[3])
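The commit message's rationale, as a minimal sketch (the example word '\u3000' is hypothetical, not taken from the diff): strip() removes all surrounding whitespace, so a vocabulary entry that is itself a whitespace character disappears and the tab-split yields too few fields, while strip('\n') only drops the trailing newline.

# A saved vocabulary line whose word is itself a whitespace character (hypothetical example)
line = "\u3000\t3\t42\t0\n"

# strip() also removes the word, so the tab-split yields 3 fields instead of 4
print(len(line.strip().split('\t')))      # 3 -> unpacking word, count, idx, no_create_entry fails

# strip('\n') only removes the newline and keeps the word intact
print(len(line.strip('\n').split('\t')))  # 4 -> unpacks correctly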

fastNLP/embeddings/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -16,6 +16,9 @@
     "RobertaEmbedding",
     "RobertaWordPieceEncoder",
 
+    "TransformersEmbedding",
+    "TransformersWordPieceEncoder",
+
     "GPT2Embedding",
     "GPT2WordPieceEncoder",
 
@@ -32,6 +35,7 @@
 from .elmo_embedding import ElmoEmbedding
 from .bert_embedding import BertEmbedding, BertWordPieceEncoder
 from .roberta_embedding import RobertaEmbedding, RobertaWordPieceEncoder
+from .transformers_embedding import TransformersEmbedding, TransformersWordPieceEncoder
 from .gpt2_embedding import GPT2WordPieceEncoder, GPT2Embedding
 from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding
 from .stack_embedding import StackEmbedding
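A hedged usage sketch for the newly exported names. Only the import path and the two class names come from this diff; the constructor arguments in the commented line are an assumption, not something this commit documents.

from fastNLP import Vocabulary
from fastNLP.embeddings import TransformersEmbedding, TransformersWordPieceEncoder

vocab = Vocabulary()
vocab.add_word_lst("this is a test .".split())

# Hypothetical constructor call -- check fastNLP/embeddings/transformers_embedding.py for the real signature.
# embed = TransformersEmbedding(vocab, model_dir_or_name='bert-base-uncased')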

fastNLP/embeddings/bert_embedding.py

Lines changed: 2 additions & 4 deletions

@@ -294,8 +294,7 @@ def forward(self, word_pieces, token_type_ids=None):
             sep_mask = word_pieces.eq(self._sep_index)  # batch_size x max_len
             sep_mask_cumsum = sep_mask.long().flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
             token_type_ids = sep_mask_cumsum.fmod(2)
-            if token_type_ids[0, 0].item():  # if the first value is odd, flip the result, because the sequence must start with 0
-                token_type_ids = token_type_ids.eq(0).long()
+            token_type_ids = token_type_ids[:, :1].__xor__(token_type_ids)  # if a row starts with an odd value, flip that row, because every sequence must start with 0
 
         word_pieces = self.drop_word(word_pieces)
         outputs = self.model(word_pieces, token_type_ids)
@@ -465,8 +464,7 @@ def forward(self, words):
                 sep_mask = word_pieces.eq(self._sep_index).long()  # batch_size x max_len
                 sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
                 token_type_ids = sep_mask_cumsum.fmod(2)
-                if token_type_ids[0, 0].item():  # if the first value is odd, flip the result, because the sequence must start with 0
-                    token_type_ids = token_type_ids.eq(0).long()
+                token_type_ids = token_type_ids[:, :1].__xor__(token_type_ids)  # if a row starts with an odd value, flip that row, because every sequence must start with 0
             else:
                 token_type_ids = torch.zeros_like(word_pieces)
             # 2. get the hidden results and pool them according to word_pieces
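A small PyTorch sketch (not part of the diff) of why the new expression is a per-sample fix: the old code inspected only element [0, 0] and flipped the whole batch, so a batch mixing rows that start with 0 and rows that start with 1 got wrong token_type_ids for some rows; XOR-ing each row with its own first column flips exactly the rows that need it.

import torch

# token_type_ids as produced by fmod(2): row 0 already starts with 0, row 1 starts with 1
token_type_ids = torch.tensor([[0, 0, 1, 1],
                               [1, 1, 0, 0]])

# XOR each row with its own first element (broadcast over the last dimension)
fixed = token_type_ids[:, :1] ^ token_type_ids
print(fixed)
# tensor([[0, 0, 1, 1],
#         [0, 0, 1, 1]]) -> every row now starts with 0, as BERT expects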

fastNLP/embeddings/roberta_embedding.py

Lines changed: 20 additions & 22 deletions

@@ -196,7 +196,7 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                  include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
         super().__init__()
 
-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # RobertaEmbedding sets padding_idx to 1 and uses a rather unusual position computation, hence the -2
         self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
@@ -233,14 +233,14 @@ def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1'
                 word = '<unk>'
             elif vocab.word_count[word]<min_freq:
                 word = '<unk>'
-            word_pieces = self.tokenzier.tokenize(word)
-            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
+            word_pieces = self.tokenizer.tokenize(word)
+            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
             word_to_wordpieces.append(word_pieces)
             word_pieces_lengths.append(len(word_pieces))
-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
         self._word_pad_index = vocab.padding_idx
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']  # needed when generating word_pieces
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # needed when generating word_pieces
         self.word_to_wordpieces = np.array(word_to_wordpieces)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
@@ -352,20 +352,19 @@ def forward(self, words):
         return outputs
 
     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)
 
 
 class RobertaWordPieceEncoder(nn.Module):
     r"""
-    Load a bert model; after loading, call index_dataset to generate the word_pieces column in the dataset.
+    Load a roberta model; after loading, call index_dataset to generate the word_pieces column in the dataset.
 
     RobertaWordPieceEncoder supports automatic weight download; currently supported models:
     en: roberta-base
     en-large: roberta-large
 
     """
-
     def __init__(self, model_dir_or_name: str = 'en', layers: str = '-1', pooled_cls: bool = False,
                  word_dropout=0, dropout=0, requires_grad: bool = True, **kwargs):
         r"""
@@ -417,11 +416,10 @@ def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_spa
 
     def forward(self, word_pieces, token_type_ids=None):
         r"""
-        Compute the bert embedding for words. The words passed in should already contain the [CLS] and [SEP] tags.
+        Compute the bert embedding for words. The words passed in should already contain the <s> and </s> tags.
 
         :param words: batch_size x max_len
-        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it is not needed):
-            0 up to and including the first [SEP], 1 from there up to the second [SEP], 0 again up to the third [SEP], and so on.
+        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second. If not passed in, it is generated automatically (in most cases it is not needed).
         :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
         """
         word_pieces = self.drop_word(word_pieces)
@@ -484,7 +482,7 @@ class _WordPieceRobertaModel(nn.Module):
     def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False):
         super().__init__()
 
-        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
+        self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name)
         self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
         # check that encoder_layer_number is valid
         encoder_layer_number = len(self.encoder.encoder.layer)
@@ -504,25 +502,25 @@ def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=
             assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a RoBERTa model with {encoder_layer_number} layers."
 
-        self._cls_index = self.tokenzier.encoder['<s>']
-        self._sep_index = self.tokenzier.encoder['</s>']
-        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']  # needed when generating word_pieces
-        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
+        self._cls_index = self.tokenizer.encoder['<s>']
+        self._sep_index = self.tokenizer.encoder['</s>']
+        self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # needed when generating word_pieces
+        self._wordpiece_unknown_index = self.tokenizer.encoder['<unk>']
         self.pooled_cls = pooled_cls
 
     def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True):
         r"""
-        Use bert's tokenizer to generate a new word_pieces column, add it to the datasets, and set it as input. If the sequence does not
-        start with [CLS] and end with [SEP], they are added at the ends, and the pad value of the word_pieces column is set to bert's pad value.
+        Use roberta's tokenizer to generate a new word_pieces column, add it to the datasets, and set it as input. If the sequence does not
+        start with <s> and end with </s>, they are added at the ends, and the pad value of the word_pieces column is set to bert's pad value.
 
         :param datasets: DataSet objects
-        :param field_name: which column to index on
+        :param field_name: which column to index on; this column is usually raw_string
         :param bool add_cls_sep: whether to add the cls and sep indices at the start and end of each sentence
         :param bool add_prefix_space: whether to add a space at the start of the sentence; RoBERTa uses True during pretraining
         :return:
         """
 
-        encode_func = partial(self.tokenzier.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)
+        encode_func = partial(self.tokenizer.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space)
 
         for index, dataset in enumerate(datasets):
             try:
@@ -555,5 +553,5 @@ def forward(self, word_pieces):
         return outputs
 
     def save(self, folder):
-        self.tokenzier.save_pretrained(folder)
+        self.tokenizer.save_pretrained(folder)
         self.encoder.save_pretrained(folder)
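The remaining changes rename the misspelled self.tokenzier attribute to self.tokenizer throughout; save() still writes the tokenizer and the encoder side by side. A hedged sketch of that round trip using the Hugging Face transformers classes of the same name (fastNLP may bundle its own implementations, and the folder path is hypothetical):

from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoder = RobertaModel.from_pretrained('roberta-base')

# save_pretrained writes the tokenizer files and the model weights/config into one folder...
tokenizer.save_pretrained('./saved_roberta')   # hypothetical folder
encoder.save_pretrained('./saved_roberta')

# ...so both can later be restored from that same folder with from_pretrained.
tokenizer = RobertaTokenizer.from_pretrained('./saved_roberta')
encoder = RobertaModel.from_pretrained('./saved_roberta')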
