Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 35db1a5

Browse files
committed
en-vi: add IWSLT'15 English-Vietnamese as new problem
1 parent b929e30 commit 35db1a5

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# coding=utf-8
2+
# Copyright 2018 The Tensor2Tensor Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Data generators for the IWSLT'15 English-Vietnamese translation data-set."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
# Dependency imports
23+
24+
from tensor2tensor.data_generators import problem
25+
from tensor2tensor.data_generators import text_encoder
26+
from tensor2tensor.data_generators import translate
27+
from tensor2tensor.utils import registry
28+
29+
import tensorflow as tf
30+
31+
# Module-level handle to the flag values exposed by tf.flags.
FLAGS = tf.flags.FLAGS

# End-of-sentence marker.
EOS = text_encoder.EOS_ID
35+
36+
# For English-Vietnamese the IWSLT'15 corpus
# from https://nlp.stanford.edu/projects/nmt/ is used.
# The original dataset has 133K parallel sentences.
# Each entry is [archive URL, (English file name, Vietnamese file name)].
_ENVI_TRAIN_DATASETS = [[
    "https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz",  # pylint: disable=line-too-long
    ("train.en", "train.vi")
]]

# For development 1,553 parallel sentences (the tst2012 files) are used.
_ENVI_TEST_DATASETS = [[
    "https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz",  # pylint: disable=line-too-long
    ("tst2012.en", "tst2012.vi")
]]
49+
50+
51+
@registry.register_problem
class TranslateEnviIwslt32k(translate.TranslateProblem):
  """Problem spec for IWSLT'15 En-Vi translation.

  Declares an approximate vocabulary size of 2**15 and selects its data
  sources from the module-level _ENVI_TRAIN_DATASETS and
  _ENVI_TEST_DATASETS lists.
  """

  @property
  def approx_vocab_size(self):
    """Approximate vocabulary size for this problem."""
    return 2**15  # 32768

  @property
  def vocab_filename(self):
    """Vocabulary file name, suffixed with the approximate vocab size."""
    return "vocab.envi.%d" % self.approx_vocab_size

  def source_data_files(self, dataset_split):
    """Returns the dataset list to use for `dataset_split`.

    Args:
      dataset_split: split identifier; it is compared against
        problem.DatasetSplit.TRAIN.

    Returns:
      _ENVI_TRAIN_DATASETS for the training split, and _ENVI_TEST_DATASETS
      for any other split.
    """
    train = dataset_split == problem.DatasetSplit.TRAIN
    return _ENVI_TRAIN_DATASETS if train else _ENVI_TEST_DATASETS

0 commit comments

Comments
 (0)