en-vi: add IWSLT'15 English-Vietnamese as new problem

stefan-it · stefan-it · commit 35db1a5e5f62 · 2018-02-23T02:01:22.000+01:00
diff --git a/tensor2tensor/data_generators/translate_envi.py b/tensor2tensor/data_generators/translate_envi.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data generators for translation data-sets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.data_generators import translate
+from tensor2tensor.utils import registry
+
+import tensorflow as tf
+
+FLAGS = tf.flags.FLAGS
+
+# End-of-sentence marker.
+EOS = text_encoder.EOS_ID
+
+# For English-Vietnamese the IWSLT'15 corpus
+# from https://nlp.stanford.edu/projects/nmt/ is used.
+# The original dataset has 133K parallel sentences.
+_ENVI_TRAIN_DATASETS = [[
+    "https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz",  # pylint: disable=line-too-long
+    ("train.en", "train.vi")
+]]
+
+# For development 1,553 parallel sentences are used.
+_ENVI_TEST_DATASETS = [[
+    "https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz",  # pylint: disable=line-too-long
+    ("tst2012.en", "tst2012.vi")
+]]
+
+
+@registry.register_problem
+class TranslateEnviIwslt32k(translate.TranslateProblem):
+  """Problem spec for IWSLT'15 En-Vi translation."""
+
+  @property
+  def approx_vocab_size(self):
+    return 2**15  # 32768
+
+  @property
+  def vocab_filename(self):
+    return "vocab.envi.%d" % self.approx_vocab_size
+
+  def source_data_files(self, dataset_split):
+    train = dataset_split == problem.DatasetSplit.TRAIN
+    return _ENVI_TRAIN_DATASETS if train else _ENVI_TEST_DATASETS