tensorflow
diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen
Lines changed: 52 additions & 42 deletions
diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py
Lines changed: 59 additions & 12 deletions
diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py
Lines changed: 31 additions & 0 deletions
@@ -24,6 +24,9 @@ takes 2 arguments - input_directory and mode (one of "train" or "dev") - and
 yields for each training example a dictionary mapping string feature names to
 lists of {string, int, float}. The generator will be run once for each mode.
 """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
 import random
 import tempfile
@@ -34,6 +37,7 @@ import numpy as np
 
 from tensor2tensor.data_generators import algorithmic
 from tensor2tensor.data_generators import algorithmic_math
+from tensor2tensor.data_generators import all_problems  # pylint: disable=unused-import
 from tensor2tensor.data_generators import audio
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import image
@@ -43,6 +47,7 @@ from tensor2tensor.data_generators import snli
 from tensor2tensor.data_generators import wiki
 from tensor2tensor.data_generators import wmt
 from tensor2tensor.data_generators import wsj_parsing
+from tensor2tensor.utils import registry
 
 import tensorflow as tf
 
@@ -62,12 +67,6 @@ flags.DEFINE_integer("random_seed", 429459, "Random seed to use.")
 # Mapping from problems that we can generate data for to their generators.
 # pylint: disable=g-long-lambda
 _SUPPORTED_PROBLEM_GENERATORS = {
-    "algorithmic_identity_binary40": (
-        lambda: algorithmic.identity_generator(2, 40, 100000),
-        lambda: algorithmic.identity_generator(2, 400, 10000)),
-    "algorithmic_identity_decimal40": (
-        lambda: algorithmic.identity_generator(10, 40, 100000),
-        lambda: algorithmic.identity_generator(10, 400, 10000)),
     "algorithmic_shift_decimal40": (
         lambda: algorithmic.shift_generator(20, 10, 40, 100000),
         lambda: algorithmic.shift_generator(20, 10, 80, 10000)),
@@ -294,8 +293,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
 
 # pylint: enable=g-long-lambda
 
-UNSHUFFLED_SUFFIX = "-unshuffled"
-
 
 def set_random_seed():
   """Set the random seed from flag everywhere."""
@@ -308,13 +305,15 @@ def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
 
   # Calculate the list of problems to generate.
-  problems = list(sorted(_SUPPORTED_PROBLEM_GENERATORS))
+  problems = sorted(
+      list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems())
   if FLAGS.problem and FLAGS.problem[-1] == "*":
     problems = [p for p in problems if p.startswith(FLAGS.problem[:-1])]
   elif FLAGS.problem:
     problems = [p for p in problems if p == FLAGS.problem]
   else:
     problems = []
+
   # Remove TIMIT if paths are not given.
   if not FLAGS.timit_paths:
     problems = [p for p in problems if "timit" not in p]
@@ -326,7 +325,8 @@ def main(_):
     problems = [p for p in problems if "ende_bpe" not in p]
 
   if not problems:
-    problems_str = "\n  * ".join(sorted(_SUPPORTED_PROBLEM_GENERATORS))
+    problems_str = "\n  * ".join(
+        sorted(list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems()))
     error_msg = ("You must specify one of the supported problems to "
                  "generate data for:\n  * " + problems_str + "\n")
     error_msg += ("TIMIT, ende_bpe and parsing need data_sets specified with "
@@ -343,40 +343,50 @@ def main(_):
   for problem in problems:
     set_random_seed()
 
-    training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]
-
-    if isinstance(dev_gen, int):
-      # The dev set and test sets are generated as extra shards using the
-      # training generator.  The integer specifies the number of training
-      # shards.  FLAGS.num_shards is ignored.
-      num_training_shards = dev_gen
-      tf.logging.info("Generating data for %s.", problem)
-      all_output_files = generator_utils.combined_data_filenames(
-          problem + UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_training_shards)
-      generator_utils.generate_files(
-          training_gen(), all_output_files, FLAGS.max_cases)
+    if problem in _SUPPORTED_PROBLEM_GENERATORS:
+      generate_data_for_problem(problem)
     else:
-      # usual case - train data and dev data are generated using separate
-      # generators.
-      tf.logging.info("Generating training data for %s.", problem)
-      train_output_files = generator_utils.train_data_filenames(
-          problem + UNSHUFFLED_SUFFIX, FLAGS.data_dir, FLAGS.num_shards)
-      generator_utils.generate_files(
-          training_gen(), train_output_files, FLAGS.max_cases)
-      tf.logging.info("Generating development data for %s.", problem)
-      dev_shards = 10 if "coco" in problem else 1
-      dev_output_files = generator_utils.dev_data_filenames(
-          problem + UNSHUFFLED_SUFFIX, FLAGS.data_dir, dev_shards)
-      generator_utils.generate_files(dev_gen(), dev_output_files)
-      all_output_files = train_output_files + dev_output_files
+      generate_data_for_registered_problem(problem)
+
+
+def generate_data_for_problem(problem):
+  """Generates and shuffles data for a _SUPPORTED_PROBLEM_GENERATORS entry."""
+  train_fn, dev_spec = _SUPPORTED_PROBLEM_GENERATORS[problem]
+  # Output first goes to files named with the unshuffled suffix; those files
+  # are handed to generator_utils.shuffle_dataset at the end.
+  raw_name = problem + generator_utils.UNSHUFFLED_SUFFIX
+
+  if not isinstance(dev_spec, int):
+    # Usual case: train and dev data come from two separate generators.
+    tf.logging.info("Generating training data for %s.", problem)
+    train_files = generator_utils.train_data_filenames(
+        raw_name, FLAGS.data_dir, FLAGS.num_shards)
+    generator_utils.generate_files(train_fn(), train_files, FLAGS.max_cases)
+    tf.logging.info("Generating development data for %s.", problem)
+    num_dev_shards = 1
+    if "coco" in problem:
+      num_dev_shards = 10
+    dev_files = generator_utils.dev_data_filenames(
+        raw_name, FLAGS.data_dir, num_dev_shards)
+    generator_utils.generate_files(dev_spec(), dev_files)
+    files_to_shuffle = train_files + dev_files
+  else:
+    # An integer second entry is the number of training shards: dev and test
+    # sets are generated as extra shards using the training generator, and
+    # FLAGS.num_shards is ignored.
+    tf.logging.info("Generating data for %s.", problem)
+    files_to_shuffle = generator_utils.combined_data_filenames(
+        raw_name, FLAGS.data_dir, dev_spec)
+    generator_utils.generate_files(train_fn(), files_to_shuffle,
+                                   FLAGS.max_cases)
+
+  tf.logging.info("Shuffling data...")
+  generator_utils.shuffle_dataset(files_to_shuffle)
+
 
-    tf.logging.info("Shuffling data...")
-    for fname in all_output_files:
-      records = generator_utils.read_records(fname)
-      random.shuffle(records)
-      out_fname = fname.replace(UNSHUFFLED_SUFFIX, "")
-      generator_utils.write_records(records, out_fname)
-      tf.gfile.Remove(fname)
+def generate_data_for_registered_problem(problem_name):
+  """Looks up problem_name in the registry and generates its data."""
+  registry.problem(problem_name).generate_data(FLAGS.data_dir)
 
 
 if __name__ == "__main__":
 
@@ -23,12 +23,50 @@
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensor2tensor.data_generators import generator_utils as utils
+from tensor2tensor.data_generators import problem
+from tensor2tensor.utils import registry
+
+
+@registry.register_problem
+class AlgorithmicIdentityBinary40(problem.Problem):
+  """Binary identity (copy) task registered as a Problem."""
+
+  @property
+  def num_symbols(self):
+    return 2  # Binary alphabet; also feeds the vocab size in hparams.
+
+  def generate_data(self, data_dir):
+    """Generates train and dev files for data_dir via identity_generator."""
+    train_cases = identity_generator(self.num_symbols, 40, 100000)
+    utils.generate_files(train_cases, self.training_filepaths(data_dir, 100))
+    dev_cases = identity_generator(self.num_symbols, 400, 10000)
+    utils.generate_files(dev_cases, self.dev_filepaths(data_dir, 1))
+
+  def hparams(self, defaults, unused_model_hparams):
+    """Sets symbol modalities and space ids on `defaults`, in place."""
+    vocab_size = self.num_symbols + self._encoders["inputs"].num_reserved_ids
+    modality = registry.Modalities.SYMBOL
+    defaults.input_modality = {"inputs": (modality, vocab_size)}
+    defaults.target_modality = (modality, vocab_size)
+    defaults.input_space_id = problem.SpaceID.DIGIT_0
+    defaults.target_space_id = problem.SpaceID.DIGIT_1
+
+
+@registry.register_problem
+class AlgorithmicIdentityDecimal40(AlgorithmicIdentityBinary40):
+  """Problem spec for algorithmic decimal identity task."""
+
+  @property
+  def num_symbols(self):
+    return 10  # Ten symbols; all else is inherited from the binary spec.
+
 
 def identity_generator(nbr_symbols, max_length, nbr_cases):
   """Generator for the identity (copy) task on sequences of symbols.
 
   The length of the sequence is drawn uniformly at random from [1, max_length]
-  and then symbols are drawn uniformly at random from [2, nbr_symbols] until
+  and then symbols are drawn uniformly at random from [2, nbr_symbols + 2) until
   nbr_cases sequences have been produced.
 
   Args:
@@ -66,8 +104,10 @@ def shift_generator(nbr_symbols, shift, max_length, nbr_cases):
   for _ in xrange(nbr_cases):
     l = np.random.randint(max_length) + 1
     inputs = [np.random.randint(nbr_symbols - shift) + 2 for _ in xrange(l)]
-    yield {"inputs": inputs,
-           "targets": [i + shift for i in inputs] + [1]}  # [1] for EOS
+    yield {
+        "inputs": inputs,
+        "targets": [i + shift for i in inputs] + [1]
+    }  # [1] for EOS
 
 
 def reverse_generator(nbr_symbols, max_length, nbr_cases):
@@ -89,8 +129,10 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
   for _ in xrange(nbr_cases):
     l = np.random.randint(max_length) + 1
     inputs = [np.random.randint(nbr_symbols) + 2 for _ in xrange(l)]
-    yield {"inputs": inputs,
-           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
+    yield {
+        "inputs": inputs,
+        "targets": list(reversed(inputs)) + [1]
+    }  # [1] for EOS
 
 
 def zipf_distribution(nbr_symbols, alpha):
@@ -106,7 +148,7 @@ def zipf_distribution(nbr_symbols, alpha):
     distr_map: list of float, Zipf's distribution over nbr_symbols.
 
   """
-  tmp = np.power(np.arange(1, nbr_symbols+1), -alpha)
+  tmp = np.power(np.arange(1, nbr_symbols + 1), -alpha)
   zeta = np.r_[0.0, np.cumsum(tmp)]
   return [x / zeta[-1] for x in zeta]
 
@@ -128,11 +170,14 @@ def zipf_random_sample(distr_map, sample_len):
   # we have made a sanity check to overcome this issue. On the other hand,
   # t+1 is enough from saving us to generate PAD(0) and EOS(1) which are
   # reservated symbols.
-  return [t+1 if t > 0 else t+2 for t in np.searchsorted(distr_map, u)]
+  return [t + 1 if t > 0 else t + 2 for t in np.searchsorted(distr_map, u)]
 
 
-def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
-                              scale_std_dev=100, alpha=1.5):
+def reverse_generator_nlplike(nbr_symbols,
+                              max_length,
+                              nbr_cases,
+                              scale_std_dev=100,
+                              alpha=1.5):
   """Generator for the reversing nlp-like task on sequences of symbols.
 
   The length of the sequence is drawn from a Gaussian(Normal) distribution
@@ -157,10 +202,12 @@ def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
   std_dev = max_length / scale_std_dev
   distr_map = zipf_distribution(nbr_symbols, alpha)
   for _ in xrange(nbr_cases):
-    l = int(abs(np.random.normal(loc=max_length/2, scale=std_dev)) + 1)
+    l = int(abs(np.random.normal(loc=max_length / 2, scale=std_dev)) + 1)
     inputs = zipf_random_sample(distr_map, l)
-    yield {"inputs": inputs,
-           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
+    yield {
+        "inputs": inputs,
+        "targets": list(reversed(inputs)) + [1]
+    }  # [1] for EOS
 
 
 def lower_endian_to_number(l, base):
 
@@ -0,0 +1,31 @@
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Imports for problem modules."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensor2tensor.data_generators import algorithmic
+from tensor2tensor.data_generators import algorithmic_math
+from tensor2tensor.data_generators import audio
+from tensor2tensor.data_generators import image
+from tensor2tensor.data_generators import lm1b
+from tensor2tensor.data_generators import ptb
+from tensor2tensor.data_generators import snli
+from tensor2tensor.data_generators import wiki
+from tensor2tensor.data_generators import wmt
+from tensor2tensor.data_generators import wsj_parsing
+# pylint: enable=unused-import