Merge pull request #57 from ReDeiPirati/algorithmic_reverse_nlplike_2

lukaszkaiser · web-flow · commit a2a617880121 · 2017-06-29T11:32:22.000-07:00
algorithmic_reverse_nlplike generator
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 # Compiled python modules.
 *.pyc
+# Byte-compiled cache directories.
+__pycache__/
 
 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen
@@ -87,6 +87,16 @@ _SUPPORTED_PROBLEM_GENERATORS = {
     "algorithmic_multiplication_decimal40": (
         lambda: algorithmic.multiplication_generator(10, 40, 100000),
         lambda: algorithmic.multiplication_generator(10, 400, 10000)),
+    "algorithmic_reverse_nlplike_decimal8K": (
+        lambda: algorithmic.reverse_generator_nlplike(8000, 70, 100000,
+                                                      10, 1.300),
+        lambda: algorithmic.reverse_generator_nlplike(8000, 700, 10000,
+                                                      10, 1.300)),
+    "algorithmic_reverse_nlplike_decimal32K": (
+        lambda: algorithmic.reverse_generator_nlplike(32000, 70, 100000,
+                                                      10, 1.050),
+        lambda: algorithmic.reverse_generator_nlplike(32000, 700, 10000,
+                                                      10, 1.050)),
     "algorithmic_algebra_inverse": (
         lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
         lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),
diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer
diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py
@@ -93,6 +93,75 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
            "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
 
 
+def zipf_distribution(nbr_symbols, alpha):
+  """Helper function: Create a Zipf distribution.
+
+  Args:
+    nbr_symbols: number of symbols to use in the distribution.
+    alpha: float, Zipf's Law Distribution parameter (the exponent).
+      For modelling natural text distributions it is usually in
+      the range [1.1-1.6].
+
+  Returns:
+    distr_map: list of nbr_symbols + 1 floats, the Zipf CDF from 0.0 to 1.0.
+
+  """
+  tmp = np.power(np.arange(1, nbr_symbols+1), -alpha)
+  zeta = np.r_[0.0, np.cumsum(tmp)]
+  return [x / zeta[-1] for x in zeta]
+
+
+def zipf_random_sample(distr_map, sample_len):
+  """Helper function: Generate a random Zipf sample of given length.
+
+  Args:
+    distr_map: list of float, cumulative Zipf distribution from 0.0 to 1.0.
+    sample_len: integer, length of sequence to generate.
+
+  Returns:
+    sample: list of sample_len integers, each in [2, len(distr_map)].
+
+  """
+  u = np.random.random(sample_len)
+  # np.random.random draws from [0.0, 1.0). Drawing exactly 0.0 is very
+  # unlikely but possible; searchsorted then returns index 0, so that case
+  # is mapped to t+2 instead. For every other draw the index is >= 1, and
+  # t+1 is enough to avoid generating PAD(0) and EOS(1), which are
+  # reserved symbols.
+  return [t+1 if t > 0 else t+2 for t in np.searchsorted(distr_map, u)]
+
+
+def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases, \
+  scale_std_dev=100, alpha=1.5):
+  """Generator for the reversing nlp-like task on sequences of symbols.
+
+  The length of each sequence is drawn from a Gaussian (Normal) distribution
+  with mean max_length / 2 and std deviation max_length / scale_std_dev; it is
+  at least 1 but is not clipped to max_length. Symbols are then drawn from
+  Zipf's law over [2, nbr_symbols + 1] until nbr_cases sequences are produced.
+
+  Args:
+    nbr_symbols: integer, number of symbols.
+    max_length: integer; sequence lengths are centered on max_length / 2.
+    nbr_cases: the number of cases to generate.
+    scale_std_dev: float, std deviation of the length is max_length divided
+      by this factor. Default = 100, i.e. 1% of max_length.
+    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
+      Usually in the range [1.1-1.6] for modelling natural text.
+
+  Yields:
+    A dictionary {"inputs": input-list, "targets": target-list} where
+    target-list is input-list reversed.
+  """
+  std_dev = max_length / scale_std_dev
+  distr_map = zipf_distribution(nbr_symbols, alpha)
+  for _ in xrange(nbr_cases):
+    l = int(abs(np.random.normal(loc=max_length/2, scale=std_dev)) + 1)
+    inputs = zipf_random_sample(distr_map, l)
+    yield {"inputs": inputs,
+           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
+
+
 def lower_endian_to_number(l, base):
   """Helper function: convert a list of digits in the given base to a number."""
   return sum([d * (base**i) for i, d in enumerate(l)])
diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py
@@ -41,6 +41,22 @@ def testReverseGenerator(self):
       self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
     self.assertEqual(counter, 10)
 
+  def  testZipfDistribution(self):
+    # Following Zipf's Law with alpha equal to 1: the first in rank is twice
+    # as probable/frequent as the second in rank, three times as probable as
+    # the third in rank, and so on.
+    d = algorithmic.zipf_distribution(10, 1.0001)
+    for i in xrange(len(d[1:])-1):
+      self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \
+       "%.4f" % d[1])
+
+  def testReverseGeneratorNlpLike(self):
+    counter = 0
+    for d in algorithmic.reverse_generator_nlplike(3, 8, 10):
+      counter += 1
+      self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
+    self.assertEqual(counter, 10)
+
   def testLowerEndianToNumber(self):
     self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0)
     self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0)
diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py
@@ -682,6 +682,8 @@ def image_mscoco_tokens(model_hparams, vocab_count):
     "algorithmic_multiplication_decimal40": lambda p: algorithmic(12, p),
     "algorithmic_reverse_binary40": lambda p: algorithmic(4, p),
     "algorithmic_reverse_decimal40": lambda p: algorithmic(12, p),
+    "algorithmic_reverse_nlplike_decimal8K": lambda p: algorithmic(8002, p),
+    "algorithmic_reverse_nlplike_decimal32K": lambda p: algorithmic(32002, p),
     "algorithmic_shift_decimal40": lambda p: algorithmic(22, p),
     "audio_timit_characters_tune": audio_timit_characters,
     "audio_timit_characters_test": audio_timit_characters,