Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 79f309a

Browse files
committed
Add new generator: algorithmic_reverse_nlplike that generates samples following Zipf's Law
1 parent 38b9c11 commit 79f309a

File tree

3 files changed

+90
-0
lines changed

3 files changed

+90
-0
lines changed

tensor2tensor/bin/t2t-datagen

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@ _SUPPORTED_PROBLEM_GENERATORS = {
8686
"algorithmic_multiplication_decimal40": (
8787
lambda: algorithmic.multiplication_generator(10, 40, 100000),
8888
lambda: algorithmic.multiplication_generator(10, 400, 10000)),
89+
"algorithmic_reverse_nlplike_decimal8K": (
90+
lambda: algorithmic.reverse_generator_nlplike(8000, 40, 100000,
91+
10, 1.250),
92+
lambda: algorithmic.reverse_generator_nlplike(8000, 400, 10000,
93+
10, 1.250)),
94+
"algorithmic_reverse_nlplike_decimal32K": (
95+
lambda: algorithmic.reverse_generator_nlplike(32000, 40, 100000,
96+
10, 1.005),
97+
lambda: algorithmic.reverse_generator_nlplike(32000, 400, 10000,
98+
10, 1.005)),
8999
"algorithmic_algebra_inverse": (
90100
lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
91101
lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),

tensor2tensor/data_generators/algorithmic.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,70 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
9393
"targets": list(reversed(inputs)) + [1]} # [1] for EOS
9494

9595

96+
def zipf_distribution(nbr_symbols, alpha):
  """Helper function: Create a Zipf cumulative distribution map.

  The returned list has nbr_symbols + 1 entries: a leading 0.0 followed by
  the normalized cumulative sums of rank**(-alpha) for ranks 1..nbr_symbols,
  ending at exactly 1.0.  The leading 0.0 and the cumulative form make the
  list directly usable with np.searchsorted for inverse-CDF sampling (see
  zipf_random_sample).

  Args:
    nbr_symbols: number of symbols to use in the distribution.
    alpha: float, Zipf's Law Distribution parameter. Usually for modelling
        natural text the value is in the range [1.1-1.6].

  Returns:
    distr_map: list of nbr_symbols + 1 floats, the cumulative Zipf
        distribution over the symbols.
  """
  # Unnormalized Zipf weights: 1**-alpha, 2**-alpha, ..., nbr_symbols**-alpha.
  tmp = np.power(np.arange(1, nbr_symbols + 1), -alpha)
  # Prepend 0.0 and accumulate, then normalize so the last entry is 1.0.
  zeta = np.r_[0.0, np.cumsum(tmp)]
  return [x / zeta[-1] for x in zeta]
112+
113+
114+
def zipf_random_sample(distr_map, sample_len):
  """Helper function: Generate a random Zipf sample of given length.

  Args:
    distr_map: list of float, cumulative Zipf distribution over the symbols
        as produced by zipf_distribution (leading 0.0, ending at 1.0).
    sample_len: integer, length of sequence to generate.

  Returns:
    sample: list of integer, Zipf's random sample over nbr_symbols.
  """
  u = np.random.random(sample_len)
  # Inverse-CDF sampling: searchsorted maps each uniform draw to its rank.
  # side='right' guarantees an index >= 1 even when a draw is exactly 0.0
  # (np.random.random samples from [0, 1)), so after the +1 shift no sample
  # can collide with the reserved ids 0 (PAD) and 1 (EOS).
  return [t + 1 for t in np.searchsorted(distr_map, u, side="right")]
127+
128+
129+
def reverse_generator_nlplike(nbr_symbols,
                              max_length,
                              nbr_cases,
                              scale_std_dev=100,
                              alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of each sequence is drawn from a Gaussian (Normal) distribution
  centered on max_length / 2 with standard deviation max_length /
  scale_std_dev (lengths are not clamped at max_length), then symbols are
  drawn from Zipf's law at random from [2, nbr_symbols + 1] until nbr_cases
  sequences have been produced.

  Args:
    nbr_symbols: integer, number of distinct symbols to sample from.
    max_length: integer, mean sequence length is max_length / 2.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, Normal distribution's standard deviation scale
        factor used to draw the length of sequence. Default = 1% of the
        max_length.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
        Usually for modelling natural text distribution is in
        the range [1.1-1.6].

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed.
  """
  std_dev = max_length / scale_std_dev
  # Compute the Zipf CDF once; every generated case reuses it.
  distr_map = zipf_distribution(nbr_symbols, alpha)
  for _ in range(nbr_cases):
    # abs() folds negative draws back to positive and + 1 guarantees a
    # minimum length of 1.
    seq_len = int(abs(np.random.normal(loc=max_length/2, scale=std_dev)) + 1)
    inputs = zipf_random_sample(distr_map, seq_len)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
158+
159+
96160
def lower_endian_to_number(l, base):
97161
"""Helper function: convert a list of digits in the given base to a number."""
98162
return sum([d * (base**i) for i, d in enumerate(l)])

tensor2tensor/data_generators/algorithmic_test.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,22 @@ def testReverseGenerator(self):
4141
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
4242
self.assertEqual(counter, 10)
4343

44+
def testZipfDistribution(self):
45+
# Following Zipf's Law with alpha equals 1: the first in rank is two times
46+
# more probable/frequent that the second in rank, three times more prob/freq
47+
# that the third in rank and so on.
48+
d = algorithmic.zipf_distribution(10, 1.0001)
49+
for i in xrange(len(d[1:])-1):
50+
self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \
51+
"%.4f" % d[1])
52+
53+
def testReverseGeneratorNlpLike(self):
54+
counter = 0
55+
for d in algorithmic.reverse_generator_nlplike(3, 8, 10):
56+
counter += 1
57+
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
58+
self.assertEqual(counter, 10)
59+
4460
def testLowerEndianToNumber(self):
4561
self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0)
4662
self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0)

0 commit comments

Comments
 (0)