Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit a2a6178

Browse files
authored
Merge pull request #57 from ReDeiPirati/algorithmic_reverse_nlplike_2
algorithmic_reverse_nlplike generator
2 parents 06df1d4 + 31f5dfa commit a2a6178

File tree

6 files changed

+99
-0
lines changed

6 files changed

+99
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Compiled python modules.
22
*.pyc
3+
# Byte-compiled
4+
__pycache__/
35

46
# Python egg metadata, regenerated from source files by setuptools.
57
/*.egg-info

tensor2tensor/bin/t2t-datagen

100644100755
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,16 @@ _SUPPORTED_PROBLEM_GENERATORS = {
8787
"algorithmic_multiplication_decimal40": (
8888
lambda: algorithmic.multiplication_generator(10, 40, 100000),
8989
lambda: algorithmic.multiplication_generator(10, 400, 10000)),
90+
"algorithmic_reverse_nlplike_decimal8K": (
91+
lambda: algorithmic.reverse_generator_nlplike(8000, 70, 100000,
92+
10, 1.300),
93+
lambda: algorithmic.reverse_generator_nlplike(8000, 700, 10000,
94+
10, 1.300)),
95+
"algorithmic_reverse_nlplike_decimal32K": (
96+
lambda: algorithmic.reverse_generator_nlplike(32000, 70, 100000,
97+
10, 1.050),
98+
lambda: algorithmic.reverse_generator_nlplike(32000, 700, 10000,
99+
10, 1.050)),
90100
"algorithmic_algebra_inverse": (
91101
lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
92102
lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),

tensor2tensor/bin/t2t-trainer

100644100755
File mode changed.

tensor2tensor/data_generators/algorithmic.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,75 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
9393
"targets": list(reversed(inputs)) + [1]} # [1] for EOS
9494

9595

96+
def zipf_distribution(nbr_symbols, alpha=1.5):
  """Helper function: Create a Zipf distribution.

  Args:
    nbr_symbols: number of symbols to use in the distribution.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
      Usually for modelling natural text distribution is in
      the range [1.1-1.6].

  Returns:
    distr_map: list of float, cumulative Zipf probability map over
      nbr_symbols + 1 entries (first entry 0.0, last entry 1.0),
      suitable for sampling with np.searchsorted.
  """
  # Unnormalized Zipf weights 1 / k**alpha for ranks k = 1..nbr_symbols.
  tmp = np.power(np.arange(1, nbr_symbols + 1), -alpha)
  # Prepend 0.0 and accumulate so the result is a cumulative map (CDF).
  zeta = np.r_[0.0, np.cumsum(tmp)]
  # Normalize so the final entry is exactly 1.0.
  return [x / zeta[-1] for x in zeta]
112+
113+
114+
def zipf_random_sample(distr_map, sample_len):
  """Helper function: Generate a random Zipf sample of given length.

  Args:
    distr_map: list of float, Zipf's distribution over nbr_symbols.
    sample_len: integer, length of sequence to generate.

  Return:
    sample: list of integer, Zipf's random sample over nbr_symbols.

  """
  draws = np.random.random(sample_len)
  # np.random.random yields values in [0.0, 1.0); a draw of exactly 0.0 is
  # extremely unlikely but possible, so the zero-rank case is handled below.
  # Shifting each rank by +1 (or +2 for rank zero) also keeps the reserved
  # symbols PAD(0) and EOS(1) out of the sample.
  ranks = np.searchsorted(distr_map, draws)
  return [rank + 1 if rank > 0 else rank + 2 for rank in ranks]
132+
133+
134+
def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
                              scale_std_dev=100, alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of the sequence is drawn from a Gaussian(Normal) distribution
  at random from [1, max_length] and with std deviation of 1%,
  then symbols are drawn from Zipf's law at random from [2, nbr_symbols] until
  nbr_cases sequences have been produced.

  Args:
    nbr_symbols: integer, number of distinct symbols to draw from.
    max_length: integer, maximum length of sequences to generate.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, Normal distribution's standard deviation scale factor
      used to draw the length of sequence. Default = 1% of the max_length.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
      Usually for modelling natural text distribution is in
      the range [1.1-1.6].

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed.
  """
  std_dev = max_length / scale_std_dev
  distr_map = zipf_distribution(nbr_symbols, alpha)
  # range (not xrange) keeps this Python 2/3 compatible.
  for _ in range(nbr_cases):
    # Draw a sequence length centered on max_length / 2; abs() + 1 keeps the
    # length strictly positive.
    l = int(abs(np.random.normal(loc=max_length / 2, scale=std_dev)) + 1)
    inputs = zipf_random_sample(distr_map, l)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
163+
164+
96165
def lower_endian_to_number(l, base):
  """Helper function: convert a list of digits in the given base to a number."""
  number = 0
  for power, digit in enumerate(l):
    number += digit * base**power
  return number

tensor2tensor/data_generators/algorithmic_test.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,22 @@ def testReverseGenerator(self):
4141
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
4242
self.assertEqual(counter, 10)
4343

44+
def testZipfDistribution(self):
45+
# Following Zipf's Law with alpha equals 1: the first in rank is two times
46+
# more probable/frequent that the second in rank, three times more prob/freq
47+
# that the third in rank and so on.
48+
d = algorithmic.zipf_distribution(10, 1.0001)
49+
for i in xrange(len(d[1:])-1):
50+
self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \
51+
"%.4f" % d[1])
52+
53+
def testReverseGeneratorNlpLike(self):
54+
counter = 0
55+
for d in algorithmic.reverse_generator_nlplike(3, 8, 10):
56+
counter += 1
57+
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
58+
self.assertEqual(counter, 10)
59+
4460
def testLowerEndianToNumber(self):
4561
self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0)
4662
self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0)

tensor2tensor/data_generators/problem_hparams.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,8 @@ def image_mscoco_tokens(model_hparams, vocab_count):
682682
"algorithmic_multiplication_decimal40": lambda p: algorithmic(12, p),
683683
"algorithmic_reverse_binary40": lambda p: algorithmic(4, p),
684684
"algorithmic_reverse_decimal40": lambda p: algorithmic(12, p),
685+
"algorithmic_reverse_nlplike_decimal8K": lambda p: algorithmic(8002, p),
686+
"algorithmic_reverse_nlplike_decimal32K": lambda p: algorithmic(32002, p),
685687
"algorithmic_shift_decimal40": lambda p: algorithmic(22, p),
686688
"audio_timit_characters_tune": audio_timit_characters,
687689
"audio_timit_characters_test": audio_timit_characters,

0 commit comments

Comments
 (0)