Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 79f309a

Browse files
committed
Add new generator: algorithmic_reverse_nlplike that generates samples following Zipf's Law
1 parent 38b9c11 commit 79f309a

File tree

3 files changed

+90
-0
lines changed

3 files changed

+90
-0
lines changed

tensor2tensor/bin/t2t-datagen

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@ _SUPPORTED_PROBLEM_GENERATORS = {
8686
"algorithmic_multiplication_decimal40": (
8787
lambda: algorithmic.multiplication_generator(10, 40, 100000),
8888
lambda: algorithmic.multiplication_generator(10, 400, 10000)),
89+
"algorithmic_reverse_nlplike_decimal8K": (
90+
lambda: algorithmic.reverse_generator_nlplike(8000, 40, 100000,
91+
10, 1.250),
92+
lambda: algorithmic.reverse_generator_nlplike(8000, 400, 10000,
93+
10, 1.250)),
94+
"algorithmic_reverse_nlplike_decimal32K": (
95+
lambda: algorithmic.reverse_generator_nlplike(32000, 40, 100000,
96+
10, 1.005),
97+
lambda: algorithmic.reverse_generator_nlplike(32000, 400, 10000,
98+
10, 1.005)),
8999
"algorithmic_algebra_inverse": (
90100
lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
91101
lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),

tensor2tensor/data_generators/algorithmic.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,70 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
9393
"targets": list(reversed(inputs)) + [1]} # [1] for EOS
9494

9595

96+
def zipf_distribution(nbr_symbols, alpha):
  """Helper function: Create a Zipf cumulative distribution map.

  The returned list has nbr_symbols + 1 entries: a leading 0.0 followed by
  the normalized cumulative sums of rank**(-alpha) for ranks 1..nbr_symbols,
  ending at exactly 1.0.  The leading 0.0 and the cumulative form make the
  list directly usable with np.searchsorted for inverse-CDF sampling (see
  zipf_random_sample).

  Args:
    nbr_symbols: number of symbols to use in the distribution.
    alpha: float, Zipf's Law Distribution parameter. Usually for modelling
        natural text the value is in the range [1.1-1.6].

  Returns:
    distr_map: list of nbr_symbols + 1 floats, the cumulative Zipf
        distribution over the symbols.
  """
  # Unnormalized Zipf weights: 1**-alpha, 2**-alpha, ..., nbr_symbols**-alpha.
  tmp = np.power(np.arange(1, nbr_symbols + 1), -alpha)
  # Prepend 0.0 and accumulate, then normalize so the last entry is 1.0.
  zeta = np.r_[0.0, np.cumsum(tmp)]
  return [x / zeta[-1] for x in zeta]
112+
113+
114+
def zipf_random_sample(distr_map, sample_len):
  """Helper function: Generate a random Zipf sample of given length.

  Args:
    distr_map: list of float, cumulative Zipf distribution over the symbols
        as produced by zipf_distribution (leading 0.0, ending at 1.0).
    sample_len: integer, length of sequence to generate.

  Returns:
    sample: list of integer, Zipf's random sample over nbr_symbols.
  """
  u = np.random.random(sample_len)
  # Inverse-CDF sampling: searchsorted maps each uniform draw to its rank.
  # side='right' guarantees an index >= 1 even when a draw is exactly 0.0
  # (np.random.random samples from [0, 1)), so after the +1 shift no sample
  # can collide with the reserved ids 0 (PAD) and 1 (EOS).
  return [t + 1 for t in np.searchsorted(distr_map, u, side="right")]
127+
128+
129+
def reverse_generator_nlplike(nbr_symbols,
                              max_length,
                              nbr_cases,
                              scale_std_dev=100,
                              alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of each sequence is drawn from a Gaussian (Normal) distribution
  centered on max_length / 2 with standard deviation max_length /
  scale_std_dev (lengths are not clamped at max_length), then symbols are
  drawn from Zipf's law at random from [2, nbr_symbols + 1] until nbr_cases
  sequences have been produced.

  Args:
    nbr_symbols: integer, number of distinct symbols to sample from.
    max_length: integer, mean sequence length is max_length / 2.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, Normal distribution's standard deviation scale
        factor used to draw the length of sequence. Default = 1% of the
        max_length.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
        Usually for modelling natural text distribution is in
        the range [1.1-1.6].

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed.
  """
  std_dev = max_length / scale_std_dev
  # Compute the Zipf CDF once; every generated case reuses it.
  distr_map = zipf_distribution(nbr_symbols, alpha)
  for _ in range(nbr_cases):
    # abs() folds negative draws back to positive and + 1 guarantees a
    # minimum length of 1.
    seq_len = int(abs(np.random.normal(loc=max_length/2, scale=std_dev)) + 1)
    inputs = zipf_random_sample(distr_map, seq_len)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
158+
159+
96160
def lower_endian_to_number(l, base):
97161
"""Helper function: convert a list of digits in the given base to a number."""
98162
return sum([d * (base**i) for i, d in enumerate(l)])

tensor2tensor/data_generators/algorithmic_test.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,22 @@ def testReverseGenerator(self):
4141
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
4242
self.assertEqual(counter, 10)
4343

44+
def testZipfDistribution(self):
45+
# Following Zipf's Law with alpha equals 1: the first in rank is two times
46+
# more probable/frequent that the second in rank, three times more prob/freq
47+
# that the third in rank and so on.
48+
d = algorithmic.zipf_distribution(10, 1.0001)
49+
for i in xrange(len(d[1:])-1):
50+
self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \
51+
"%.4f" % d[1])
52+
53+
def testReverseGeneratorNlpLike(self):
54+
counter = 0
55+
for d in algorithmic.reverse_generator_nlplike(3, 8, 10):
56+
counter += 1
57+
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
58+
self.assertEqual(counter, 10)
59+
4460
def testLowerEndianToNumber(self):
4561
self.assertEqual(algorithmic.lower_endian_to_number([0], 2), 0)
4662
self.assertEqual(algorithmic.lower_endian_to_number([0], 7), 0)

0 commit comments

Comments
 (0)