@@ -93,6 +93,75 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
9393 "targets" : list (reversed (inputs )) + [1 ]} # [1] for EOS
9494
9595
def zipf_distribution(nbr_symbols, alpha):
  """Helper function: Create a cumulative Zipf distribution.

  Args:
    nbr_symbols: number of symbols to use in the distribution.
    alpha: float or int, Zipf's Law Distribution parameter (the exponent).
      Usually for modelling natural text distribution is in
      the range [1.1-1.6].

  Returns:
    distr_map: list of float with nbr_symbols + 1 entries. The first entry
      is 0.0 and the last is 1.0; entry k is the cumulative probability of
      the k highest-ranked symbols, so the list can be searched with a
      uniform random number to draw a rank.
  """
  # Ranks are built as floats on purpose: NumPy refuses to raise an integer
  # array to a negative integer power, which made integer alphas crash.
  ranks = np.arange(1, nbr_symbols + 1, dtype=float)
  weights = np.power(ranks, -alpha)
  # Prepend 0.0 so that the cumulative table starts at probability zero.
  zeta = np.r_[0.0, np.cumsum(weights)]
  # Normalise by the total mass so the table ends exactly at 1.0.
  return [x / zeta[-1] for x in zeta]
112+
113+
def zipf_random_sample(distr_map, sample_len):
  """Helper function: Generate a random Zipf sample of given length.

  Args:
    distr_map: list of float, cumulative Zipf distribution to search in.
    sample_len: integer, length of sequence to generate.

  Returns:
    sample: list of integer symbol ids, one per drawn value; every id is
      at least 2.
  """
  uniform_draws = np.random.random(sample_len)
  # Position of every draw inside the cumulative table.
  ranks = np.searchsorted(distr_map, uniform_draws)
  # Shifting by one keeps id 1 (EOS) out of the sample for every positive
  # rank. A rank of 0 (possible when a draw is exactly 0.0) would map to 1,
  # so the result is floored at 2; ids 0 (PAD) and 1 (EOS) are never
  # produced.
  symbol_ids = np.maximum(ranks + 1, 2)
  return list(symbol_ids)
132+
133+
def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
                              scale_std_dev=100, alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of each sequence is drawn from a Gaussian (Normal) distribution
  centred on max_length / 2 with standard deviation
  max_length / scale_std_dev, and is kept inside [1, max_length]. Symbols
  are then drawn following Zipf's law from [2, nbr_symbols + 1] (ids 0 and 1
  are reserved for PAD and EOS) until nbr_cases sequences have been produced.

  Args:
    nbr_symbols: integer, number of symbols in the Zipf distribution.
    max_length: integer, maximum length of sequences to generate.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, Normal distribution's standard deviation scale factor
      used to draw the length of sequence. Default = 100, i.e. a standard
      deviation of 1% of the max_length.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
      Usually for modelling natural text distribution is in
      the range [1.1-1.6].

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed, followed by the EOS id 1.
  """
  # Force true division so that integer arguments cannot truncate the
  # standard deviation (or the mean) when `/` is integer division.
  std_dev = float(max_length) / scale_std_dev
  mean_length = max_length / 2.0
  distr_map = zipf_distribution(nbr_symbols, alpha)
  for _ in xrange(nbr_cases):
    l = int(abs(np.random.normal(loc=mean_length, scale=std_dev)) + 1)
    # A wide Gaussian can overshoot; honour the documented upper bound while
    # never going below a single symbol.
    l = max(1, min(l, max_length))
    inputs = zipf_random_sample(distr_map, l)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
163+
164+
def lower_endian_to_number(l, base):
  """Helper function: turn little-endian digits in `base` into a number.

  The digit at position 0 is the least significant one.
  """
  weighted_digits = (
      digit * (base ** position) for position, digit in enumerate(l))
  return sum(weighted_digits)
0 commit comments