@@ -93,6 +93,70 @@ def reverse_generator(nbr_symbols, max_length, nbr_cases):
9393 "targets" : list (reversed (inputs )) + [1 ]} # [1] for EOS
9494
9595
def zipf_distribution(nbr_symbols, alpha):
  """Helper function: Create a cumulative Zipf distribution.

  Args:
    nbr_symbols: integer, number of symbols to use in the distribution.
    alpha: float or integer, Zipf's Law Distribution parameter (no default
      here; callers must supply it). For modelling natural text it is
      usually in the range [1.1-1.6].

  Returns:
    distr_map: list of float with nbr_symbols + 1 entries. Entry 0 is 0.0
      and entry i is the normalized sum of rank ** -alpha over ranks 1..i,
      so the last entry is 1.0.
  """
  # Use float ranks: NumPy refuses to raise an integer array to a negative
  # integer power, which made an integer alpha (e.g. alpha=2) raise.
  ranks = np.arange(1, nbr_symbols + 1, dtype=float)
  weights = np.power(ranks, -alpha)
  # Prepend 0.0 so consecutive entries delimit one bucket per symbol rank.
  zeta = np.r_[0.0, np.cumsum(weights)]
  return (zeta / zeta[-1]).tolist()
112+
113+
def zipf_random_sample(distr_map, sample_len):
  """Helper function: Generate a random Zipf sample of given length.

  Args:
    distr_map: list of float, cumulative Zipf distribution in ascending
      order (as built by zipf_distribution: first entry 0.0, last 1.0).
    sample_len: integer, length of sequence to generate.

  Returns:
    sample: list of integer, one symbol id per draw. Each id is the index of
      the bucket of distr_map containing the draw, plus 1, so that ids 0
      (padding) and 1 (EOS) are never produced for such a distr_map.
  """
  u = np.random.random(sample_len)
  # np.random.random draws from the half-open interval [0.0, 1.0). With
  # side="right" a draw of exactly 0.0 lands in bucket 1 rather than at
  # index 0, which after the shift below would have been 1, the EOS id.
  buckets = np.searchsorted(distr_map, u, side="right")
  return [t + 1 for t in buckets]  # shift past 0 (pad) and 1 (EOS)
127+
128+
def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
                              scale_std_dev=100, alpha=1.5):
  """Generator for the reversing nlp-like task on sequences of symbols.

  The length of each sequence is drawn from a Gaussian (Normal) distribution
  centred on max_length / 2 with a standard deviation of
  max_length / scale_std_dev; the absolute value of the draw plus one is
  truncated to an integer. Symbols are then drawn at random following
  Zipf's law, until nbr_cases sequences have been produced.

  Args:
    nbr_symbols: integer, number of symbols in the Zipf distribution.
    max_length: integer, sets the mean (max_length / 2) and the spread of
      the lengths of the generated sequences.
    nbr_cases: the number of cases to generate.
    scale_std_dev: float, Normal distribution's standard deviation scale
      factor used to draw the length of sequence. Default = 100, i.e. a
      standard deviation of 1% of max_length.
    alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
      Usually for modelling natural text distribution is in
      the range [1.1-1.6].

  Yields:
    A dictionary {"inputs": input-list, "targets": target-list} where
    target-list is input-list reversed, followed by 1 (EOS).
  """
  # Force float division: with integer arguments under Python 2 semantics,
  # `/` would truncate (a standard deviation of 0 whenever
  # max_length < scale_std_dev) unless the module enables true division.
  mean_length = max_length / 2.0
  std_dev = max_length / float(scale_std_dev)
  distr_map = zipf_distribution(nbr_symbols, alpha)
  for _ in xrange(nbr_cases):
    # abs() + 1 keeps the drawn length at 1 or more.
    length = int(abs(np.random.normal(loc=mean_length, scale=std_dev)) + 1)
    inputs = zipf_random_sample(distr_map, length)
    yield {"inputs": inputs,
           "targets": list(reversed(inputs)) + [1]}  # [1] for EOS
158+
159+
def lower_endian_to_number(l, base):
  """Helper function: convert a list of digits in the given base to a number.

  The digit at index i is weighted by base ** i, i.e. the least significant
  digit comes first. An empty list yields 0.
  """
  total = 0
  for position, digit in enumerate(l):
    total += digit * (base ** position)
  return total
0 commit comments