@@ -117,9 +117,9 @@ def attention_lm_moe_base():
   """Set of hyperparameters.

   suitable for 1 gpu.
-  on lm1b_16k:
-    ~337M params
-    1.1 steps/sec on [GeForce GTX TITAN X]
+  on lm1b_32k:
+    ~229M params
+    0.9 steps/sec on [GeForce GTX TITAN X]

   Returns:
     a hparams object
@@ -133,7 +133,7 @@ def attention_lm_moe_base():
   hparams.optimizer_adam_epsilon = 1e-9
   hparams.learning_rate_decay_scheme = "noam"
   hparams.learning_rate = 0.1
-  hparams.learning_rate_warmup_steps = 1000
+  hparams.learning_rate_warmup_steps = 2000
   hparams.initializer_gain = 1.0
   hparams.num_hidden_layers = 4
   hparams.initializer = "uniform_unit_scaling"
@@ -143,14 +143,14 @@ def attention_lm_moe_base():
   hparams.num_sampled_classes = 0
   hparams.label_smoothing = 0.0
   hparams.shared_embedding_and_softmax_weights = int(False)
-  hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
+  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
   # comma-separated list of layer numbers.
   # At each of these layers, we replace the ffn with a mixture of experts.
   hparams.add_hparam("moe_layers", "2")
   # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
   # If moe_n2 is an integer, then use a hierarchical MoE
   # consisting of moe_n1 groups of moe_n2 experts each.
-  hparams.add_hparam("moe_n1", 64)
+  hparams.add_hparam("moe_n1", 32)
   hparams.add_hparam("moe_n2", 0)
   hparams.add_hparam("moe_hidden_size", 2048)
   hparams.add_hparam("moe_loss_coef", 1e-2)
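
# Not part of the diff above. A minimal sketch of how the moe_n1 / moe_n2
# comments translate into a configuration: per those comments, a non-integer
# moe_n2 (None) gives a flat MoE with moe_n1 experts, while an integer moe_n2
# requests a hierarchical MoE of moe_n1 groups with moe_n2 experts each.
# The hparams set below and its expert counts (8 groups of 4) are hypothetical
# illustrations, reusing only the registry decorator and attention_lm_moe_base
# shown in this file.
@registry.register_hparams
def attention_lm_moe_hierarchical_example():
  """Hypothetical variant: a hierarchical MoE with 8 groups of 4 experts."""
  hparams = attention_lm_moe_base()
  hparams.moe_n1 = 8  # number of expert groups
  hparams.moe_n2 = 4  # experts per group
  return hparams
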
@@ -171,9 +171,11 @@ def attention_lm_moe_base():
 def attention_lm_moe_small():
   """Cheap model for single-gpu training.

-  on lm1b_16k:
-    ~295M params
-    2 steps/sec on [GeForce GTX TITAN X]
+  on lm1b_32k:
+    ~312M params
+    1.6 steps/sec on [GeForce GTX TITAN X]
+  After 50K steps on 8 GPUs (synchronous):
+    eval_log_ppl_per_token = 3.31

   Returns:
     an hparams object.
@@ -188,13 +190,36 @@ def attention_lm_moe_small():
   return hparams


+@registry.register_hparams
+def attention_lm_no_moe_small():
+  """Without the mixture of experts (for comparison).
+
+  on lm1b_32k:
+    ~45M params
+    2 steps/sec on [GeForce GTX TITAN X]
+  After 50K steps on 8 GPUs (synchronous):
+    eval_log_ppl_per_token = 3.51
+
+  Returns:
+    an hparams object.
+  """
+  hparams = attention_lm_moe_small()
+  hparams.moe_layers = ""
+  return hparams
+
+
 @registry.register_hparams
 def attention_lm_moe_large():
   """Large model for distributed training.

   Over 1B parameters, so requires multi-gpu training due to memory
   requirements.

+  on lm1b_32k:
+    After 45K steps on 8 GPUs (synchronous):
+      eval_log_ppl_per_token = 3.18
+      eval_ppl_per_word = exp(1.107893 * eval_log_ppl_per_token) = 33.9
+
   Returns:
     an hparams object.
   """