
Commit c4407b8

nshazeer authored and Ryan Sepassi committed

Some fixes/configs/comments for language modeling plans.

PiperOrigin-RevId: 162000426
1 parent ca7b045

File tree

tensor2tensor/models/attention_lm.py
tensor2tensor/models/attention_lm_moe.py

2 files changed: +54, -10 lines

tensor2tensor/models/attention_lm.py

Lines changed: 20 additions & 1 deletion
@@ -140,7 +140,7 @@ def attention_lm_base():
   hparams.optimizer_adam_epsilon = 1e-9
   hparams.learning_rate_decay_scheme = "noam"
   hparams.learning_rate = 0.1
-  hparams.learning_rate_warmup_steps = 1000
+  hparams.learning_rate_warmup_steps = 2000
   hparams.initializer_gain = 1.0
   hparams.num_hidden_layers = 6
   hparams.initializer = "uniform_unit_scaling"

@@ -163,3 +163,22 @@ def attention_lm_base():
   hparams.add_hparam("residual_dropout", 0.1)
   hparams.add_hparam("pos", "timing")  # timing, none
   return hparams
+
+
+@registry.register_hparams
+def attention_lm_small():
+  """Cheap model.
+
+  on lm1b_32k:
+     45M params
+     2 steps/sec on [GeForce GTX TITAN X]
+
+  Returns:
+    an hparams object.
+  """
+  hparams = attention_lm_base()
+  hparams.num_hidden_layers = 4
+  hparams.hidden_size = 512
+  hparams.filter_size = 2048
+  hparams.residual_dropout = 0.5
+  return hparams
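Aside, not part of the commit: a minimal usage sketch of the hparams set registered above, assuming it is instantiated directly rather than selected by name through the trainer. The commented values are only the overrides visible in this diff.

from tensor2tensor.models import attention_lm

# @registry.register_hparams makes the set selectable by the name
# "attention_lm_small"; here we simply call the function directly.
hparams = attention_lm.attention_lm_small()
print(hparams.num_hidden_layers)  # 4   (attention_lm_base uses 6)
print(hparams.hidden_size)        # 512
print(hparams.filter_size)        # 2048
print(hparams.residual_dropout)   # 0.5 (attention_lm_base uses 0.1)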

tensor2tensor/models/attention_lm_moe.py

Lines changed: 34 additions & 9 deletions
@@ -117,9 +117,9 @@ def attention_lm_moe_base():
   """Set of hyperparameters.

   suitable for 1 gpu.
-  on lm1b_16k:
-     ~337M params
-     1.1 steps/sec on [GeForce GTX TITAN X]
+  on lm1b_32k:
+     ~229M params
+     0.9 steps/sec on [GeForce GTX TITAN X]

   Returns:
     a hparams object

@@ -133,7 +133,7 @@ def attention_lm_moe_base():
   hparams.optimizer_adam_epsilon = 1e-9
   hparams.learning_rate_decay_scheme = "noam"
   hparams.learning_rate = 0.1
-  hparams.learning_rate_warmup_steps = 1000
+  hparams.learning_rate_warmup_steps = 2000
   hparams.initializer_gain = 1.0
   hparams.num_hidden_layers = 4
   hparams.initializer = "uniform_unit_scaling"

@@ -143,14 +143,14 @@ def attention_lm_moe_base():
   hparams.num_sampled_classes = 0
   hparams.label_smoothing = 0.0
   hparams.shared_embedding_and_softmax_weights = int(False)
-  hparams.add_hparam("filter_size", 2948)  # Add new ones like this.
+  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
   # comma-separated list of layer numbers.
   # At each of these layers, we replace the ffn with a mixture of experts.
   hparams.add_hparam("moe_layers", "2")
   # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
   # If moe_n2 is an integer, then use a hierarchical MoE
   # consisting of moe_n1 groups of moe_n2 experts each.
-  hparams.add_hparam("moe_n1", 64)
+  hparams.add_hparam("moe_n1", 32)
   hparams.add_hparam("moe_n2", 0)
   hparams.add_hparam("moe_hidden_size", 2048)
   hparams.add_hparam("moe_loss_coef", 1e-2)
@@ -171,9 +171,11 @@ def attention_lm_moe_base():
 def attention_lm_moe_small():
   """Cheap model for single-gpu training.

-  on lm1b_16k:
-     ~295M params
-     2 steps/sec on [GeForce GTX TITAN X]
+  on lm1b_32k:
+     ~312M params
+     1.6 steps/sec on [GeForce GTX TITAN X]
+  After 50K steps on 8 GPUs (synchronous):
+     eval_log_ppl_per_token = 3.31

   Returns:
     an hparams object.
@@ -188,13 +190,36 @@ def attention_lm_moe_small():
   return hparams


+@registry.register_hparams
+def attention_lm_no_moe_small():
+  """Without the mixture of experts (for comparison).
+
+  on lm1b_32k:
+     ~45M params
+     2 steps/sec on [GeForce GTX TITAN X]
+  After 50K steps on 8 GPUs (synchronous):
+     eval_log_ppl_per_token = 3.51
+
+  Returns:
+    an hparams object.
+  """
+  hparams = attention_lm_moe_small()
+  hparams.moe_layers = ""
+  return hparams
+
+
 @registry.register_hparams
 def attention_lm_moe_large():
   """Large model for distributed training.

   Over 1B parameters, so requires multi-gpu training due to memory
   requirements.
+
+  on lm1b_32k:
+  After 45K steps on 8 GPUs (synchronous):
+      eval_log_ppl_per_token = 3.18
+      eval_ppl_per_word = exp(1.107893 * eval_log_ppl_per_token) = 33.9
+
   Returns:
     an hparams object.
   """
