Skip to content

Commit 32fc7bd

Browse files
committed
Further updates with Poplar SDK 2.4 release
1 parent 97e062a commit 32fc7bd

File tree

816 files changed

+108860
-3677
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

816 files changed

+108860
-3677
lines changed

LICENSE

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,22 @@
1-
Copyright 2019 Graphcore Ltd.
1+
MIT License
22

3-
Permission is hereby granted, free of charge, to any person obtaining
4-
a copy of this software and associated documentation files (the
5-
"Software"), to deal in the Software without restriction, including
6-
without limitation the rights to use, copy, modify, merge, publish,
7-
distribute, sublicense, and/or sell copies of the Software, and to
8-
permit persons to whom the Software is furnished to do so, subject to
9-
the following conditions:
3+
Copyright (c) 2019 Graphcore Ltd.
104

5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
1111

12-
The above copyright notice and this permission notice shall be
13-
included in all copies or substantial portions of the Software.
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
1414

15-
16-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
2322

applications/popart/bert/README_Benchmarks.md

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,10 @@ python bert.py --config configs/mk2/pretrain_base_384.json --input-files=$DATASE
150150

151151
Command:
152152
```console
153-
python run_squad.py --squad-do-validation False --config squad_large_384_POD16 --num-epochs 1
153+
python3 run_squad.py \
154+
--squad-do-validation False \
155+
--config squad_large_384_POD16 \
156+
--num-epochs 1
154157
```
155158

156159
## Inference
@@ -168,9 +171,21 @@ Run the following command lines from inside the applications/popart/bert directo
168171

169172
This benchmark spawns multiple replicas using mpirun. To obtain the total throughput, sum the reported throughputs for each iteration.
170173

174+
export POPLAR_ENGINE_OPTIONS='{"exchange.enablePrefetch": false}'
175+
171176
Command:
172177
```console
173-
mpirun --tag-output --np 4 --allow-run-as-root python bert.py --config configs/mk2/squad_large_128_inf.json --micro-batch-size {batchsize} --generated-data=true --epochs-inference 20 --input-files=$DATASETS_DIR/squad/dev-v1.1.json
178+
mpirun \
179+
--tag-output \
180+
--np 4 \
181+
--allow-run-as-root \
182+
python bert.py \
183+
--config configs/mk2/squad_large_128_inf.json \
184+
--micro-batch-size {batchsize} \
185+
--generated-data=true \
186+
--epochs-inference 20 \
187+
--input-files=$DATASETS_DIR/squad/dev-v1.1.json \
188+
--minimum-latency-inference
174189
```
175190

176191
Set --micro-batch-size to 1, 2 or 3.
@@ -181,9 +196,21 @@ Set --micro-batch-size to 1, 2 or 3.
181196

182197
This benchmark spawns multiple replicas using mpirun. To obtain the total throughput, sum the reported throughputs for each iteration.
183198

199+
export POPLAR_ENGINE_OPTIONS='{"exchange.enablePrefetch": false}'
200+
184201
Command:
185202
```console
186-
mpirun --tag-output --np 4 --allow-run-as-root python bert.py --config configs/mk2/squad_base_128_inf.json --micro-batch-size {batchsize} --generated-data=true --epochs-inference 10 --input-files=$DATASETS_DIR/squad/dev-v1.1.json
203+
mpirun \
204+
--tag-output \
205+
--np 4 \
206+
--allow-run-as-root \
207+
python bert.py \
208+
--config configs/mk2/squad_base_128_inf.json \
209+
--micro-batch-size {batchsize} \
210+
--generated-data=true \
211+
--epochs-inference 10 \
212+
--minimum-latency-inference \
213+
--input-files=$DATASETS_DIR/squad/dev-v1.1.json
187214
```
188215

189216
Set --micro-batch-size to 1, 2, 4, 8, 16, 32, 64, or 80.

applications/popart/bert/bert.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import popart
3333
import popdist
3434
import popdist.popart
35+
from distutils import version
36+
LooseVersion = version.LooseVersion
3537
from torch.utils.tensorboard import SummaryWriter
3638

3739
import utils
@@ -331,6 +333,13 @@ def bert_session_options(args, model):
331333
options.enableFloatingPointChecks = args.floating_point_exceptions
332334
options.enableStochasticRounding = args.stochastic_rounding
333335
options.enablePrefetchDatastreams = not args.minimum_latency_inference
336+
337+
# These options are necessary to allow poplar to overlap processing of
338+
# multiple iterations on the host side
339+
options.defaultPrefetchBufferingDepth = 3
340+
options.rearrangeAnchorsOnHost = False
341+
engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"
342+
334343
options.enableOutlining = not args.no_outlining
335344
options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
336345
partials_type = "half" if args.enable_half_partials else "float"
@@ -971,7 +980,7 @@ def setup_logger(log_level, handler=None):
971980

972981
if args.wandb and popdist_root(args):
973982
import wandb
974-
wandb.init(project="popart-bert", config=args, sync_tensorboard=True)
983+
wandb.init(project="popart-bert", config=args, sync_tensorboard=True, settings=wandb.Settings(console="wrap"))
975984
if args.wandb_checkpoint:
976985
artifact = wandb.use_artifact(args.wandb_checkpoint, type='model')
977986
artifact_dir = artifact.download()

applications/popart/bert/bert_data/pretraining_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
from tqdm import tqdm
2323
from logging import getLogger
2424
from functools import reduce
25-
25+
import popdist.popart
2626
from .dataset import DataSet
2727
from .data_sampler import DistributedDataSampler, SampleGenerator
2828
from utils.distributed import distributed_barrier
29-
import popdist.popart
29+
3030
logger = getLogger(__name__)
3131

3232

applications/popart/bert/bert_data/tfrecord_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(self,
6363
batch_size=1,
6464
dtype=np.int32,
6565
shuffle=True,
66-
pad_position_value=512,
66+
pad_position_value=511,
6767
prefetch=1,
6868
drop_remainder=True):
6969
self.files = input_files

applications/popart/bert/configs/mk2/packed_pretrain_base_128.json renamed to applications/popart/bert/configs/mk2/packed/packed_pretrain_base_128.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,31 +16,31 @@
1616
"stochastic_rounding": true,
1717
"enable_half_partials": true,
1818
"batches_per_step": 1,
19-
"training_steps": 6015,
19+
"training_steps": 7038,
2020
"steps_per_log": 100,
2121
"aggregate_metrics_over_steps": 1,
2222
"loss_scaling": 512.0,
2323
"micro_batch_size": 32,
24-
"global_batch_size": 65536,
24+
"global_batch_size": 54784,
2525
"replication_factor": 4,
2626
"split_qkv": true,
2727
"optimizer_state_offchip": true,
2828
"replicated_tensor_sharding": true,
2929
"gradient_reduction_type": "Mean",
3030
"optimizer": "LAMB_NO_BIAS",
31-
"beta1": 0.77,
32-
"beta2": 0.730,
31+
"beta1": 0.9,
32+
"beta2": 0.999,
3333
"weight_decay": 1e-2,
3434
"learning_rate_function": "Linear",
3535
"learning_rate": 0.006,
36-
"lr_warmup_steps": 1709,
36+
"lr_warmup_steps": 2000,
3737
"lr_steps_per_decay_update": 32,
3838
"shuffle": true,
3939
"duplication_factor": 1,
4040
"epochs_to_cache": 0,
4141
"embedding_serialization_vocab_steps": 5,
4242
"available_memory_proportion": [0.28],
43-
"execution_mode": "PIPELINE",
44-
"checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_128",
43+
"pipeline": true,
44+
"checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_base_128",
4545
"no_validation": true
4646
}

applications/popart/bert/configs/mk2/packed_pretrain_base_384.json renamed to applications/popart/bert/configs/mk2/packed/packed_pretrain_base_384.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,31 +16,31 @@
1616
"stochastic_rounding": true,
1717
"enable_half_partials": true,
1818
"batches_per_step": 1,
19-
"training_steps": 1526,
19+
"training_steps": 2137,
2020
"steps_per_log": 100,
2121
"aggregate_metrics_over_steps": 1,
2222
"loss_scaling": 128.0,
2323
"micro_batch_size": 8,
24-
"global_batch_size": 16384,
24+
"global_batch_size": 9600,
2525
"replication_factor": 4,
2626
"split_qkv": true,
2727
"optimizer_state_offchip": true,
2828
"replicated_tensor_sharding": true,
2929
"gradient_reduction_type": "Mean",
3030
"optimizer": "LAMB_NO_BIAS",
31-
"beta1": 0.64,
32-
"beta2": 0.510,
31+
"beta1": 0.9,
32+
"beta2": 0.999,
3333
"weight_decay": 1e-2,
3434
"learning_rate_function": "Linear",
3535
"learning_rate": 0.002828427125,
36-
"lr_warmup_steps": 196,
36+
"lr_warmup_steps": 274,
3737
"lr_steps_per_decay_update": 8,
3838
"shuffle": true,
3939
"duplication_factor": 1,
4040
"epochs_to_cache": 0,
4141
"embedding_serialization_vocab_steps": 5,
4242
"available_memory_proportion": [0.28],
43-
"execution_mode": "PIPELINE",
44-
"checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_384",
43+
"pipeline": true,
44+
"checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_base_384",
4545
"no_validation": true
4646
}

applications/popart/bert/configs/mk2/packed_pretrain_large_128.json renamed to applications/popart/bert/configs/mk2/packed/packed_pretrain_large_128.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,31 +16,31 @@
1616
"stochastic_rounding": true,
1717
"enable_half_partials": true,
1818
"batches_per_step": 1,
19-
"training_steps": 6015,
19+
"training_steps": 7038,
2020
"steps_per_log": 100,
2121
"aggregate_metrics_over_steps": 1,
2222
"loss_scaling": 64.0,
2323
"micro_batch_size": 8,
24-
"global_batch_size": 65536,
24+
"global_batch_size": 54784,
2525
"replication_factor": 4,
2626
"split_qkv": true,
2727
"optimizer_state_offchip": true,
2828
"replicated_tensor_sharding": true,
2929
"gradient_reduction_type": "Mean",
3030
"optimizer": "LAMB_NO_BIAS",
31-
"beta1": 0.77,
32-
"beta2": 0.730,
31+
"beta1": 0.9,
32+
"beta2": 0.999,
3333
"weight_decay": 1e-2,
3434
"learning_rate_function": "Linear",
3535
"learning_rate": 0.006,
36-
"lr_warmup_steps": 1709,
36+
"lr_warmup_steps": 2000,
3737
"lr_steps_per_decay_update": 32,
3838
"shuffle": true,
3939
"duplication_factor": 1,
4040
"epochs_to_cache": 0,
4141
"embedding_serialization_vocab_steps": 5,
42-
"available_memory_proportion": [0.15, 0.25, 0.25, 0.25],
43-
"execution_mode": "PIPELINE",
44-
"checkpoint_dir": "checkpoints/mk2/packed_pretrain_large_128",
42+
"available_memory_proportion": [0.15, 0.4, 0.4, 0.4],
43+
"pipeline": true,
44+
"checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_large_128",
4545
"no_validation": true
4646
}

applications/popart/bert/configs/mk2/packed_pretrain_base_384_phase_1.json renamed to applications/popart/bert/configs/mk2/packed/packed_pretrain_large_384.json

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
"task": "PRETRAINING",
33
"use_packed_sequence_format": true,
44
"max_sequences_per_pack": 3,
5-
"num_layers": 12,
6-
"encoder_start_ipu": 1,
7-
"layers_per_ipu": [4, 4, 4],
8-
"hidden_size": 768,
9-
"attention_heads": 12,
5+
"num_layers": 24,
6+
"encoder_start_ipu": 0,
7+
"layers_per_ipu": [3, 7, 7, 7],
8+
"hidden_size": 1024,
9+
"attention_heads": 16,
1010
"sequence_length": 384,
1111
"mask_tokens": 56,
1212
"vocab_length": 30400,
@@ -16,31 +16,31 @@
1616
"stochastic_rounding": true,
1717
"enable_half_partials": true,
1818
"batches_per_step": 1,
19-
"training_steps": 7038,
19+
"training_steps": 2137,
2020
"steps_per_log": 100,
2121
"aggregate_metrics_over_steps": 1,
22-
"loss_scaling": 512.0,
23-
"micro_batch_size": 8,
24-
"global_batch_size": 16384,
22+
"loss_scaling": 64.0,
23+
"micro_batch_size": 2,
24+
"global_batch_size": 9600,
2525
"replication_factor": 4,
2626
"split_qkv": true,
2727
"optimizer_state_offchip": true,
2828
"replicated_tensor_sharding": true,
2929
"gradient_reduction_type": "Mean",
3030
"optimizer": "LAMB_NO_BIAS",
31-
"beta1": 0.64,
32-
"beta2": 0.510,
31+
"beta1": 0.9,
32+
"beta2": 0.999,
3333
"weight_decay": 1e-2,
3434
"learning_rate_function": "Linear",
35-
"learning_rate": 0.006,
36-
"lr_warmup_steps": 2000,
37-
"lr_steps_per_decay_update": 32,
35+
"learning_rate": 0.002828427125,
36+
"lr_warmup_steps": 274,
37+
"lr_steps_per_decay_update": 8,
3838
"shuffle": true,
3939
"duplication_factor": 1,
4040
"epochs_to_cache": 0,
4141
"embedding_serialization_vocab_steps": 5,
42-
"available_memory_proportion": [0.28],
43-
"execution_mode": "PIPELINE",
44-
"checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_384_phase_1",
42+
"available_memory_proportion": [0.15, 0.4, 0.4, 0.4],
43+
"pipeline": true,
44+
"checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_large_384",
4545
"no_validation": true
4646
}

applications/popart/bert/configs/mk2/packed_pretrain_tiny_128.json renamed to applications/popart/bert/configs/mk2/packed/packed_pretrain_tiny_128.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"epochs_to_cache": 1,
3636
"embedding_serialization_vocab_steps": 4,
3737
"available_memory_proportion": [0.20],
38+
"pipeline": true,
3839
"checkpoint_dir": "checkpoints/mk2/packed_pretrain_tiny",
3940
"no_validation": true,
4041
"optimizer_state_offchip": false,

0 commit comments

Comments
 (0)