graphcore
diff --git a/‎LICENSE‎
Lines changed: 17 additions & 18 deletions b/‎LICENSE‎
Lines changed: 17 additions & 18 deletions
diff --git a/‎applications/popart/bert/README_Benchmarks.md‎
Lines changed: 30 additions & 3 deletions b/‎applications/popart/bert/README_Benchmarks.md‎
Lines changed: 30 additions & 3 deletions
diff --git a/‎applications/popart/bert/bert.py‎
Lines changed: 10 additions & 1 deletion b/‎applications/popart/bert/bert.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎applications/popart/bert/bert_data/pretraining_dataset.py‎
Lines changed: 2 additions & 2 deletions b/‎applications/popart/bert/bert_data/pretraining_dataset.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎applications/popart/bert/bert_data/tfrecord_dataset.py‎
Lines changed: 1 addition & 1 deletion b/‎applications/popart/bert/bert_data/tfrecord_dataset.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎applications/popart/bert/configs/mk2/packed_pretrain_base_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_base_128.json‎
Lines changed: 7 additions & 7 deletions b/‎applications/popart/bert/configs/mk2/packed_pretrain_base_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_base_128.json‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎applications/popart/bert/configs/mk2/packed_pretrain_base_384.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_base_384.json‎
Lines changed: 7 additions & 7 deletions b/‎applications/popart/bert/configs/mk2/packed_pretrain_base_384.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_base_384.json‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎applications/popart/bert/configs/mk2/packed_pretrain_large_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_large_128.json‎
Lines changed: 8 additions & 8 deletions b/‎applications/popart/bert/configs/mk2/packed_pretrain_large_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_large_128.json‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎applications/popart/bert/configs/mk2/packed_pretrain_base_384_phase_1.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_large_384.json‎
Lines changed: 17 additions & 17 deletions b/‎applications/popart/bert/configs/mk2/packed_pretrain_base_384_phase_1.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_large_384.json‎
Lines changed: 17 additions & 17 deletions
diff --git a/‎applications/popart/bert/configs/mk2/packed_pretrain_tiny_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_tiny_128.json‎
Lines changed: 1 addition & 0 deletions b/‎applications/popart/bert/configs/mk2/packed_pretrain_tiny_128.json‎ renamed to ‎applications/popart/bert/configs/mk2/packed/packed_pretrain_tiny_128.json‎
Lines changed: 1 addition & 0 deletions
@@ -1,23 +1,22 @@
-Copyright 2019 Graphcore Ltd.
+MIT License
 
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
+Copyright (c) 2019 Graphcore Ltd.
 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 
@@ -150,7 +150,10 @@ python bert.py --config configs/mk2/pretrain_base_384.json --input-files=$DATASE
 
 Command:
 ```console
-python run_squad.py --squad-do-validation False --config squad_large_384_POD16 --num-epochs 1
+python3 run_squad.py \
+   --squad-do-validation False \
+   --config squad_large_384_POD16 \
+   --num-epochs 1
 ```
 
 ## Inference
@@ -168,9 +171,21 @@ Run the following command lines from inside the applications/popart/bert directo
 
 This benchmark spawns multiple replicas using mpirun. To obtain the total throughput, sum the reported throughputs for each iteration.
 
+export POPLAR_ENGINE_OPTIONS='{"exchange.enablePrefetch": false}'
+
 Command:
 ```console
-mpirun --tag-output --np 4 --allow-run-as-root python bert.py --config configs/mk2/squad_large_128_inf.json           --micro-batch-size {batchsize} --generated-data=true --epochs-inference 20 --input-files=$DATASETS_DIR/squad/dev-v1.1.json
+   mpirun \
+      --tag-output \
+      --np 4 \
+      --allow-run-as-root \
+      python bert.py \
+      --config configs/mk2/squad_large_128_inf.json \
+      --micro-batch-size {batchsize} \
+      --generated-data=true \
+      --epochs-inference 20 \
+      --input-files=$DATASETS_DIR/squad/dev-v1.1.json \
+      --minimum-latency-inference
 ```
 
 Set --micro-batch-size to 1, 2 or 3.
@@ -181,9 +196,21 @@ Set --micro-batch-size to 1, 2 or 3.
 
 This benchmark spawns multiple replicas using mpirun. To obtain the total throughput, sum the reported throughputs for each iteration.
 
+export POPLAR_ENGINE_OPTIONS='{"exchange.enablePrefetch": false}'
+
 Command:
 ```console
-mpirun --tag-output --np 4 --allow-run-as-root python bert.py --config configs/mk2/squad_base_128_inf.json --micro-batch-size {batchsize} --generated-data=true --epochs-inference 10 --input-files=$DATASETS_DIR/squad/dev-v1.1.json
+    mpirun \
+       --tag-output \
+       --np 4 \
+       --allow-run-as-root \
+    python bert.py \
+       --config configs/mk2/squad_base_128_inf.json \
+       --micro-batch-size {batchsize} \
+       --generated-data=true \
+       --epochs-inference 10 \
+       --minimum-latency-inference \
+       --input-files=$DATASETS_DIR/squad/dev-v1.1.json
 ```
 
 Set --micro-batch-size to 1, 2, 4, 8, 16, 32, 64, or 80 
 
@@ -32,6 +32,8 @@
 import popart
 import popdist
 import popdist.popart
+from distutils import version
+LooseVersion = version.LooseVersion
 from torch.utils.tensorboard import SummaryWriter
 
 import utils
@@ -331,6 +333,13 @@ def bert_session_options(args, model):
     options.enableFloatingPointChecks = args.floating_point_exceptions
     options.enableStochasticRounding = args.stochastic_rounding
     options.enablePrefetchDatastreams = not args.minimum_latency_inference
+
+    # These options are necessary to allow Poplar to overlap the processing of
+    # multiple iterations on the host side
+    options.defaultPrefetchBufferingDepth = 3
+    options.rearrangeAnchorsOnHost = False
+    engine_options["exchange.streamBufferOverlap"] = "hostRearrangeOnly"
+
     options.enableOutlining = not args.no_outlining
     options.subgraphCopyingStrategy = popart.SubgraphCopyingStrategy.JustInTime
     partials_type = "half" if args.enable_half_partials else "float"
@@ -971,7 +980,7 @@ def setup_logger(log_level, handler=None):
 
     if args.wandb and popdist_root(args):
         import wandb
-        wandb.init(project="popart-bert", config=args, sync_tensorboard=True)
+        wandb.init(project="popart-bert", config=args, sync_tensorboard=True, settings=wandb.Settings(console="wrap"))
         if args.wandb_checkpoint:
             artifact = wandb.use_artifact(args.wandb_checkpoint, type='model')
             artifact_dir = artifact.download()
 
@@ -22,11 +22,11 @@
 from tqdm import tqdm
 from logging import getLogger
 from functools import reduce
-
+import popdist.popart
 from .dataset import DataSet
 from .data_sampler import DistributedDataSampler, SampleGenerator
 from utils.distributed import distributed_barrier
-import popdist.popart
+
 logger = getLogger(__name__)
 
 
 
@@ -63,7 +63,7 @@ def __init__(self,
                  batch_size=1,
                  dtype=np.int32,
                  shuffle=True,
-                 pad_position_value=512,
+                 pad_position_value=511,
                  prefetch=1,
                  drop_remainder=True):
         self.files = input_files
 
@@ -16,31 +16,31 @@
     "stochastic_rounding": true,
     "enable_half_partials": true,
     "batches_per_step":  1,
-    "training_steps": 6015,
+    "training_steps": 7038,
     "steps_per_log": 100,
     "aggregate_metrics_over_steps": 1,
     "loss_scaling": 512.0,
     "micro_batch_size": 32,
-    "global_batch_size": 65536,
+    "global_batch_size": 54784,
     "replication_factor": 4,
     "split_qkv": true,
     "optimizer_state_offchip": true,
     "replicated_tensor_sharding": true,
     "gradient_reduction_type": "Mean",
     "optimizer": "LAMB_NO_BIAS",
-    "beta1": 0.77,
-    "beta2": 0.730,
+    "beta1": 0.9,
+    "beta2": 0.999,
     "weight_decay": 1e-2,
     "learning_rate_function": "Linear",
     "learning_rate": 0.006,
-    "lr_warmup_steps": 1709,
+    "lr_warmup_steps": 2000,
     "lr_steps_per_decay_update": 32,
     "shuffle": true,
     "duplication_factor": 1,
     "epochs_to_cache": 0,
     "embedding_serialization_vocab_steps": 5,
     "available_memory_proportion": [0.28],
-    "execution_mode": "PIPELINE",
-    "checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_128",
+    "pipeline": true,
+    "checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_base_128",
     "no_validation": true
 }
@@ -16,31 +16,31 @@
     "stochastic_rounding": true,
     "enable_half_partials": true,
     "batches_per_step":  1,
-    "training_steps": 1526,
+    "training_steps": 2137,
     "steps_per_log": 100,
     "aggregate_metrics_over_steps": 1,
     "loss_scaling": 128.0,
     "micro_batch_size": 8,
-    "global_batch_size": 16384,
+    "global_batch_size": 9600,
     "replication_factor": 4,
     "split_qkv": true,
     "optimizer_state_offchip": true,
     "replicated_tensor_sharding": true,
     "gradient_reduction_type": "Mean",
     "optimizer": "LAMB_NO_BIAS",
-    "beta1": 0.64,
-    "beta2": 0.510,
+    "beta1": 0.9,
+    "beta2": 0.999,
     "weight_decay": 1e-2,
     "learning_rate_function": "Linear",
     "learning_rate": 0.002828427125,
-    "lr_warmup_steps": 196,
+    "lr_warmup_steps": 274,
     "lr_steps_per_decay_update": 8,
     "shuffle": true,
     "duplication_factor": 1,
     "epochs_to_cache": 0,
     "embedding_serialization_vocab_steps": 5,
     "available_memory_proportion": [0.28],
-    "execution_mode": "PIPELINE",
-    "checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_384",
+    "pipeline": true,
+    "checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_base_384",
     "no_validation": true
 }
@@ -16,31 +16,31 @@
     "stochastic_rounding": true,
     "enable_half_partials": true,
     "batches_per_step":  1,
-    "training_steps": 6015,
+    "training_steps": 7038,
     "steps_per_log": 100,
     "aggregate_metrics_over_steps": 1,
     "loss_scaling": 64.0,
     "micro_batch_size": 8,
-    "global_batch_size": 65536,
+    "global_batch_size": 54784,
     "replication_factor": 4,
     "split_qkv": true,
     "optimizer_state_offchip": true,
     "replicated_tensor_sharding": true,
     "gradient_reduction_type": "Mean",
     "optimizer": "LAMB_NO_BIAS",
-    "beta1": 0.77,
-    "beta2": 0.730,
+    "beta1": 0.9,
+    "beta2": 0.999,
     "weight_decay": 1e-2,
     "learning_rate_function": "Linear",
     "learning_rate": 0.006,
-    "lr_warmup_steps": 1709,
+    "lr_warmup_steps": 2000,
     "lr_steps_per_decay_update": 32,
     "shuffle": true,
     "duplication_factor": 1,
     "epochs_to_cache": 0,
     "embedding_serialization_vocab_steps": 5,
-    "available_memory_proportion": [0.15, 0.25, 0.25, 0.25],
-    "execution_mode": "PIPELINE",
-    "checkpoint_dir": "checkpoints/mk2/packed_pretrain_large_128",
+    "available_memory_proportion": [0.15, 0.4, 0.4, 0.4],
+    "pipeline": true,
+    "checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_large_128",
     "no_validation": true
 }
@@ -2,11 +2,11 @@
     "task": "PRETRAINING",
     "use_packed_sequence_format": true,
     "max_sequences_per_pack": 3,
-    "num_layers": 12,
-    "encoder_start_ipu": 1,
-    "layers_per_ipu": [4, 4, 4],
-    "hidden_size": 768,
-    "attention_heads": 12,
+    "num_layers": 24,
+    "encoder_start_ipu": 0,
+    "layers_per_ipu": [3, 7, 7, 7],
+    "hidden_size": 1024,
+    "attention_heads": 16,
     "sequence_length": 384,
     "mask_tokens": 56,
     "vocab_length": 30400,
@@ -16,31 +16,31 @@
     "stochastic_rounding": true,
     "enable_half_partials": true,
     "batches_per_step":  1,
-    "training_steps": 7038,
+    "training_steps": 2137,
     "steps_per_log": 100,
     "aggregate_metrics_over_steps": 1,
-    "loss_scaling": 512.0,
-    "micro_batch_size": 8,
-    "global_batch_size": 16384,
+    "loss_scaling": 64.0,
+    "micro_batch_size": 2,
+    "global_batch_size": 9600,
     "replication_factor": 4,
     "split_qkv": true,
     "optimizer_state_offchip": true,
     "replicated_tensor_sharding": true,
     "gradient_reduction_type": "Mean",
     "optimizer": "LAMB_NO_BIAS",
-    "beta1": 0.64,
-    "beta2": 0.510,
+    "beta1": 0.9,
+    "beta2": 0.999,
     "weight_decay": 1e-2,
     "learning_rate_function": "Linear",
-    "learning_rate": 0.006,
-    "lr_warmup_steps": 2000,
-    "lr_steps_per_decay_update": 32,
+    "learning_rate": 0.002828427125,
+    "lr_warmup_steps": 274,
+    "lr_steps_per_decay_update": 8,
     "shuffle": true,
     "duplication_factor": 1,
     "epochs_to_cache": 0,
     "embedding_serialization_vocab_steps": 5,
-    "available_memory_proportion": [0.28],
-    "execution_mode": "PIPELINE",
-    "checkpoint_dir": "checkpoints/mk2/packed_pretrain_base_384_phase_1",
+    "available_memory_proportion": [0.15, 0.4, 0.4, 0.4],
+    "pipeline": true,
+    "checkpoint_dir": "checkpoints/mk2/packed/packed_pretrain_large_384",
     "no_validation": true
 }
@@ -35,6 +35,7 @@
     "epochs_to_cache": 1,
     "embedding_serialization_vocab_steps": 4,
     "available_memory_proportion": [0.20],
+    "pipeline": true,
     "checkpoint_dir": "checkpoints/mk2/packed_pretrain_tiny",
     "no_validation": true,
     "optimizer_state_offchip": false,