Skip to content

Commit 4bfbe79

Browse files
authored
Adding pod64 single host benchmark and tweaks (#94)
* Adding pod64 single host benchmark and tweaks
* Fixing cache path and removing checkpoint freq args
* Removing checkpoint freq arg update
1 parent a621494 commit 4bfbe79

File tree

2 files changed

+37
-4
lines changed

2 files changed

+37
-4
lines changed

vision/cnns/pytorch/train/benchmarks.yml

Lines changed: 31 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ common_options: &common_options
1616
- [loss, "loss"]
1717
env:
1818
POPLAR_ENGINE_OPTIONS: '{"opt.enableMultiAccessCopies":"false"}'
19-
PYTORCH_CACHE_DIR: "./pt_cache/"
19+
PYTORCH_CACHE_DIR: "/tmp/pt_cache/"
2020

2121
config_options: &config_options
2222
requirements_path: requirements.txt
@@ -40,11 +40,39 @@ pytorch_resnet50_train_real_pod16:
4040
--dataloader-worker 14
4141
--dataloader-rebatch-size 256
4242
43-
pytorch_resnet50_train_real_pod64_conv:
43+
pytorch_resnet50_train_real_1host_pod64_conv:
4444
<<: [*common_options, *config_options]
4545
description: |
46-
ResNet training on 64 Mk2 IPUs with real data
46+
ResNet training on 64 Mk2 IPUs with real data on a single host
4747
for convergence testing.
48+
cmd: >-
49+
poprun
50+
-vv
51+
--num-instances=8
52+
--num-replicas=64
53+
--vipu-server-host=$IPUOF_VIPU_API_HOST
54+
--vipu-server-port 8090
55+
--vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
56+
--vipu-allocation=$VIPU_ALLOCATION_ID
57+
--update-partition=yes
58+
--remove-partition=yes
59+
--reset-partition=no
60+
--sync-type=ST_POD_NATIVE_DEFAULT
61+
--executable-cache-path=$PYTORCH_CACHE_DIR
62+
python3 train.py
63+
--config resnet50-pod64
64+
--dataloader-worker 28
65+
--dataloader-rebatch-size 256
66+
--imagenet-data-path $DATASETS_DIR/imagenet-raw-dataset
67+
--checkpoint-output-dir ./checkpoints
68+
--wandb
69+
--validation-mode none
70+
71+
pytorch_resnet50_train_real_4host_pod64_conv:
72+
<<: [*common_options, *config_options]
73+
description: |
74+
ResNet training on 64 Mk2 IPUs with real data
75+
for convergence testing on 4 hosts.
4876
cmd: >-
4977
poprun
5078
-vv

vision/cnns/pytorch/train/train.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,8 @@
2626

2727

2828
def train(training_model, training_data, args, lr_scheduler, epochs, optimizer, validation_function=None):
29-
logging.info("Training the model")
29+
training_start_time = datetime.now()
30+
logging.info(f"Training the model. Start: {str(training_start_time)}")
3031

3132
# A generic container used by the train function to set and update the host-side training state.
3233
class TrainingState(): pass
@@ -114,6 +115,10 @@ class TrainingState(): pass
114115
args,
115116
)
116117

118+
training_end_time = datetime.now()
119+
total_training_time = training_end_time - training_start_time
120+
logging.info(f"Finished training. Time: {str(training_end_time)}. It took: {str(total_training_time)}")
121+
117122

118123
def get_augmented_samples(args, input_data, random_generator):
119124
# Mixup coefficients are sampled on the host, cutmix coefficients are

0 commit comments

Comments
 (0)