@@ -16,7 +16,7 @@ common_options: &common_options
1616 - [loss, "loss"]
1717 env :
1818 POPLAR_ENGINE_OPTIONS : ' {"opt.enableMultiAccessCopies":"false"}'
19- PYTORCH_CACHE_DIR : " . /pt_cache/"
19+ PYTORCH_CACHE_DIR : " /tmp /pt_cache/"
2020
2121config_options : &config_options
2222 requirements_path : requirements.txt
@@ -40,11 +40,39 @@ pytorch_resnet50_train_real_pod16:
4040 --dataloader-worker 14
4141 --dataloader-rebatch-size 256
4242
43- pytorch_resnet50_train_real_pod64_conv :
43+ pytorch_resnet50_train_real_1host_pod64_conv :
4444 << : [*common_options, *config_options]
4545 description : |
46- ResNet training on 64 Mk2 IPUs with real data
46+ ResNet training on 64 Mk2 IPUs with real data on a single host
4747 for convergence testing.
48+ cmd : >-
49+ poprun
50+ -vv
51+ --num-instances=8
52+ --num-replicas=64
53+ --vipu-server-host=$IPUOF_VIPU_API_HOST
54+ --vipu-server-port 8090
55+ --vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
56+ --vipu-allocation=$VIPU_ALLOCATION_ID
57+ --update-partition=yes
58+ --remove-partition=yes
59+ --reset-partition=no
60+ --sync-type=ST_POD_NATIVE_DEFAULT
61+ --executable-cache-path=$PYTORCH_CACHE_DIR
62+ python3 train.py
63+ --config resnet50-pod64
64+ --dataloader-worker 28
65+ --dataloader-rebatch-size 256
66+ --imagenet-data-path $DATASETS_DIR/imagenet-raw-dataset
67+ --checkpoint-output-dir ./checkpoints
68+ --wandb
69+ --validation-mode none
70+
71+ pytorch_resnet50_train_real_4host_pod64_conv :
72+ << : [*common_options, *config_options]
73+ description : |
74+ ResNet training on 64 Mk2 IPUs with real data
75+ for convergence testing on 4 hosts.
4876 cmd : >-
4977 poprun
5078 -vv
0 commit comments