Adding pod64 single host benchmark and tweaks (#94)

hiteshk-gc · web-flow · commit 4bfbe794c266 · 2023-03-10T10:29:06.000Z
* Adding pod64 single host benchmark and tweaks

* Fixing cache path and removing checkpoint freq args

* Removing checkpoint freq arg update
diff --git a/vision/cnns/pytorch/train/benchmarks.yml b/vision/cnns/pytorch/train/benchmarks.yml
@@ -16,7 +16,7 @@ common_options: &common_options
     - [loss, "loss"]
   env:
     POPLAR_ENGINE_OPTIONS: '{"opt.enableMultiAccessCopies":"false"}'
-    PYTORCH_CACHE_DIR: "./pt_cache/"
+    PYTORCH_CACHE_DIR: "/tmp/pt_cache/"
 
 config_options: &config_options
   requirements_path: requirements.txt
@@ -40,11 +40,39 @@ pytorch_resnet50_train_real_pod16:
       --dataloader-worker 14
       --dataloader-rebatch-size 256
 
-pytorch_resnet50_train_real_pod64_conv:
+pytorch_resnet50_train_real_1host_pod64_conv:
   <<: [*common_options, *config_options]
   description: |
-    ResNet training on 64 Mk2 IPUs with real data
+    ResNet training on 64 Mk2 IPUs with real data on a single host
     for convergence testing.
+  cmd: >-
+    poprun
+      -vv
+      --num-instances=8
+      --num-replicas=64
+      --vipu-server-host=$IPUOF_VIPU_API_HOST
+      --vipu-server-port 8090
+      --vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
+      --vipu-allocation=$VIPU_ALLOCATION_ID
+      --update-partition=yes
+      --remove-partition=yes
+      --reset-partition=no
+      --sync-type=ST_POD_NATIVE_DEFAULT
+      --executable-cache-path=$PYTORCH_CACHE_DIR
+    python3 train.py
+      --config resnet50-pod64
+      --dataloader-worker 28
+      --dataloader-rebatch-size 256
+      --imagenet-data-path $DATASETS_DIR/imagenet-raw-dataset
+      --checkpoint-output-dir ./checkpoints
+      --wandb
+      --validation-mode none
+
+pytorch_resnet50_train_real_4host_pod64_conv:
+  <<: [*common_options, *config_options]
+  description: |
+    ResNet training on 64 Mk2 IPUs with real data
+    for convergence testing on 4 hosts.
   cmd: >-
     poprun
       -vv
diff --git a/vision/cnns/pytorch/train/train.py b/vision/cnns/pytorch/train/train.py
@@ -26,7 +26,8 @@
 
 
 def train(training_model, training_data, args, lr_scheduler, epochs, optimizer, validation_function=None):
-    logging.info("Training the model")
+    training_start_time = datetime.now()
+    logging.info(f"Training the model. Start: {str(training_start_time)}")
 
     # A generic container used by the train function to set and update the host-side training state.
     class TrainingState(): pass
@@ -114,6 +115,10 @@ class TrainingState(): pass
                     args,
                 )
 
+    training_end_time = datetime.now()
+    total_training_time = training_end_time - training_start_time
+    logging.info(f"Finished training. Time: {str(training_end_time)}. It took: {str(total_training_time)}")
+
 
 def get_augmented_samples(args, input_data, random_generator):
     # Mixup coefficients are sampled on the host, cutmix coefficients are