Log effective batch size during training when gradient accumulation and/or replication is used
Summary:
Unless the user has prior knowledge, it isn't obvious that the batch size provided to `fit()` (or `dataset.batch(batch_size)`) is per replica, rather than the effective batch size from an ML perspective. This confusion is compounded when gradient accumulation is also used.
This diff adds a log message stating the effective batch size from the training optimizer's perspective. The message is built dynamically by considering the following 3 cases (see the sketch after this list):
- gradient accumulation > 1 - log only mentions gradient accumulation
- number of replicas > 1 - log only mentions replication
- number of replicas > 1 and gradient accumulation > 1 - log mentions both
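In every case the effective batch size is the per-replica batch size multiplied by the gradient accumulation factor and the number of replicas. As an illustration only, here is a minimal sketch of how such a message could be assembled; the function name, signature, and wording are hypothetical and not the diff's actual code:

```python
# Hypothetical sketch: not the actual code added by this diff.
import logging

logger = logging.getLogger(__name__)


def log_effective_batch_size(per_replica_batch_size,
                             gradient_accumulation_steps=1,
                             num_replicas=1):
    """Log the effective batch size seen by the training optimizer.

    effective = per_replica_batch_size * gradient_accumulation_steps * num_replicas
    """
    effective = (per_replica_batch_size
                 * gradient_accumulation_steps
                 * num_replicas)

    # Mention only the factors that actually apply, matching the
    # three cases described above.
    reasons = []
    if gradient_accumulation_steps > 1:
        reasons.append(
            f"gradient accumulation over {gradient_accumulation_steps} steps")
    if num_replicas > 1:
        reasons.append(f"replication across {num_replicas} replicas")

    if reasons:
        logger.info(
            "Effective batch size is %d (per-replica batch size %d, due to %s).",
            effective, per_replica_batch_size, " and ".join(reasons))
```

For example, a per-replica batch size of 16 with 4 accumulation steps on 2 replicas would be reported as an effective batch size of 128, mentioning both factors.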
TF 2.5 only.
Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, georgep, markf, vladimirm, christiana
Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, christiana
Subscribers: vladimirm
Maniphest Tasks: T56300
Differential Revision: https://phabricator.sourcevertex.net/D61155