
Commit 8cbb17b

caandewiel authored and georgepaw committed
Update documentation regarding Keras API update
Summary: TF2.5 Only
Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jamiep, alexc, jackh
Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jamiep, alexc, jackh
Subscribers: georgep, jackh, alfiee, grahamh, alexc, jamiep
Maniphest Tasks: T56523
Differential Revision: https://phabricator.sourcevertex.net/D62197
1 parent f6f9304 commit 8cbb17b

File tree

4 files changed: 46 additions, 13 deletions


tensorflow/compiler/plugin/poplar/docs/api-changes.rst

Lines changed: 8 additions & 0 deletions
@@ -14,6 +14,14 @@ ________________
 
 These will require changes to any code that uses them.
 
+IPU Keras changes
+'''''''''''''''''
+
+- The argument ``steps_per_execution`` in ``model.compile()`` now reflects
+  the number of steps to process per execution *per replica*, whereas
+  previously it reflected the number of steps to process per execution for
+  all replicas combined.
+
 Removal of deprecated APIs
 ''''''''''''''''''''''''''
 
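For code written against the old semantics, the usual migration is to divide the previous combined value by the replication factor. A minimal sketch, assuming a hypothetical replication factor of 4 and a hypothetical old value of 400:

import tensorflow as tf

num_replicas = 4               # assumed replication factor
old_steps_per_execution = 400  # old semantics: steps for all replicas combined

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.RMSprop(),
    # New semantics: steps processed per execution *per replica*, so the old
    # combined value is divided by the number of replicas.
    steps_per_execution=old_steps_per_execution // num_replicas,  # 100
)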

tensorflow/compiler/plugin/poplar/docs/keras_tf2.rst

Lines changed: 13 additions & 6 deletions
@@ -24,10 +24,18 @@ inside the scope of an ``IPUStrategy``:
 
 Using steps_per_execution
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To reduce Python overhead and maximize the performance of your model, pass in
-the ``steps_per_execution`` argument to the compile method. This argument sets
-the number of batches to process sequentially in a single execution. You should
-increase this number to improve accelerator utilization.
+To reduce Python overhead and maximize the performance of your model, pass the
+``steps_per_execution`` argument to the compile method. This argument sets the
+number of batches processed sequentially by one replica in a single execution,
+which can greatly improve performance because any overhead between steps is
+removed, thus increasing IPU utilization.
+
+Ideally, ``steps_per_execution`` is equal to the number of steps your model
+needs to run per replica in order to complete one epoch. Note that it is not
+possible to fetch intermediate results within an execution: model weights are
+read on the Python host only after all steps have been executed on the IPU.
+If you need to access model weights during an epoch (for example, to save a
+checkpoint), you must set ``steps_per_execution`` accordingly.
 
 .. note::
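For example, to keep a per-epoch checkpoint callback working, ``steps_per_execution`` can be set to exactly one epoch's worth of steps, so weights return to the host at every epoch boundary. A minimal sketch, with assumed dataset and batch sizes:

import tensorflow as tf

dataset_size = 60000  # assumed number of training samples
batch_size = 32       # assumed batch size
steps_per_epoch = dataset_size // batch_size  # 1875 steps per replica

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.RMSprop(),
    # One execution per epoch: weights are readable on the host at every
    # epoch boundary; use a smaller value to read them more often.
    steps_per_execution=steps_per_epoch,
)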

@@ -69,8 +77,7 @@ for more details.
 
 When using data-parallelism, the ``steps_per_execution`` value the model was
 compiled with must be an integer multiple of
-``gradient_accumulation_steps_per_replica`` multiplied by the number of
-replicas in the model. Data parallelism is discussed in
+``gradient_accumulation_steps_per_replica``. Data parallelism is discussed in
 :numref:`automatic-data-parallelism`.
 
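A quick sanity check of this constraint, with hypothetical values:

gradient_accumulation_steps_per_replica = 8  # hypothetical
steps_per_execution = 64                     # hypothetical

# 64 is an integer multiple of 8, so this combination is valid; a value such
# as 60 would not be.
assert steps_per_execution % gradient_accumulation_steps_per_replica == 0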

tensorflow/compiler/plugin/poplar/docs/keras_tf2_example2.py

Lines changed: 6 additions & 4 deletions
@@ -29,7 +29,7 @@ def create_dataset():
   train_ds = train_ds.map(lambda d, l:
                           (tf.cast(d, tf.float32), tf.cast(l, tf.int32)))
 
-  return train_ds.repeat().prefetch(16)
+  return train_ds.prefetch(16)
 
 
 dataset = create_dataset()
@@ -45,8 +45,10 @@ def create_dataset():
     loss=tf.keras.losses.SparseCategoricalCrossentropy(),
     optimizer=tf.keras.optimizers.RMSprop(),
     metrics=["accuracy"],
-    # Anything between 2 and `steps_per_epoch` could help here.
-    steps_per_execution=50,
+    # Anything between 2 and the length of the dataset would work, but the
+    # greater `steps_per_execution`, the greater the performance gains.
+    steps_per_execution=dataset.cardinality(),
 )
 
-model.fit(dataset, epochs=2, steps_per_epoch=100)
+model.fit(dataset, epochs=2)
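The removal of ``repeat()`` matters here: ``dataset.cardinality()`` only yields a usable value for a finite dataset. A minimal sketch, using a hypothetical toy dataset:

import tensorflow as tf

# Hypothetical toy dataset: 100 samples in batches of 4.
dataset = tf.data.Dataset.from_tensor_slices(tf.zeros([100, 8])).batch(4)
print(int(dataset.cardinality()))  # 25 batches per epoch

# With `repeat()` the dataset becomes infinite, and its cardinality can no
# longer be passed as `steps_per_execution`.
repeated = dataset.repeat()
assert repeated.cardinality() == tf.data.experimental.INFINITE_CARDINALITY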

tensorflow/compiler/plugin/poplar/docs/keras_tf2_example3.py

Lines changed: 19 additions & 3 deletions
@@ -29,7 +29,7 @@ def create_dataset():
   train_ds = train_ds.map(lambda d, l:
                           (tf.cast(d, tf.float32), tf.cast(l, tf.int32)))
 
-  return train_ds.repeat().prefetch(16)
+  return train_ds.prefetch(16)
 
 
 dataset = create_dataset()
@@ -40,15 +40,31 @@ def create_dataset():
 # Create a Keras model inside the strategy.
 model = create_model()
 
+# `steps_per_execution` must be divisible by
+# `gradient_accumulation_steps_per_replica`. Say we want to accumulate 10
+# steps before doing a weight update; then we end up with these values.
+gradient_accumulation_steps_per_replica = 10
+number_of_accumulated_steps = (dataset.cardinality() //
+                               gradient_accumulation_steps_per_replica)
+
+# To get a valid `steps_per_execution` value, multiply
+# `number_of_accumulated_steps` by `gradient_accumulation_steps_per_replica`.
+steps_per_execution = (number_of_accumulated_steps *
+                       gradient_accumulation_steps_per_replica)
+
+# Truncate the dataset so Keras will not try to take more data from the
+# dataset than is available.
+dataset = dataset.take(steps_per_execution)
+
 # Compile the model for training.
 model.compile(
     loss=tf.keras.losses.SparseCategoricalCrossentropy(),
     optimizer=tf.keras.optimizers.RMSprop(),
     metrics=["accuracy"],
-    steps_per_execution=50,
+    steps_per_execution=steps_per_execution,
 )
 
 model.set_gradient_accumulation_options(
     gradient_accumulation_steps_per_replica=10)
 
-model.fit(dataset, epochs=2, steps_per_epoch=100)
+model.fit(dataset, epochs=2)
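With assumed concrete numbers, the arithmetic above works out as follows (1875 batches per epoch would correspond to, for example, 60000 samples at batch size 32):

gradient_accumulation_steps_per_replica = 10
batches_per_epoch = 1875  # assumed dataset cardinality

number_of_accumulated_steps = (batches_per_epoch //
                               gradient_accumulation_steps_per_replica)  # 187
steps_per_execution = (number_of_accumulated_steps *
                       gradient_accumulation_steps_per_replica)  # 1870

# The 5 leftover batches are dropped by `dataset.take(1870)`, so every
# execution contains a whole number of weight updates.
assert steps_per_execution % gradient_accumulation_steps_per_replica == 0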
