Commit ff3c30a
IPU Multi Replica Strategy support for Keras API in TF2.4
Summary: TF2.4 Only

Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, babakk, jackh

Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, babakk, jackh

Subscribers: hakons, georgep, babakk

Maniphest Tasks: T46807

Differential Revision: https://phabricator.sourcevertex.net/D52073
1 parent: b4c53f0

File tree

7 files changed: +218 -56 lines

tensorflow/compiler/plugin/poplar/BUILD

Lines changed: 13 additions & 0 deletions
@@ -4253,6 +4253,18 @@ tf_xla_py_test(
     ],
 )
 
+poprun_py_test(
+    name = "distributed_tf2_test",
+    size = "large",
+    srcs = ["tests/distributed_tf2_test.py"],
+    main = "tests/distributed_tf2_test.py",
+    num_instances = 2,
+    num_replicas = 4,
+    deps = [
+        "//tensorflow/python/ipu:ipu_lib",
+    ],
+)
+
 xla_test(
     name = "replicated_resource_update_elementwise_clustering_hw_test",
     srcs = ["tests/replicated_resource_update_elementwise_clustering_hw_test.cc"],

@@ -5771,6 +5783,7 @@ test_suite(
         "device_connection_test",
         "distributed_batch_norm_decomposer_test",
         "distributed_batch_norm_test",
+        "distributed_tf2_test",
         "dump_poplar_info",
         "dynamic_slice_layout_test",
         "dynamic_slice_test",
tensorflow/compiler/plugin/poplar/tests/distributed_tf2_test.py

Lines changed: 92 additions & 0 deletions (new file)

@@ -0,0 +1,92 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np

import popdist
import popdist.tensorflow

import tensorflow as tf
from tensorflow.python import ipu
from tensorflow.python.framework import constant_op, test_util
from tensorflow.python.ipu.horovod import ipu_multi_replica_strategy
from tensorflow.python.platform import test
from tensorflow.python.ipu import horovod as hvd


class DistributedTF2Test(test_util.TensorFlowTestCase):
  def assert_all_instances_equal(self, local_value, name=None):
    """Assert that the current instance has the same value as the root instance."""
    local_tensor = constant_op.constant(local_value)
    root_tensor = hvd.broadcast(local_tensor, root_rank=0)
    np.testing.assert_equal(local_value, root_tensor.numpy(), name)

  def test_tf2_distributed(self):
    config = ipu.config.IPUConfig()
    popdist.tensorflow.set_ipu_config(config, ipus_per_replica=1)
    config.configure_ipu_system()

    hvd.init()

    strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

    def generator():
      for _ in range(100):
        yield np.random.rand(32, 32, 1), np.random.randint(1, 10, size=1)

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=((32, 32, 1), (1,)),
    )

    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy =\
        tf.data.experimental.AutoShardPolicy.OFF
    dataset = dataset.with_options(options)

    dataset = dataset.shard(num_shards=popdist.getNumInstances(),
                            index=popdist.getInstanceIndex())
    dataset = dataset.batch(10, drop_remainder=True)

    with strategy.scope():
      model = tf.keras.models.Sequential([
          tf.keras.layers.Conv2D(32, 3, activation='relu'),
          tf.keras.layers.MaxPooling2D(),
          tf.keras.layers.Conv2D(32, 3, activation='relu'),
          tf.keras.layers.MaxPooling2D(),
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(32, activation='relu'),
          tf.keras.layers.Dense(10),
      ])

      optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
      loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

      model.compile(optimizer=optimizer,
                    loss=loss_fn,
                    steps_per_execution=popdist.getNumTotalReplicas())
      history = model.fit(dataset,
                          steps_per_epoch=popdist.getNumTotalReplicas(),
                          epochs=1)

    # Make sure the losses and weights are identical as we reduce over all IPUs
    self.assert_all_instances_equal(history.history['loss'])

    for v in model.trainable_variables:
      self.assert_all_instances_equal(v)


if __name__ == "__main__":
  test.main()
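
The assert_all_instances_equal helper is worth calling out: broadcasting a value from rank 0 and comparing it with the local copy is a cheap cross-instance sync check, and any divergence fails on the instance that drifted. A standalone sketch of the same pattern, using the same Horovod wrappers the test imports:

    import numpy as np
    from tensorflow.python.framework import constant_op
    from tensorflow.python.ipu import horovod as hvd

    def assert_in_sync(local_value):
      # Rank 0's value is broadcast to every instance; each instance then
      # compares its own copy against the broadcast result.
      local_tensor = constant_op.constant(local_value)
      root_tensor = hvd.broadcast(local_tensor, root_rank=0)
      np.testing.assert_equal(local_value, root_tensor.numpy())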

tensorflow/python/ipu/BUILD

Lines changed: 1 addition & 0 deletions
@@ -144,6 +144,7 @@ py_library(
         "keras/optimizers/ipu_wrappers.py",
         "keras/optimizers/map_gradient_optimizer.py",
         "keras/pipeline.py",
+        "keras_extensions.py",
         "loops.py",
         "ops/all_to_all_op.py",
         "ops/cross_replica_ops.py",

tensorflow/python/ipu/horovod/ipu_multi_replica_strategy.py

Lines changed: 19 additions & 6 deletions
@@ -17,11 +17,15 @@
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
-from tensorflow.python.distribute.cluster_resolver import cluster_resolver as cluster_resolver_lib
+from tensorflow.python.distribute.cluster_resolver import \
+    cluster_resolver as cluster_resolver_lib
 from tensorflow.python.framework import device as tf_device
-from tensorflow.python.ipu import utils as ipu_utils
-from tensorflow.python.ipu.horovod import Sum, Average, size, rank, allreduce as hvd_allreduce, broadcast as hvd_broadcast
-from tensorflow.python.ipu.ipu_multi_worker_strategy import IPUMultiWorkerExtendedV1
+from tensorflow.python.ipu import keras_extensions
+from tensorflow.python.ipu.horovod import Sum, Average, size, rank, \
+    allreduce as hvd_allreduce, \
+    broadcast as hvd_broadcast
+from tensorflow.python.ipu.ipu_multi_worker_strategy import \
+    IPUMultiWorkerExtendedV1
 from tensorflow.python.ipu.ops import cross_replica_ops
 from tensorflow.python.training import server_lib

@@ -40,7 +44,8 @@ def _is_current_device_ipu():
   return current_device.device_type == "IPU"
 
 
-class IPUMultiReplicaStrategyV1(distribute_lib.StrategyV1):
+class IPUMultiReplicaStrategyV1(distribute_lib.StrategyV1,
+                                keras_extensions.KerasExtensions):
   """This is a distribution strategy for multi-replica distribution
   that uses compiled communications with GCL for reductions over IPU
   links and gateway links, while using Horovod for broadcasting of

@@ -55,7 +60,9 @@ class IPUMultiReplicaStrategyV1(distribute_lib.StrategyV1):
 
   def __init__(self,
                ipu_device="/device:IPU:0",
-               add_ipu_cross_replica_reductions=True):
+               add_ipu_cross_replica_reductions=True,
+               enable_dataset_iterators=True,
+               enable_keras_extensions=True):
     # We create an empty cluster here since we will not be using gRPC for communication.
     # All the communication is delegated to either GCL or Horovod (MPI) below.
     cluster_resolver = cluster_resolver_lib.SimpleClusterResolver(

@@ -64,6 +71,8 @@ def __init__(self,
     super().__init__(
         IPUMultiReplicaExtendedV1(self, cluster_resolver, ipu_device,
                                   add_ipu_cross_replica_reductions))
+    keras_extensions.KerasExtensions.__init__(self, enable_dataset_iterators,
+                                              enable_keras_extensions)
 
   def update_ipu_config(self, config):
     """Update the given IPU configuration with the multi-replica

@@ -89,6 +98,10 @@ def __init__(self, container_strategy, cluster_resolver, ipu_device,
     self._num_workers = size()
     self._add_ipu_cross_replica_reductions = add_ipu_cross_replica_reductions
 
+  def non_slot_devices(self, var_list):
+    del var_list
+    return self._ipu_device
+
   def _reduce_to(self, reduce_op, value, destinations, options):
     del destinations
     del options
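
With this change the strategy gains two Keras-related switches. A minimal construction sketch (the values shown are the defaults introduced by the diff):

    from tensorflow.python.ipu.horovod import ipu_multi_replica_strategy

    # The new keyword arguments gate the behaviour pulled in through the
    # KerasExtensions mixin; both default to True.
    strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategyV1(
        ipu_device="/device:IPU:0",
        add_ipu_cross_replica_reductions=True,
        enable_dataset_iterators=True,
        enable_keras_extensions=True)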

tensorflow/python/ipu/ipu_multi_worker_strategy.py

Lines changed: 14 additions & 4 deletions
@@ -31,6 +31,7 @@
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import tf_contextlib

@@ -432,10 +433,19 @@ def _batch_reduce_to(self, reduce_op, value_destination_pairs,
                            options)
 
   def _call_for_each_replica(self, fn, args, kwargs):
-    with distribute_lib.ReplicaContext(
-        self._container_strategy(), replica_id_in_sync_group=0), \
-        ops.device(self._ipu_device):
-      return fn(*args, **kwargs)
+    with distribute_lib.ReplicaContext(self._container_strategy(),
+                                       replica_id_in_sync_group=0), ops.device(
+                                           self._ipu_device):
+      # Make sure it is compiled as a single engine when called in graph mode.
+      # This is similar to the mechanism used by xla.compile.
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      try:
+        xla_context.Enter()
+        outputs = fn(*args, **kwargs)
+      finally:
+        xla_context.Exit()
+
+      return outputs
 
   def _validate_colocate_with_variable(self, colocate_with_variable):
     if colocate_with_variable.device != self._variable_device:
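
The Enter/try/finally/Exit discipline above matters: if fn raises, the XLA control-flow context must still be popped, otherwise later graph construction would silently be captured into the cluster. The same pattern in isolation, as a sketch using the API this diff imports:

    from tensorflow.python.ops import control_flow_ops

    def run_as_single_engine(fn, *args, **kwargs):
      # In graph mode, ops created by fn inside this context are compiled
      # as one engine, similar to the mechanism used by xla.compile. The
      # finally block guarantees the context is exited even if fn raises.
      xla_context = control_flow_ops.XLAControlFlowContext()
      try:
        xla_context.Enter()
        return fn(*args, **kwargs)
      finally:
        xla_context.Exit()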

tensorflow/python/ipu/ipu_strategy.py

Lines changed: 5 additions & 46 deletions
@@ -40,11 +40,13 @@
 from tensorflow.python.ipu.keras.extensions import functional_extensions
 from tensorflow.python.ipu.keras.extensions import sequential_extensions
 from tensorflow.python.ipu import ipu_infeed_queue
+from tensorflow.python.ipu import keras_extensions
 
 _pvti_trace_channel = libpvti.createTraceChannel("TensorFlow")
 
 
-class IPUStrategyV1(distribute_lib.StrategyV1):
+class IPUStrategyV1(distribute_lib.StrategyV1,
+                    keras_extensions.KerasExtensions):
   """This is a distribution strategy for targeting a system with one
   or more IPUs.

@@ -78,9 +80,6 @@ class IPUStrategyV1(distribute_lib.StrategyV1):
 
 
   """
-
-  _enable_legacy_iterators = True
-
   def __init__(self,
                ipu_device="/device:IPU:0",
                cpu_device="/device:CPU:0",

@@ -98,15 +97,8 @@ def __init__(self,
       to improve Keras performance when using IPUs.
     """
     super().__init__(IPUExtendedV1(self, ipu_device, cpu_device))
-    self._enable_iterators = enable_dataset_iterators
-    self._enable_keras_extensions = enable_keras_extensions
-    self._keras_extensions = OrderedDict()
-    # Insert Sequential before Functional as Sequential models inherit from
-    # Functional models.
-    self._register_keras_extension(sequential.Sequential,
-                                   sequential_extensions.SequentialExtension)
-    self._register_keras_extension(functional.Functional,
-                                   functional_extensions.FunctionalExtension)
+    keras_extensions.KerasExtensions.__init__(self, enable_dataset_iterators,
+                                              enable_keras_extensions)
 
   @libpvti.instrument_fn(_pvti_trace_channel)
   def run(self, fn, args=(), kwargs=None, options=None):

@@ -123,39 +115,6 @@ def _device_ordinal(self):
     current_device = tf_device.DeviceSpec.from_string(device_string)
     return current_device.device_index
 
-  def _enable_dataset_iterators(self):
-    return context.executing_eagerly() and self._enable_iterators
-
-  def _create_dataset_iterator(self, dataset):
-    assert self._enable_dataset_iterators()
-    return ipu_infeed_queue.IPUOwnedIterator(dataset=dataset)  # pylint: disable=protected-access
-
-  def _register_keras_extension(self, class_type, extension):
-    self._keras_extensions[class_type] = extension
-
-  def _delete_keras_extension(self, class_type):
-    self._keras_extensions.pop(class_type, None)
-
-  def _patch_keras_extension(self, instance):
-    if not self._enable_keras_extensions:
-      return
-
-    for class_type, extension in self._keras_extensions.items():
-      if isinstance(instance, class_type):
-        if isinstance(instance, base_layer.KerasExtension):
-          if not isinstance(instance, extension):
-            raise RuntimeError(
-                "KerasExtension patching failed - already patched with a "
-                "different extension.")
-          break
-
-        # Patch in the extension.
-        # Note that we keep the name as Keras sometimes does __name__ checks.
-        cls = instance.__class__
-        instance.__class__ = cls.__class__(cls.__name__, (cls, extension), {})
-        extension.__init__(instance)
-        break
-
   @property
   def supports_loss_scaling(self):
     return True
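
One reason the diff calls KerasExtensions.__init__ explicitly instead of relying on super(): distribute_lib.StrategyV1.__init__ does not chain cooperatively to the next class in the MRO. A stripped-down sketch of the pattern with illustrative stand-in classes (not the real API):

    class KerasExtensionsMixin:
      def __init__(self, enable_dataset_iterators=True,
                   enable_keras_extensions=True):
        self._enable_iterators = enable_dataset_iterators
        self._enable_keras_extensions = enable_keras_extensions

    class StrategyBase:  # stands in for distribute_lib.StrategyV1
      def __init__(self, extended):
        self._extended = extended  # does not call super().__init__()

    class MyStrategy(StrategyBase, KerasExtensionsMixin):
      def __init__(self, extended):
        super().__init__(extended)  # reaches StrategyBase only
        # The mixin is therefore initialized explicitly, as in the diff.
        KerasExtensionsMixin.__init__(self, True, True)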
tensorflow/python/ipu/keras_extensions.py

Lines changed: 74 additions & 0 deletions (new file)

@@ -0,0 +1,74 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from collections import OrderedDict

from tensorflow.python.eager import context
from tensorflow.python.ipu import ipu_infeed_queue
from tensorflow.python.keras.engine import base_layer
from tensorflow.python.ipu.keras.extensions import functional_extensions
from tensorflow.python.ipu.keras.extensions import sequential_extensions
from tensorflow.python.keras.engine import functional
from tensorflow.python.keras.engine import sequential


class KerasExtensions:
  _enable_legacy_iterators = True

  def __init__(self,
               enable_dataset_iterators=True,
               enable_keras_extensions=True):
    self._enable_iterators = enable_dataset_iterators
    self._enable_keras_extensions = enable_keras_extensions
    self._keras_extensions = OrderedDict()

    # Insert Sequential before Functional as Sequential models inherit from
    # Functional models.
    self._register_keras_extension(sequential.Sequential,
                                   sequential_extensions.SequentialExtension)
    self._register_keras_extension(functional.Functional,
                                   functional_extensions.FunctionalExtension)

  def _enable_dataset_iterators(self):
    return context.executing_eagerly() and self._enable_iterators

  def _create_dataset_iterator(self, dataset):
    assert self._enable_dataset_iterators()
    return ipu_infeed_queue.IPUOwnedIterator(dataset=dataset)  # pylint: disable=protected-access

  def _register_keras_extension(self, class_type, extension):
    self._keras_extensions[class_type] = extension

  def _delete_keras_extension(self, class_type):
    self._keras_extensions.pop(class_type, None)

  def _patch_keras_extension(self, instance):
    if not self._enable_keras_extensions:
      return

    for class_type, extension in self._keras_extensions.items():
      if isinstance(instance, class_type):
        if isinstance(instance, base_layer.KerasExtension):
          if not isinstance(instance, extension):
            raise RuntimeError(
                "KerasExtension patching failed - already patched with a "
                "different extension.")
          break

        # Patch in the extension.
        # Note that we keep the name as Keras sometimes does __name__ checks.
        cls = instance.__class__
        instance.__class__ = cls.__class__(cls.__name__, (cls, extension), {})
        extension.__init__(instance)
        break
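
The dynamic patch in _patch_keras_extension deserves a note: cls.__class__(cls.__name__, (cls, extension), {}) invokes the metaclass as a three-argument type constructor, building a new class with the original name, and reassigning instance.__class__ mixes the extension into an already-constructed object. A self-contained sketch of the mechanism with toy classes (not the real Keras types):

    class Model:  # stands in for a Keras model class
      def fit(self):
        return "fit"

    class Extension:  # stands in for SequentialExtension etc.
      def fit_with_infeed(self):
        # Extension methods can call straight back into the instance.
        return "infeed + " + self.fit()

    m = Model()
    cls = m.__class__
    # type(cls)(name, bases, dict) builds the patched class through the same
    # metaclass; keeping cls.__name__ preserves Keras's __name__ checks.
    m.__class__ = cls.__class__(cls.__name__, (cls, Extension), {})

    assert type(m).__name__ == "Model"
    assert m.fit_with_infeed() == "infeed + fit"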
