
Commit e77e6d5

caandewiel authored and Frederik Mellbye committed

Add PopDistAllGather to host_collective_ops

Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jakeh
Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jakeh
Subscribers: jakeh
Maniphest Tasks: T64481
Differential Revision: https://phabricator.sourcevertex.net/D74652

1 parent bda9700 · commit e77e6d5

File tree: 9 files changed, +254 −29 lines changed

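For orientation before the per-file diffs: the user-facing entry point added by this commit is host_collective_ops.all_gather. Below is a minimal usage sketch mirroring the new test_all_gather test; it assumes the script runs under a multi-instance PopDist environment (e.g. launched with Graphcore's poprun), which is not shown here.

import popdist

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ipu.distributed import host_collective_ops

popdist.init()

# Each instance contributes its own index...
x = constant_op.constant(popdist.getInstanceIndex(), dtype=dtypes.float32)

# ...and receives every instance's value, stacked along a new leading axis.
gathered = host_collective_ops.all_gather(x)
assert gathered.shape == (popdist.getNumInstances(),)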

tensorflow/compiler/plugin/poplar/kernels/popdist/BUILD

Lines changed: 18 additions & 0 deletions
@@ -5,6 +5,23 @@ package(default_visibility = [
     "//tensorflow/python/ipu:__subpackages__",
 ])
 
+poplar_cc_library(
+    name = "all_gather",
+    srcs = [
+        "all_gather.cc",
+    ],
+    deps = [
+        "//tensorflow/compiler/plugin/poplar/driver/tools:poplar_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+        "//third_party/eigen3",
+        "@local_config_poplar//poplar:poplar_libs",
+    ],
+    alwayslink = True,
+)
+
 poplar_cc_library(
     name = "all_reduce",
     srcs = [
@@ -42,6 +59,7 @@ poplar_cc_library(
 poplar_cc_library(
     name = "popdist",
     deps = [
+        ":all_gather",
         ":all_reduce",
         ":broadcast",
     ],
tensorflow/compiler/plugin/poplar/kernels/popdist/all_gather.cc

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
1+
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
#include "tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h"
16+
#include "tensorflow/core/framework/op_kernel.h"
17+
#include "tensorflow/core/framework/op_requires.h"
18+
#include "tensorflow/core/framework/register_types.h"
19+
#include "tensorflow/core/kernels/inplace_ops_functor.h"
20+
21+
#include <popdist/backend.hpp>
22+
#include <popdist/collectives.hpp>
23+
#include <popdist/context.hpp>
24+
25+
namespace poplar {
26+
template <>
27+
struct equivalent_device_type<Eigen::half> {
28+
const Type& value = HALF;
29+
};
30+
} // namespace poplar
31+
32+
namespace tensorflow {
33+
template <typename T>
34+
class PopDistAllGatherOp : public AsyncOpKernel {
35+
public:
36+
explicit PopDistAllGatherOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
37+
OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
38+
}
39+
40+
void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
41+
auto& input = ctx->input(0);
42+
Tensor* output;
43+
44+
auto output_shape = input.shape();
45+
output_shape.InsertDim(0, popdist::getNumInstances());
46+
47+
OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, output_shape, &output),
48+
done);
49+
50+
Env::Default()->SchedClosure([input = &input, output, ctx, done, this] {
51+
OP_REQUIRES_OK_ASYNC(
52+
ctx,
53+
xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>(
54+
[&input, &output, &ctx, &done, this] {
55+
popdist::collectives::parallel::allGather(
56+
input->flat<T>().data(), output->flat<T>().data(),
57+
input->NumElements(),
58+
poplar::equivalent_device_type<T>().value,
59+
this->tensor_name_);
60+
61+
done();
62+
}),
63+
done);
64+
});
65+
}
66+
67+
private:
68+
TF_DISALLOW_COPY_AND_ASSIGN(PopDistAllGatherOp);
69+
std::string tensor_name_;
70+
}; // namespace tensorflow
71+
72+
#define REGISTER_CPU(T) \
73+
REGISTER_KERNEL_BUILDER( \
74+
Name("PopdistAllGather").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
75+
PopDistAllGatherOp<T>);
76+
77+
TF_CALL_INTEGRAL_TYPES(REGISTER_CPU);
78+
TF_CALL_half(REGISTER_CPU);
79+
TF_CALL_float(REGISTER_CPU);
80+
#undef REGISTER_CPU
81+
} // namespace tensorflow
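The kernel above allocates the output by prepending an instance dimension (output_shape.InsertDim(0, popdist::getNumInstances())) and runs popdist::collectives::parallel::allGather off the main thread via SchedClosure. A rough numpy emulation of the resulting shape semantics (an illustration only, not the PopDist implementation):

import numpy as np

def emulated_all_gather(per_instance_tensors):
  # Each instance contributes one tensor of identical shape and dtype; the
  # result stacks them along a new leading "instance" dimension, matching
  # output_shape.InsertDim(0, getNumInstances()) in the kernel.
  return np.stack(per_instance_tensors, axis=0)

# Two instances, each contributing a shape-(2,) tensor -> shape (2, 2).
print(emulated_all_gather([np.array([0, 0]), np.array([1, 1])]))
# [[0 0]
#  [1 1]]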

tensorflow/compiler/plugin/poplar/kernels/popdist/all_reduce.cc

Lines changed: 17 additions & 15 deletions
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <future>
-
 #include "tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
@@ -53,24 +51,28 @@ class PopDistAllReduceOp : public AsyncOpKernel {
 
     auto* flattened_buffer = output->flat<T>().data();
 
-    auto future = std::async(std::launch::async, [&] {
+    Env::Default()->SchedClosure([flattened_buffer, ctx, done, this] {
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>([&] {
-            popdist::collectives::parallel::allReduceSum(
-                flattened_buffer, input.NumElements(),
-                poplar::equivalent_device_type<T>().value, this->tensor_name_);
+          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>(
+              [&flattened_buffer, &ctx, &done, this] {
+                const auto num_elements = ctx->input(0).NumElements();
+
+                popdist::collectives::parallel::allReduceSum(
+                    flattened_buffer, num_elements,
+                    poplar::equivalent_device_type<T>().value,
+                    this->tensor_name_);
 
-            const auto num_instances = popdist::getNumInstances();
+                const auto num_instances = popdist::getNumInstances();
 
-            if (this->reduce_op_ == "MEAN") {
-              for (auto i = 0; i < input.NumElements(); ++i) {
-                *(flattened_buffer + i) /= static_cast<T>(num_instances);
-              }
-            }
+                if (this->reduce_op_ == "MEAN") {
+                  for (auto i = 0; i < num_elements; ++i) {
+                    *(flattened_buffer + i) /= static_cast<T>(num_instances);
+                  }
+                }
 
-            done();
-          }),
+                done();
+              }),
           done);
     });
   }
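The restructured all-reduce keeps its MEAN handling: the kernel implements MEAN on top of allReduceSum, summing across instances and then dividing each element by getNumInstances(). In numpy terms, roughly:

import numpy as np

def emulated_all_reduce_mean(per_instance_tensors):
  # allReduceSum across instances...
  total = np.sum(np.stack(per_instance_tensors, axis=0), axis=0)
  # ...followed by the element-wise divide the kernel performs for MEAN.
  return total / len(per_instance_tensors)

print(emulated_all_reduce_mean([np.array([0.0, 2.0]), np.array([2.0, 4.0])]))
# [1. 3.]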

tensorflow/compiler/plugin/poplar/kernels/popdist/broadcast.cc

Lines changed: 9 additions & 9 deletions
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <future>
-
 #include "tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
@@ -50,16 +48,18 @@ class PopDistBroadcastOp : public AsyncOpKernel {
         tensorflow::functor::DoCopy(ctx->eigen_cpu_device(), input, output),
         done);
 
-    auto future = std::async(std::launch::async, [&] {
+    Env::Default()->SchedClosure([output, ctx, done, this] {
       OP_REQUIRES_OK_ASYNC(
          ctx,
-          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>([&] {
-            popdist::collectives::parallel::broadcast(
-                output->flat<T>().data(), output->NumElements(),
-                poplar::equivalent_device_type<T>().value, this->tensor_name_);
+          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>(
+              [&output, &ctx, &done, this] {
+                popdist::collectives::parallel::broadcast(
+                    output->flat<T>().data(), output->NumElements(),
+                    poplar::equivalent_device_type<T>().value,
+                    this->tensor_name_);
 
-            done();
-          }),
+                done();
+              }),
           done);
     });
   }
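The broadcast kernel gets the same SchedClosure treatment. For reference, its semantics as exercised by test_broadcast in the test diff below: the root instance's value (root_rank 0 by default) overwrites the tensor on every other instance. A minimal sketch, reusing the imports from the first example above:

# Hypothetical two-instance run: instance 0 holds 42, the other holds 0.
x = constant_op.constant(42 if popdist.getInstanceIndex() == 0 else 0,
                         dtype=dtypes.int32)

# After the broadcast, every instance observes the root's value.
y = host_collective_ops.broadcast(x)  # == 42 on all instances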

tensorflow/compiler/plugin/poplar/ops/BUILD

Lines changed: 1 addition & 2 deletions
@@ -312,8 +312,7 @@ poplar_cc_library(
         "//tensorflow/compiler/plugin/poplar/ops/datastream:host_embedding",
         "//tensorflow/compiler/plugin/poplar/ops/functional",
         "//tensorflow/compiler/plugin/poplar/ops/functional:pipelining",
-        "//tensorflow/compiler/plugin/poplar/ops/popdist:all_reduce",
-        "//tensorflow/compiler/plugin/poplar/ops/popdist:broadcast",
+        "//tensorflow/compiler/plugin/poplar/ops/popdist:ops",
         "//tensorflow/compiler/plugin/poplar/ops/popfloat:cast_to_gfloat",
         "//tensorflow/compiler/plugin/poplar/ops/popnn:ctc_loss",
         "//tensorflow/compiler/plugin/poplar/ops/popnn:gelu",

tensorflow/compiler/plugin/poplar/ops/popdist/BUILD

Lines changed: 12 additions & 0 deletions
@@ -4,6 +4,17 @@ licenses(["restricted"])
 
 package(default_visibility = ["//tensorflow/compiler/plugin/poplar:__subpackages__"])
 
+poplar_cc_library(
+    name = "all_gather",
+    srcs = [
+        "all_gather.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = True,
+)
+
 poplar_cc_library(
     name = "all_reduce",
     srcs = [
@@ -30,6 +41,7 @@ poplar_cc_library(
     name = "ops",
     srcs = [],
     deps = [
+        ":all_gather",
         ":all_reduce",
         ":broadcast",
     ],
tensorflow/compiler/plugin/poplar/ops/popdist/all_gather.cc

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
1+
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
#include "tensorflow/core/framework/common_shape_fns.h"
16+
#include "tensorflow/core/framework/op.h"
17+
18+
namespace tensorflow {
19+
REGISTER_OP("PopdistAllGather")
20+
.Attr("T: numbertype")
21+
.Input("tensor: T")
22+
.Attr("tensor_name: string")
23+
.Output("sum: T")
24+
.SetShapeFn(shape_inference::UnknownShape);
25+
} // namespace tensorflow
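Since the op is registered with shape_inference::UnknownShape, graph-mode consumers see no static shape for the gathered result; the true shape ([num_instances] + input shape) only materializes at runtime. A small sketch of what that means in practice (illustrative, not part of the commit):

from tensorflow.python.eager import def_function

@def_function.function
def gather_in_graph_mode(x):
  gathered = host_collective_ops.all_gather(x)
  # With UnknownShape as the shape function, the static shape is unknown
  # while tracing; it is only concrete once the op actually runs.
  print(gathered.shape)  # <unknown>
  return gathered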

tensorflow/python/ipu/distributed/host_collective_ops.py

Lines changed: 12 additions & 3 deletions
@@ -23,13 +23,13 @@ def _normalize_name(name):
   return re.sub('[^a-zA-Z0-9_]', '_', name)
 
 
-def broadcast(value, root_rank=0, tensor_name=None):
+def all_gather(value, tensor_name=None):
   if not tensor_name and not context.executing_eagerly():
-    tensor_name = "PopDistBroadcast_{}".format(_normalize_name(value.name))
+    tensor_name = "PopDistAllGather_{}".format(_normalize_name(value.name))
   else:
     tensor_name = "Default"
 
-  return gen_popdist_ops.popdist_broadcast(value, tensor_name=tensor_name)
+  return gen_popdist_ops.popdist_all_gather(value, tensor_name=tensor_name)
 
 
 def all_reduce(value, reduce_op, tensor_name=None):
@@ -41,3 +41,12 @@ def all_reduce(value, reduce_op, tensor_name=None):
   return gen_popdist_ops.popdist_all_reduce(value,
                                             reduce_op=reduce_op.value,
                                             tensor_name=tensor_name)
+
+
+def broadcast(value, root_rank=0, tensor_name=None):
+  if not tensor_name and not context.executing_eagerly():
+    tensor_name = "PopDistBroadcast_{}".format(_normalize_name(value.name))
+  else:
+    tensor_name = "Default"
+
+  return gen_popdist_ops.popdist_broadcast(value, tensor_name=tensor_name)
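The tensor_name attribute is what lets instances pair up collectives even when op ordering differs between them (see test_all_gather_different_order below). In graph mode, a missing name is derived from the value's graph name via _normalize_name; in eager mode it falls back to "Default". A small illustration of the normalization, using the regex from the module above:

import re

def _normalize_name(name):
  # From host_collective_ops.py: squash anything outside [a-zA-Z0-9_].
  return re.sub('[^a-zA-Z0-9_]', '_', name)

print(_normalize_name("my_scope/weights:0"))  # my_scope_weights_0
# The derived collective name would then be
# "PopDistAllGather_my_scope_weights_0".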

tensorflow/python/ipu/distributed/host_collective_ops_test.py

Lines changed: 79 additions & 0 deletions
@@ -39,6 +39,14 @@ class HostCollectiveOpsTest(test_util.TensorFlowTestCase,
   def setUpClass(cls):
     popdist.init()
 
+  @parameterized.named_parameters(*TESTCASES)
+  def test_all_gather(self, dtype):
+    x = constant_op.constant(popdist.getInstanceIndex(), dtype=dtype)
+    self.assertAllEqual(
+        host_collective_ops.all_gather(x),
+        np.array([i for i in range(popdist.getNumInstances())],
+                 dtype=dtype.as_numpy_dtype))
+
   @parameterized.named_parameters(*TESTCASES)
   def test_all_reduce_sum(self, dtype):
     x = constant_op.constant(popdist.getInstanceIndex(), dtype=dtype)
@@ -60,6 +68,44 @@ def test_broadcast(self, dtype):
         dtype=dtype)
     self.assertAllEqual(host_collective_ops.broadcast(x), 42)
 
+  def test_all_gather_different_order(self):
+    # Call collective on `x` first and `y` afterwards.
+    @def_function.function()
+    def body_instance_even(x, y):
+      res_x = host_collective_ops.all_gather(x)
+      res_y = host_collective_ops.all_gather(y)
+
+      return (res_x, res_y)
+
+    # Call collective on `y` first and `x` afterwards.
+    @def_function.function()
+    def body_instance_odd(x, y):
+      res_y = host_collective_ops.all_gather(y)
+      res_x = host_collective_ops.all_gather(x)
+
+      return (res_x, res_y)
+
+    x = constant_op.constant(popdist.getInstanceIndex(), dtype=dtypes.float32)
+    y = constant_op.constant(
+        [popdist.getInstanceIndex(),
+         popdist.getInstanceIndex()],
+        dtype=dtypes.int32)
+
+    is_even = popdist.getInstanceIndex() % 2 == 0
+
+    # Test that we can call collectives in any order as long as our tensors have names.
+    (res_x,
+     res_y) = body_instance_even(x, y) if is_even else body_instance_odd(x, y)
+
+    self.assertAllEqual(
+        res_x,
+        np.array([i for i in range(popdist.getNumInstances())],
+                 dtype=np.float32))
+    self.assertAllEqual(
+        res_y,
+        np.array([[i, i] for i in range(popdist.getNumInstances())],
+                 dtype=np.float32))
+
   def test_all_reduce_different_order(self):
     # Call collective on `x` first and `y` afterwards.
     @def_function.function()
@@ -124,6 +170,39 @@ def body_instance_odd(x, y):
     self.assertAllEqual(res_x, 42)
     self.assertAllEqual(res_y, [42, 42])
 
+  def test_all_gather_different_dtype(self):
+    dtype = dtypes.float32 if popdist.getInstanceIndex(
+    ) % 2 == 0 else dtypes.int32
+    x = constant_op.constant(popdist.getInstanceIndex(), dtype=dtype)
+
+    try:
+      host_collective_ops.all_gather(x)
+    except errors.UnknownError as e:
+      self.assertAllEqual(
+          True,
+          "Tensor layouts did not match on all instances" in e.message,
+      )
+
+      return
+
+    self.fail()
+
+  def test_all_gather_different_shape(self):
+    value = 1 if popdist.getInstanceIndex() % 2 == 0 else [1, 1]
+    x = constant_op.constant(value, dtype=dtypes.int32)
+
+    try:
+      host_collective_ops.all_gather(x)
+    except errors.UnknownError as e:
+      self.assertAllEqual(
+          True,
+          "Tensor layouts did not match on all instances" in e.message,
+      )
+
+      return
+
+    self.fail()
+
   def test_all_reduce_different_dtype(self):
     dtype = dtypes.float32 if popdist.getInstanceIndex(
     ) % 2 == 0 else dtypes.int32