Commit 15f0d4a

Add host_collective_ops to TF replacing Horovod
Summary: Add host collective ops. Refactor of PopDist ops.
Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, samuelh
Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, samuelh
Subscribers: samuelh
Maniphest Tasks: T64481, T66961
Differential Revision: https://phabricator.sourcevertex.net/D73950
1 parent 86803ca commit 15f0d4a

File tree: 14 files changed, +384 / -93 lines

tensorflow/compiler/plugin/poplar/driver/popit_backend/popit_executor.cc

Lines changed: 4 additions & 4 deletions

@@ -229,7 +229,7 @@ Status PopItExecutor::Memset32(se::Stream* stream,
 }
 bool PopItExecutor::Memcpy(se::Stream* stream, void* host_dst,
                            const se::DeviceMemoryBase& src, uint64 size) {
-  return RunPoplarFunction([&] {
+  return RunPoplarFunction<poplar::poplar_error>([&] {
     return popitCopyToHost(
         static_cast<const PopItSubBuffer*>(src.opaque())->GetDevicePtr(),
         static_cast<char*>(host_dst));
@@ -238,7 +238,7 @@ bool PopItExecutor::Memcpy(se::Stream* stream, void* host_dst,
 }
 bool PopItExecutor::Memcpy(se::Stream* stream, se::DeviceMemoryBase* dst,
                            const void* host_src, uint64 size) {
-  return RunPoplarFunction([&] {
+  return RunPoplarFunction<poplar::poplar_error>([&] {
     return popitCopyFromHost(
         static_cast<const char*>(host_src),
         static_cast<PopItSubBuffer*>(dst->opaque())->GetDevicePtr());
@@ -249,7 +249,7 @@ bool PopItExecutor::MemcpyDeviceToDevice(se::Stream* stream,
                                          se::DeviceMemoryBase* dst,
                                          const se::DeviceMemoryBase& src,
                                          uint64 size) {
-  return RunPoplarFunction([&] {
+  return RunPoplarFunction<poplar::poplar_error>([&] {
     return popitCopy(
         static_cast<const PopItSubBuffer*>(src.opaque())->GetDevicePtr(),
         static_cast<PopItSubBuffer*>(dst->opaque())->GetDevicePtr())
@@ -260,7 +260,7 @@ bool PopItExecutor::HostCallback(se::Stream* stream,
                                  std::function<void()> callback) {
   // For now sync and then callback, we should aim to make this async
   // though
-  return RunPoplarFunction([&] {
+  return RunPoplarFunction<poplar::poplar_error>([&] {
     popitSync(session_.get());
     callback();
   })

tensorflow/compiler/plugin/poplar/driver/popit_backend/popit_memory.h

Lines changed: 0 additions & 28 deletions

@@ -65,34 +65,6 @@ struct PopItSubBuffer {
   PopItSubBuffer(popitMem_t* mem, int64_t size)
       : PopItSubBuffer(PopItBufferType(mem, PopItDeallocator()), 0, size) {}
 };
-
-template <class T>
-using StatusType = typename std::conditional<std::is_same<T, void>::value,
-                                             Status, StatusOr<T>>::type;
-
-template <typename F, typename... Args>
-using DeducedReturn = StatusType<typename std::result_of<F(Args...)>::type>;
-
-Status ConvertError(const std::exception& e) {
-  return PoplarExceptionToTensorflowStatus("", e);
-}
-
-// Function that runs a poplar function and converts any errors to
-// status/statusor<T>
-template <typename F, typename... Args>
-DeducedReturn<F, Args...> RunPoplarFunction(F f, Args&&... args) {
-  try {
-    if constexpr (std::is_same<DeducedReturn<F, Args...>, Status>::value) {
-      f(std::forward<Args>(args)...);
-      return Status::OK();
-    } else {
-      return f(std::forward<Args>(args)...);
-    }
-  } catch (const poplar::poplar_error& e) {
-    return ConvertError(e);
-  }
-}
-
 }  // namespace poplarplugin
 }  // namespace xla

tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.cc

Lines changed: 4 additions & 0 deletions

@@ -1042,5 +1042,9 @@ void CheckPoplarPackageHash() {
 }
 }
 }
+
+Status ConvertError(const std::exception& e) {
+  return PoplarExceptionToTensorflowStatus("", e);
+}
 }  // namespace poplarplugin
 }  // namespace xla

tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h

Lines changed: 29 additions & 2 deletions

@@ -19,6 +19,10 @@ limitations under the License.
  * These functions are related to poplar, and cannot be used within the
  * optimizers target in the BUILD file.
  */
+#include <string>
+#include <utility>
+#include <vector>
+
 #include <gcl/Collectives.hpp>
 #include <poplar/Program.hpp>
 #include <poplar/exceptions.hpp>
@@ -27,8 +31,6 @@ limitations under the License.
 #include <popnn/Pooling.hpp>
 #include <popops/Expr.hpp>
 #include <poputil/exceptions.hpp>
-#include <string>
-#include <vector>

 #include "absl/container/inlined_vector.h"
 #include "absl/types/optional.h"
@@ -282,6 +284,31 @@ bool HasIOTiles(CompilerResources& res);
 int64_t GetNumIPUs(CompilerResources& res);

 void CheckPoplarPackageHash();
+
+template <class T>
+using StatusType = typename std::conditional<std::is_same<T, void>::value,
+                                             Status, StatusOr<T>>::type;
+
+template <typename F, typename... Args>
+using DeducedReturn = StatusType<typename std::result_of<F(Args...)>::type>;
+
+Status ConvertError(const std::exception& e);
+
+// Function that runs a poplar function and converts any errors to
+// status/statusor<T>
+template <typename E, typename F, typename... Args>
+DeducedReturn<F, Args...> RunPoplarFunction(F f, Args&&... args) {
+  try {
+    if constexpr (std::is_same<DeducedReturn<F, Args...>, Status>::value) {
+      f(std::forward<Args>(args)...);
+      return Status::OK();
+    } else {
+      return f(std::forward<Args>(args)...);
+    }
+  } catch (const E& e) {
+    return ConvertError(e);
+  }
+}
 }  // namespace poplarplugin
 }  // namespace xla
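Note: the helper that moved here from popit_memory.h now takes the exception type to translate as an explicit template parameter, so the same wrapper can convert poplar::poplar_error in the PopIt executor and popdist::popdist_error in the new host collective kernels. The following is a self-contained sketch of that pattern with toy Status/StatusOr stand-ins and std::runtime_error instead of the Poplar/TensorFlow types; it compiles on its own and is illustrative only, not code from this commit.

// Illustrative analogue of RunPoplarFunction<E> (not part of the commit):
// run a callable, deduce Status vs StatusOr<T> from its return type, and
// translate exceptions of type E into an error status.
#include <iostream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>

struct Status {
  std::string message;  // empty means OK
  static Status OK() { return {}; }
};

template <class T>
struct StatusOr {
  Status status;
  T value{};
};

template <class T>
using StatusType = typename std::conditional<std::is_same<T, void>::value,
                                             Status, StatusOr<T>>::type;

template <typename F, typename... Args>
using DeducedReturn = StatusType<typename std::result_of<F(Args...)>::type>;

// E is the exception type to translate, mirroring the templated catch clause
// added to RunPoplarFunction in this commit.
template <typename E, typename F, typename... Args>
DeducedReturn<F, Args...> RunAndConvert(F f, Args&&... args) {
  try {
    if constexpr (std::is_same<DeducedReturn<F, Args...>, Status>::value) {
      f(std::forward<Args>(args)...);
      return Status::OK();
    } else {
      return {Status::OK(), f(std::forward<Args>(args)...)};
    }
  } catch (const E& e) {
    if constexpr (std::is_same<DeducedReturn<F, Args...>, Status>::value) {
      return Status{e.what()};  // stand-in for ConvertError(e)
    } else {
      return {Status{e.what()}};
    }
  }
}

int main() {
  // void callable -> Status, with the exception translated to an error.
  Status s = RunAndConvert<std::runtime_error>(
      [] { throw std::runtime_error("copy failed"); });
  std::cout << (s.message.empty() ? "OK" : s.message) << "\n";  // copy failed

  // value-returning callable -> StatusOr<int>.
  StatusOr<int> v = RunAndConvert<std::runtime_error>([] { return 42; });
  std::cout << v.value << "\n";  // 42
}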

tensorflow/compiler/plugin/poplar/kernels/popdist/BUILD

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,7 @@ poplar_cc_library(
         "all_reduce.cc",
     ],
     deps = [
+        "//tensorflow/compiler/plugin/poplar/driver/tools:poplar_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -27,6 +28,7 @@ poplar_cc_library(
         "broadcast.cc",
     ],
     deps = [
+        "//tensorflow/compiler/plugin/poplar/driver/tools:poplar_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",

tensorflow/compiler/plugin/poplar/kernels/popdist/all_reduce.cc

Lines changed: 41 additions & 16 deletions

@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <future>
+
+#include "tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -21,49 +24,71 @@ limitations under the License.
 #include <popdist/collectives.hpp>
 #include <popdist/context.hpp>

+namespace poplar {
+template <>
+struct equivalent_device_type<Eigen::half> {
+  const Type& value = HALF;
+};
+}  // namespace poplar
+
 namespace tensorflow {
 template <typename T>
-class PopDistAllReduceOp : public OpKernel {
+class PopDistAllReduceOp : public AsyncOpKernel {
  public:
-  explicit PopDistAllReduceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit PopDistAllReduceOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_op", &reduce_op_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
   }

-  void Compute(OpKernelContext* ctx) override {
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto& input = ctx->input(0);
     Tensor* output;

-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
-    OP_REQUIRES_OK(ctx, tensorflow::functor::DoCopy(ctx->eigen_cpu_device(),
-                                                    input, output));
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, input.shape(), &output),
+                         done);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        tensorflow::functor::DoCopy(ctx->eigen_cpu_device(), input, output),
+        done);

     auto* flattened_buffer = output->flat<T>().data();

-    popdist::collectives::sequential::allReduceSum(
-        flattened_buffer, input.NumElements(),
-        poplar::equivalent_device_type<T>().value);
+    auto future = std::async(std::launch::async, [&] {
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>([&] {
+            popdist::collectives::parallel::allReduceSum(
+                flattened_buffer, input.NumElements(),
+                poplar::equivalent_device_type<T>().value, this->tensor_name_);

-    const auto num_instances = popdist::getNumInstances();
+            const auto num_instances = popdist::getNumInstances();

-    if (reduce_op_ == "MEAN") {
-      for (auto i = 0; i < input.NumElements(); ++i) {
-        *(flattened_buffer + i) /= num_instances;
-      }
-    }
+            if (this->reduce_op_ == "MEAN") {
+              for (auto i = 0; i < input.NumElements(); ++i) {
+                *(flattened_buffer + i) /= static_cast<T>(num_instances);
+              }
+            }
+
+            done();
+          }),
+          done);
+    });
   }

  private:
  TF_DISALLOW_COPY_AND_ASSIGN(PopDistAllReduceOp);

  std::string reduce_op_;
-};
+  std::string tensor_name_;
+};  // namespace tensorflow

 #define REGISTER_CPU(T)                                                   \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("PopdistAllReduce").Device(DEVICE_CPU).TypeConstraint<T>("T"),  \
      PopDistAllReduceOp<T>);

 TF_CALL_INTEGRAL_TYPES(REGISTER_CPU);
+TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
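Note: the kernel now derives from AsyncOpKernel; the output is allocated and the input copied on the calling thread, the PopDist collective runs inside a std::async task, and done is invoked once the reduction (and the optional MEAN division by the instance count) has finished. Below is a self-contained sketch of that control flow with plain stand-ins for the TensorFlow and PopDist types; FakeAllReduceSum and ComputeAsyncSketch are hypothetical names, not API from this commit.

// Illustrative control-flow sketch only; TensorFlow/PopDist types are replaced
// by simple stand-ins and the collective is a no-op for a single instance.
#include <functional>
#include <future>
#include <iostream>
#include <string>
#include <vector>

using DoneCallback = std::function<void()>;

// Hypothetical stand-in for popdist::collectives::parallel::allReduceSum.
void FakeAllReduceSum(float* /*data*/, std::size_t /*n*/) {}

std::future<void> ComputeAsyncSketch(const std::vector<float>& input,
                                     std::vector<float>* output,
                                     const std::string& reduce_op,
                                     int num_instances, DoneCallback done) {
  *output = input;  // mirrors allocate_output + DoCopy onto the output tensor
  // Run the collective off the calling thread; the caller keeps the future
  // alive (a std::async future joins in its destructor).
  return std::async(std::launch::async,
                    [output, reduce_op, num_instances, done] {
                      FakeAllReduceSum(output->data(), output->size());
                      if (reduce_op == "MEAN") {
                        for (auto& v : *output) {
                          v /= static_cast<float>(num_instances);
                        }
                      }
                      done();  // report completion after the reduce finishes
                    });
}

int main() {
  std::vector<float> in{2.f, 4.f, 6.f}, out;
  auto pending = ComputeAsyncSketch(in, &out, "MEAN", /*num_instances=*/2,
                                    [] { std::cout << "done\n"; });
  pending.wait();
  for (float v : out) std::cout << v << " ";  // prints: 1 2 3
  std::cout << "\n";
}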

tensorflow/compiler/plugin/poplar/kernels/popdist/broadcast.cc

Lines changed: 37 additions & 10 deletions

@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <future>
+
+#include "tensorflow/compiler/plugin/poplar/driver/tools/poplar_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -21,34 +24,58 @@ limitations under the License.
 #include <popdist/collectives.hpp>
 #include <popdist/context.hpp>

+namespace poplar {
+template <>
+struct equivalent_device_type<Eigen::half> {
+  const Type& value = HALF;
+};
+}  // namespace poplar
+
 namespace tensorflow {
 template <typename T>
-class PopDistBroadcastOp : public OpKernel {
+class PopDistBroadcastOp : public AsyncOpKernel {
  public:
-  explicit PopDistBroadcastOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit PopDistBroadcastOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
+  }

-  void Compute(OpKernelContext* ctx) override {
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto& input = ctx->input(0);
     Tensor* output;

-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
-    OP_REQUIRES_OK(ctx, tensorflow::functor::DoCopy(ctx->eigen_cpu_device(),
-                                                    input, output));
-    popdist::collectives::sequential::broadcast(
-        output->flat<T>().data(), output->NumElements(),
-        poplar::equivalent_device_type<T>().value);
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, input.shape(), &output),
+                         done);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        tensorflow::functor::DoCopy(ctx->eigen_cpu_device(), input, output),
+        done);
+
+    auto future = std::async(std::launch::async, [&] {
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          xla::poplarplugin::RunPoplarFunction<popdist::popdist_error>([&] {
+            popdist::collectives::parallel::broadcast(
+                output->flat<T>().data(), output->NumElements(),
+                poplar::equivalent_device_type<T>().value, this->tensor_name_);
+
+            done();
+          }),
+          done);
+    });
   }

  private:
  TF_DISALLOW_COPY_AND_ASSIGN(PopDistBroadcastOp);
-};
+  std::string tensor_name_;
+};  // namespace tensorflow

 #define REGISTER_CPU(T)                                                  \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("PopdistBroadcast").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      PopDistBroadcastOp<T>);

 TF_CALL_INTEGRAL_TYPES(REGISTER_CPU);
+TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
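Note: both kernels now also register a half-precision variant (TF_CALL_half), which is why they specialize poplar::equivalent_device_type for Eigen::half at the top of the file. The snippet below is a self-contained toy version of that trait pattern, mapping a host element type to a device-type value; the enum and the Half struct are stand-ins, not the Poplar API.

// Toy illustration of the equivalent_device_type specialization pattern
// (stand-in types only; poplar::Type and Eigen::half are not used here).
#include <iostream>

enum class DeviceType { HALF, FLOAT, INT };

template <typename T>
struct equivalent_device_type;  // primary template: no mapping by default

template <>
struct equivalent_device_type<float> {
  DeviceType value = DeviceType::FLOAT;
};

struct Half {};  // stand-in for Eigen::half

// Mirrors the specialization added in the diff: give the half type a mapping
// so kernels templated on it can ask for the matching device type.
template <>
struct equivalent_device_type<Half> {
  DeviceType value = DeviceType::HALF;
};

int main() {
  std::cout << static_cast<int>(equivalent_device_type<float>().value) << "\n";  // 1
  std::cout << static_cast<int>(equivalent_device_type<Half>().value) << "\n";   // 0
}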

tensorflow/compiler/plugin/poplar/ops/popdist/all_reduce.cc

Lines changed: 2 additions & 7 deletions

@@ -20,12 +20,7 @@ REGISTER_OP("PopdistAllReduce")
     .Attr("T: numbertype")
     .Input("tensor: T")
     .Attr("reduce_op: string")
+    .Attr("tensor_name: string")
     .Output("sum: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle output;
-      TF_RETURN_IF_ERROR(
-          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
-      c->set_output(0, output);
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 }  // namespace tensorflow
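Note: shape_inference::UnchangedShape forwards the input shape to the output, which matches an all-reduce (the tensor keeps its shape across instances), replacing the removed lambda that cleared the leading dimension. Written out in the style of the removed code, the equivalent shape function would look roughly like the sketch below (illustrative, not from the commit):

.SetShapeFn([](shape_inference::InferenceContext* c) {
  c->set_output(0, c->input(0));  // output shape is identical to the input shape
  return Status::OK();
});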

tensorflow/compiler/plugin/poplar/ops/popdist/broadcast.cc

Lines changed: 2 additions & 11 deletions

@@ -19,16 +19,7 @@ namespace tensorflow {
 REGISTER_OP("PopdistBroadcast")
     .Attr("T: numbertype")
     .Input("tensor: T")
+    .Attr("tensor_name: string")
     .Output("sum: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle output;
-
-      if (c->Rank(c->input(0)) > 0) {
-        TF_RETURN_IF_ERROR(
-            c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
-      }
-
-      c->set_output(0, output);
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 }  // namespace tensorflow
