Skip to content

Commit d21d1bf

Browse files
committed
Change inputs of f8_convert ops from a tuple to simple inputs.
Summary: Using tuples was breaking pipelining as the ops were being assigned the wrong sharding. Removing tuple inputs removes this issue and makes the code more consistent with matmul and conv f8 ops. Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, alfiee Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, alfiee Maniphest Tasks: T71684 Differential Revision: https://phabricator.sourcevertex.net/D78466
1 parent b5f4972 commit d21d1bf

File tree

7 files changed

+54
-97
lines changed

7 files changed

+54
-97
lines changed

tensorflow/compiler/plugin/poplar/driver/ops/custom_ops/popops/f8_convert.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@ class ConvertToF8Op : public PoplarOpDef {
3535
const poplar::DebugContext& debug_context) override {
3636
PoplarOpDefDebugInfo debug_info(debug_context, "Fp8Convert");
3737
DriverProgramSequence seq(debug_info);
38-
auto inputs = FindInstructionInputs(tensor_map, res, inst, 0, seq,
39-
debug_info, /*expand_aliasing=*/true);
40-
CHECK_EQ(inputs.size(), 2);
41-
DriverTensor input = inputs[0].AsTensor();
42-
DriverTensor input_metadata = inputs[1].AsTensor();
38+
TF_ASSIGN_OR_RETURN(
39+
auto input, FindInstructionInput(tensor_map, res, inst, 0, seq,
40+
debug_info, /*expand_aliasing=*/true));
41+
TF_ASSIGN_OR_RETURN(
42+
auto input_metadata,
43+
FindInstructionInput(tensor_map, res, inst, 1, seq, debug_info,
44+
/*expand_aliasing=*/true));
4345
// We can't reinterpret to either the QUARTER_METADATA or the QUARTER type.
4446
// Instead, clone them and copy raw unsigned char data over.
4547
// This copy will be elided by poplar.

tensorflow/compiler/plugin/poplar/driver/tensor.cc

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,38 +1419,33 @@ StatusOr<DriverTensor> FindF8InstructionInput(
14191419
const poplar::DebugNameAndId& debug_name_and_id, bool expand_aliasing) {
14201420
const HloInstruction* operand = inst->operand(input);
14211421

1422-
TensorOrRemoteBufferVector inputs = GetTensorsMaybeExpand(
1423-
map, res, operand, seq, expand_aliasing, debug_name_and_id, 0, 2);
1424-
1425-
if (inputs.size() == 0) {
1426-
return tensorflow::errors::Unknown(
1427-
StrCat("[Poplar] Couldn't find input ", input, " for ", inst->name()));
1428-
}
1422+
TF_ASSIGN_OR_RETURN(
1423+
auto u8_data,
1424+
FindInstructionInput(map, res, inst, 0, seq, debug_name_and_id,
1425+
/*expand_aliasing=*/true));
1426+
// return u8_data;  // NOTE(review): leftover debug short-circuit — should be removed before landing
1427+
TF_ASSIGN_OR_RETURN(
1428+
auto u8_metadata,
1429+
FindInstructionInput(map, res, inst, 1, seq, debug_name_and_id,
1430+
/*expand_aliasing=*/true));
14291431

1430-
CHECK_EQ(inputs.size(), 2);
14311432
auto& graph =
14321433
GetGraphWithOutputIndex(res, operand, /*flattened_output_tuple_index=*/0);
1433-
CHECK(&graph == &GetGraphWithOutputIndex(res, operand,
1434-
/*flattened_output_tuple_index=*/1));
1435-
poplar::Graph& poplar_graph = graph;
1436-
14371434
// We can't reinterpret to either the QUARTER_METADATA or the QUARTER type.
14381435
// Instead, clone them and copy raw unsigned char data over.
14391436
// Those copies will be elided by poplar.
14401437

1441-
DriverTensor u8_data = inputs[0].AsTensor();
1442-
DriverTensor u8_metadata = inputs[1].AsTensor();
1443-
auto f8_metadata = poplar_graph.clone(
1438+
auto f8_metadata = graph.clone(
14441439
poplar::QUARTER_METADATA, u8_metadata.reshape({1}), debug_name_and_id,
14451440
poplar::TensorCloneMethod::PRESERVE_ORDER_AND_ALIASES);
1446-
auto f8_data = poplar_graph.clone(
1447-
poplar::QUARTER, f8_metadata, u8_data, debug_name_and_id,
1448-
poplar::TensorCloneMethod::PRESERVE_ORDER_AND_ALIASES);
1441+
auto f8_data =
1442+
graph.clone(poplar::QUARTER, f8_metadata, u8_data, debug_name_and_id,
1443+
poplar::TensorCloneMethod::PRESERVE_ORDER_AND_ALIASES);
14491444
seq.add(poplar::program::Copy(
14501445
u8_metadata, f8_metadata.reinterpret(poplar::UNSIGNED_CHAR)));
14511446
seq.add(poplar::program::Copy(u8_data,
14521447
f8_data.reinterpret(poplar::UNSIGNED_CHAR)));
1453-
return DriverTensor(f8_data);
1448+
return f8_data;
14541449
}
14551450

14561451
TensorOrRemoteBufferVector FindInstructionInputs(

tensorflow/compiler/plugin/poplar/driver/tools/custom_ops/f8_convert.cc

Lines changed: 13 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -23,83 +23,46 @@ namespace xla {
2323
namespace poplarplugin {
2424

2525
HloConvertFromF8Instruction::HloConvertFromF8Instruction(
26-
const Shape& shape, HloInstruction* operand)
27-
: HloF8ConvertInstruction(shape, operand) {
28-
CHECK_EQ(shape, GetShape(operand));
29-
}
30-
31-
Shape HloConvertFromF8Instruction::GetShape(const HloInstruction* operand) {
32-
// Result shape is f16[<input-dimensions>].
33-
const Shape& op_shape = operand->shape();
34-
CHECK(op_shape.IsTuple());
35-
36-
// Expect data to be in U8.
37-
const Shape& input_shape = op_shape.tuple_shapes(0);
38-
CHECK(input_shape.element_type() == U8);
39-
40-
const Shape& metadata_shape = op_shape.tuple_shapes(1);
41-
CHECK(metadata_shape.element_type() == U8);
42-
CHECK(ShapeUtil::IsScalar(metadata_shape));
43-
44-
// The only supported type now is F16.
45-
return ShapeUtil::MakeShape(F16, input_shape.dimensions());
46-
}
26+
const Shape& shape, HloInstruction* data, HloInstruction* metadata)
27+
: HloF8ConvertInstruction(shape, {data, metadata}) {}
4728

4829
std::unique_ptr<HloInstruction>
4930
HloConvertFromF8Instruction::CloneWithNewOperandsImpl(
5031
const Shape& shape, absl::Span<HloInstruction* const> new_operands,
5132
HloCloneContext*) const {
52-
CHECK_EQ(new_operands.size(), 1);
33+
CHECK_EQ(new_operands.size(), 2);
5334
return std::unique_ptr<HloInstruction>(
54-
new HloConvertFromF8Instruction(shape, new_operands[0]));
35+
new HloConvertFromF8Instruction(shape, new_operands[0], new_operands[1]));
5536
}
5637

5738
HloConvertToF8Instruction::HloConvertToF8Instruction(const Shape& shape,
58-
HloInstruction* operand)
59-
: HloF8ConvertInstruction(shape, operand) {
60-
CHECK(ShapeUtil::Compatible(shape, GetShape(operand)));
61-
}
62-
63-
Shape HloConvertToF8Instruction::GetShape(const HloInstruction* operand) {
64-
// Result shape is (u8[<input-dimensions>], u8 metadata).
65-
const Shape& op_shape = operand->shape();
66-
CHECK(op_shape.IsTuple());
67-
68-
// The only supported type now is F16.
69-
const Shape& input_shape = op_shape.tuple_shapes(0);
70-
CHECK(input_shape.element_type() == F16);
71-
72-
const Shape& metadata_shape = op_shape.tuple_shapes(1);
73-
CHECK(metadata_shape.element_type() == U8);
74-
CHECK(ShapeUtil::IsScalar(metadata_shape));
75-
76-
return ShapeUtil::MakeTupleShape(
77-
{ShapeUtil::MakeShape(U8, input_shape.dimensions()), metadata_shape});
78-
}
39+
HloInstruction* data,
40+
HloInstruction* metadata)
41+
: HloF8ConvertInstruction(shape, {data, metadata}) {}
7942

8043
std::unique_ptr<HloInstruction>
8144
HloConvertToF8Instruction::CloneWithNewOperandsImpl(
8245
const Shape& shape, absl::Span<HloInstruction* const> new_operands,
8346
HloCloneContext*) const {
84-
CHECK_EQ(new_operands.size(), 1);
47+
CHECK_EQ(new_operands.size(), 2);
8548
return std::unique_ptr<HloInstruction>(
86-
new HloConvertToF8Instruction(shape, new_operands[0]));
49+
new HloConvertToF8Instruction(shape, new_operands[0], new_operands[1]));
8750
}
8851

8952
namespace {
9053
StatusOr<std::unique_ptr<HloInstruction>>
9154
HloConvertFromF8InstructionFactoryFunc(HloCustomCallInstruction* call) {
92-
return std::unique_ptr<HloInstruction>(
93-
new HloConvertFromF8Instruction(call->shape(), call->mutable_operand(0)));
55+
return std::unique_ptr<HloInstruction>(new HloConvertFromF8Instruction(
56+
call->shape(), call->mutable_operand(0), call->mutable_operand(1)));
9457
}
9558

9659
static HloPoplarInstructionFactory fp8_convert_from_factory(
9760
PoplarOp::ConvertFromF8, HloConvertFromF8InstructionFactoryFunc);
9861

9962
StatusOr<std::unique_ptr<HloInstruction>> HloConvertToF8InstructionFactoryFunc(
10063
HloCustomCallInstruction* call) {
101-
return std::unique_ptr<HloInstruction>(
102-
new HloConvertToF8Instruction(call->shape(), call->mutable_operand(0)));
64+
return std::unique_ptr<HloInstruction>(new HloConvertToF8Instruction(
65+
call->shape(), call->mutable_operand(0), call->mutable_operand(1)));
10366
}
10467

10568
static HloPoplarInstructionFactory fp8_convert_to_factory(

tensorflow/compiler/plugin/poplar/driver/tools/custom_ops/f8_convert.h

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ namespace poplarplugin {
3131
template <PoplarOp Op>
3232
class HloF8ConvertInstruction : public HloPoplarInstruction {
3333
public:
34-
HloF8ConvertInstruction(const Shape& shape, HloInstruction* operand)
35-
: HloPoplarInstruction(shape, {operand}, Op) {}
34+
HloF8ConvertInstruction(const Shape& shape,
35+
absl::Span<HloInstruction* const> operands)
36+
: HloPoplarInstruction(shape, operands, Op) {}
3637

3738
absl::flat_hash_set<int64_t> AllocatingIndices() const override { return {}; }
3839
bool AllocatingOutput() const override { return false; }
@@ -67,13 +68,10 @@ class HloF8ConvertInstruction : public HloPoplarInstruction {
6768
class HloConvertFromF8Instruction
6869
: public HloF8ConvertInstruction<PoplarOp::ConvertFromF8> {
6970
public:
70-
HloConvertFromF8Instruction(const Shape& shape, HloInstruction* operand);
71-
explicit HloConvertFromF8Instruction(HloInstruction* operand)
72-
: HloConvertFromF8Instruction(GetShape(operand), operand) {}
71+
HloConvertFromF8Instruction(const Shape& shape, HloInstruction* data,
72+
HloInstruction* metadata);
7373

7474
private:
75-
Shape GetShape(const HloInstruction* operand);
76-
7775
std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
7876
const Shape& shape, absl::Span<HloInstruction* const>,
7977
HloCloneContext*) const override;
@@ -84,13 +82,10 @@ std::unique_ptr<HloInstruction> CreateConvertToF8Instruction(
8482
class HloConvertToF8Instruction
8583
: public HloF8ConvertInstruction<PoplarOp::ConvertToF8> {
8684
public:
87-
HloConvertToF8Instruction(const Shape& shape, HloInstruction* operand);
88-
explicit HloConvertToF8Instruction(HloInstruction* operand)
89-
: HloConvertToF8Instruction(GetShape(operand), operand) {}
85+
HloConvertToF8Instruction(const Shape& shape, HloInstruction* data,
86+
HloInstruction* metadata);
9087

9188
private:
92-
Shape GetShape(const HloInstruction* operand);
93-
9489
std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
9590
const Shape& shape, absl::Span<HloInstruction* const>,
9691
HloCloneContext*) const override;

tensorflow/compiler/plugin/poplar/kernels/popops/f8_convert.cc

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,9 @@ class IpuConvertToF8Op : public XlaOpKernel, public IpuOpKernel {
120120
xla::Shape output_shape = xla::ShapeUtil::MakeTupleShape(
121121
{output_data_shape, output_metadata_shape});
122122

123-
auto packed_input = xla::Tuple(b, {input_data, input_metadata});
124-
auto output_tuple =
125-
xla::CustomCall(b, PoplarOp_Name(PoplarOp::ConvertToF8), {packed_input},
126-
{output_shape}, attribute_map_.Serialise());
123+
auto output_tuple = xla::CustomCall(
124+
b, PoplarOp_Name(PoplarOp::ConvertToF8), {input_data, input_metadata},
125+
output_shape, attribute_map_.Serialise());
127126

128127
ctx->SetOutput(0, xla::GetTupleElement(output_tuple, 0));
129128
ctx->SetOutput(1, xla::GetTupleElement(output_tuple, 1));
@@ -155,10 +154,9 @@ class IpuConvertFromF8Op : public XlaOpKernel, public IpuOpKernel {
155154
xla::Shape output_shape;
156155
OP_REQUIRES_OK(
157156
ctx, TensorShapeToXLAShape(DT_HALF, ctx->InputShape(0), &output_shape));
158-
xla::XlaOp input = xla::Tuple(b, {input_data, input_metadata});
159-
auto output =
160-
xla::CustomCall(b, PoplarOp_Name(PoplarOp::ConvertFromF8), {input},
161-
{output_shape}, attribute_map_.Serialise());
157+
auto output = xla::CustomCall(b, PoplarOp_Name(PoplarOp::ConvertFromF8),
158+
{input_data, input_metadata}, output_shape,
159+
attribute_map_.Serialise());
162160

163161
ctx->SetOutput(0, output);
164162
}

tensorflow/compiler/plugin/poplar/kernels/popops/fp8_ops.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ class PoplinF8ConvOp : public XlaOpKernel, IpuOpKernel {
143143

144144
OP_REQUIRES(
145145
ctx, op_type != PoplarOp::Unknown,
146-
xla::InvalidArgument("Unsupported F8 Convolution Dimension ", D));
146+
xla::InvalidArgument("Unsupported F8 Convolution Dimension %d", D));
147147

148148
auto call_output =
149149
xla::CustomCall(ctx->builder(), PoplarOp_Name(op_type), args, out_shape,

tensorflow/compiler/plugin/poplar/tests/f8_test.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,12 @@ TEST_F(Fp8Test, TestConvert) {
3535
3636
ENTRY main {
3737
input = (f16[2,2], u8[]) parameter(0)
38-
input.fp8 = (u8[2,2], u8[]) custom-call(input), custom_call_target="ConvertToF8"
39-
input.fp = f16[2,2] custom-call(input.fp8), custom_call_target="ConvertFromF8"
38+
input.1 = f16[2,2] get-tuple-element(input), index=0
39+
input.2 = u8[] get-tuple-element(input), index=1
40+
input.fp8 = (u8[2,2], u8[]) custom-call(input.1, input.2), custom_call_target="ConvertToF8"
41+
input.fp8.1 = u8[2,2] get-tuple-element(input.fp8), index=0
42+
input.fp8.2 = u8[] get-tuple-element(input.fp8), index=1
43+
input.fp = f16[2,2] custom-call(input.fp8.1, input.fp8.2), custom_call_target="ConvertFromF8"
4044
ROOT root = ((f16[2,2], u8[]), (u8[2,2], u8[]), f16[2,2]) tuple(input, input.fp8, input.fp)
4145
}
4246
)";

0 commit comments

Comments
 (0)