Skip to content

Commit 4673d3a

Browse files
BabakkGraphcore authored and georgepaw committed
TF2 - Improve ShardingPass handling of after-all tokens
Summary: Updating handling of `after-all` instructions so they explicitly get given default sharding and are not considered for copying to users. This does two things: it stops infeeds/outfeeds using the sharding of their after-all input, and it stops `ProcessComputation` using an after-all instruction as a means of kick-starting the sharding process when it can no longer make progress. An after-all doesn't help propagate sharding information, since its consumers get their sharding from elsewhere, so the next `ProcessComputation` call still doesn't make any progress; by using an after-all to kick-start things we potentially cause a tuple instruction to be prematurely given default sharding.

TF2.5 only. TF1 - D64932

Test Plan: CI + new tests

Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, samuelh, vladimirm

Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, samuelh, vladimirm

Subscribers: vladimirm, samuelh

Maniphest Tasks: T59245

Differential Revision: https://phabricator.sourcevertex.net/D64354
1 parent d324b5f commit 4673d3a

File tree

3 files changed

+170
-2
lines changed

3 files changed

+170
-2
lines changed

tensorflow/compiler/plugin/poplar/driver/passes/sharding_pass.cc

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,10 @@ bool CopyTupleShardingFromOperands(HloInstruction* inst) {
284284
bool CopyShardingFromOperands(HloInstruction* inst) {
285285
for (int o = 0; o < inst->operand_count(); o++) {
286286
auto* operand = inst->operand(o);
287-
if (operand->has_sharding()) {
287+
// We don't want to propagate the sharding of AfterAll tokens. The
288+
// sharding of in/outfeeds should come from their users/outfed data
289+
// respectively.
290+
if (operand->has_sharding() && operand->opcode() != HloOpcode::kAfterAll) {
288291
if (CompatibleShapes(inst->shape(), operand->shape())) {
289292
auto s = GetShardingOfOutputTensor(operand);
290293
SetSharding(inst, s);
@@ -381,9 +384,15 @@ StatusOr<bool> ProcessComputation(HloComputation* comp, int attempt) {
381384
done = true;
382385
bool made_progress = false;
383386
for (auto* inst : comp->MakeInstructionPostOrder()) {
384-
VLOG(3) << "Sharding pass visting instruction " << inst->name();
387+
VLOG(3) << "Attempt " << attempt << ": sharding pass visting instruction "
388+
<< inst->name();
385389
bool added_sharding = false;
386390

391+
if (!inst->has_sharding() && inst->opcode() == HloOpcode::kAfterAll) {
392+
SetSharding(inst, GetDefaultSharding(inst->shape()));
393+
added_sharding = true;
394+
}
395+
387396
// If an instruction has no operands, and no users but the root Tuple,
388397
// then assign default sharding
389398
if (!inst->has_sharding() && inst->operand_count() == 0 &&
@@ -433,6 +442,11 @@ StatusOr<bool> ProcessComputation(HloComputation* comp, int attempt) {
433442
// These are dealt with by the computation level code
434443
break;
435444
}
445+
case HloOpcode::kInfeed: {
446+
// Infeeds should get their sharding from their users
447+
// not their operands.
448+
break;
449+
}
436450
default: {
437451
if (IsPoplarInstruction(PoplarOp::Barrier, inst)) {
438452
added_sharding = CopyTupleShardingFromOperands(inst);

tensorflow/compiler/plugin/poplar/tests/sharding_pass_test.cc

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,102 @@ main {
845845
EXPECT_EQ(shardings[1].GetUniqueDevice(), 1);
846846
}
847847

848+
TEST_F(ShardingPassTest, TestInfeedsWithPartiallySpecifiedSharding) {
849+
// This tests that we can infer a sharding for an infeed when not
850+
// all of the tuple elements returned by the infeed have been sharded.
851+
// In this case we're feeding in two values (f16[4,64,64,3], s32[4])
852+
// but are only sharding the first one. We expect s32[4]
853+
// (get-tuple-element.34) to be assigned default sharding which will let
854+
// the infeed sharding be resolved.
855+
const std::string hlo_string = R"(
856+
HloModule top
857+
858+
main {
859+
after-all.4 = token[] after-all()
860+
inf1 = ((f16[4,64,64,3], s32[4]), token[]) infeed(after-all.4), infeed_config="\022\0011\"\002\023\003(\003"
861+
get-tuple-element.30 = (f16[4,64,64,3], s32[4]) get-tuple-element(inf1), index=0
862+
get-tuple-element.31 = f16[4,64,64,3] get-tuple-element(get-tuple-element.30), index=0
863+
arg_1 = f16[3,64] parameter(0)
864+
dot.4 = f16[4,64,64,64] dot(get-tuple-element.31, arg_1), lhs_contracting_dims={3}, rhs_contracting_dims={0}, sharding={maximal device=1}
865+
arg_2 = f16[64,64] parameter(1)
866+
dot.5 = f16[4,64,64,64] dot(dot.4, arg_2), lhs_contracting_dims={3}, rhs_contracting_dims={0}, sharding={maximal device=1}
867+
get-tuple-element.34 = s32[4] get-tuple-element(get-tuple-element.30), index=1
868+
ROOT tuple.8 = (f16[4,64,64,64], s32[4]) tuple(dot.5, get-tuple-element.34)
869+
}
870+
)";
871+
HloModuleConfig config;
872+
config.set_debug_options(GetDebugOptionsForTest());
873+
874+
auto module_or_status = ParseAndReturnVerifiedModule(hlo_string, config);
875+
EXPECT_TRUE(module_or_status.ok());
876+
877+
auto* module = module_or_status.ValueOrDie().get();
878+
auto* comp = module->entry_computation();
879+
880+
ShardingPass shardingPass;
881+
ASSERT_TRUE(shardingPass.Run(module).ValueOrDie());
882+
883+
auto* inf1 = comp->GetInstructionWithName("inf1");
884+
ASSERT_TRUE(inf1->has_sharding());
885+
ASSERT_TRUE(inf1->sharding().IsTuple());
886+
auto shardings = inf1->sharding().tuple_elements();
887+
ASSERT_TRUE(shardings[0].HasUniqueDevice());
888+
ASSERT_EQ(shardings[0].GetUniqueDevice(), 1);
889+
ASSERT_TRUE(shardings[1].HasUniqueDevice());
890+
ASSERT_EQ(shardings[1].GetUniqueDevice(), 0);
891+
}
892+
893+
TEST_F(ShardingPassTest, TestOutfeedsDontTakeTokenSharding) {
894+
// This tests that outfeeds take sharding from their outfed data
895+
// input, not from their token.
896+
const std::string hlo_string = R"(
897+
HloModule top
898+
899+
main {
900+
arg_1 = f16[3,64] parameter(1)
901+
arg_2 = f16[4,64,64,3] parameter(2)
902+
dot.4 = f16[4,64,64,64] dot(arg_2, arg_1), lhs_contracting_dims={3}, rhs_contracting_dims={0}, sharding={maximal device=0}
903+
arg_3 = f16[64,64] parameter(3)
904+
dot.5 = f16[4,64,64,64] dot(dot.4, arg_3), lhs_contracting_dims={3}, rhs_contracting_dims={0}, sharding={maximal device=1}
905+
arg_4 = s32[4] parameter(4)
906+
outfed_tuple = (f16[4,64,64,64], s32[4]) tuple(dot.5, arg_4)
907+
after-all.5 = token[] after-all()
908+
outfeed = token[] outfeed(outfed_tuple, after-all.5), outfeed_config="\022\0012\"\002\023\003(\003"
909+
arg_0 = s32[] parameter(0)
910+
ROOT tuple.10 = (s32[], f16[3,64], f16[64,64]) tuple(arg_0, arg_1, arg_3)
911+
}
912+
)";
913+
HloModuleConfig config;
914+
config.set_debug_options(GetDebugOptionsForTest());
915+
916+
auto module_or_status = ParseAndReturnVerifiedModule(hlo_string, config);
917+
EXPECT_TRUE(module_or_status.ok());
918+
919+
auto* module = module_or_status.ValueOrDie().get();
920+
auto* comp = module->entry_computation();
921+
922+
ShardingPass shardingPass;
923+
ASSERT_TRUE(shardingPass.Run(module).ValueOrDie());
924+
925+
auto* outfed_tuple = comp->GetInstructionWithName("outfed_tuple");
926+
ASSERT_TRUE(outfed_tuple->has_sharding());
927+
const auto outfed_tuple_sharding = outfed_tuple->sharding();
928+
929+
auto* outfeed = comp->GetInstructionWithName("outfeed");
930+
ASSERT_TRUE(outfeed->has_sharding());
931+
932+
ASSERT_EQ(outfeed->sharding(), outfed_tuple_sharding);
933+
934+
ASSERT_TRUE(outfed_tuple_sharding.IsTuple());
935+
auto shardings = outfed_tuple_sharding.tuple_elements();
936+
937+
ASSERT_EQ(shardings.size(), 2);
938+
ASSERT_TRUE(shardings[0].HasUniqueDevice());
939+
ASSERT_EQ(shardings[0].GetUniqueDevice(), 1);
940+
ASSERT_TRUE(shardings[1].HasUniqueDevice());
941+
ASSERT_EQ(shardings[1].GetUniqueDevice(), 0);
942+
}
943+
848944
TEST_F(ShardingPassTest, TestGteOpsMatchTheirOperands) {
849945
std::string hlo_string = R"(
850946
HloModule top

tensorflow/python/ipu/tests/infeed_outfeed_test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,6 +1475,64 @@ def my_net():
14751475
self.assertAllEqual(np.full([1, 10, 10], 0), out[6])
14761476
self.assertAllEqual(np.full([1, 10, 10], 0), out[7])
14771477

1478+
@test_util.deprecated_graph_mode_only
1479+
def testValidSharding(self):
1480+
# Reproducer from T59245
1481+
ipu_options = ipu.config.IPUConfig()
1482+
ipu_options.auto_select_ipus = 2
1483+
1484+
ipu_options.configure_ipu_system()
1485+
ipu.utils.move_variable_initialization_to_cpu()
1486+
outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
1487+
1488+
images = dataset_ops.Dataset.from_tensors(
1489+
np.ones((64, 64, 3), dtype=np.float16))
1490+
labels = dataset_ops.Dataset.from_tensors(np.ones((), dtype=np.int32))
1491+
tf_dataset = dataset_ops.Dataset.zip(
1492+
(images, labels)).repeat(4).batch(4, drop_remainder=True).repeat()
1493+
1494+
with ops.device('cpu'):
1495+
infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(tf_dataset)
1496+
1497+
def retinanet_validating_loop():
1498+
def body(images, imgIds):
1499+
with variable_scope.variable_scope("MainGraph"):
1500+
with ipu.scopes.ipu_shard(0):
1501+
w1 = variable_scope.get_variable(
1502+
"w1",
1503+
shape=[3, 64],
1504+
dtype=np.float16,
1505+
initializer=init_ops.glorot_uniform_initializer(
1506+
dtype=np.float16))
1507+
y = math_ops.matmul(images, w1)
1508+
1509+
with ipu.scopes.ipu_shard(1):
1510+
w2 = variable_scope.get_variable(
1511+
"w2",
1512+
shape=[64, 64],
1513+
dtype=np.float16,
1514+
initializer=init_ops.glorot_uniform_initializer(
1515+
dtype=np.float16))
1516+
scores = math_ops.matmul(y, w2)
1517+
1518+
out = outfeed_queue.enqueue([scores, imgIds])
1519+
return out
1520+
1521+
return ipu.loops.repeat(128, body, inputs=[], infeed_queue=infeed_queue)
1522+
1523+
with ipu.scopes.ipu_scope('/device:IPU:0'):
1524+
retinanet_validation_step = ipu.ipu_compiler.compile(
1525+
retinanet_validating_loop, inputs=[])
1526+
1527+
session = session_lib.Session()
1528+
session.run(infeed_queue.initializer)
1529+
session.run(variables.global_variables_initializer())
1530+
try:
1531+
# This can throw if the sharding of body is incorrect.
1532+
session.run(retinanet_validation_step)
1533+
except Exception as e: # pylint: disable=broad-except
1534+
self.fail(f"Unexpected exception thrown: {e}")
1535+
14781536
@test_util.run_v2_only
14791537
def testDeduceDevice(self):
14801538
cfg = ipu.config.IPUConfig()

0 commit comments

Comments
 (0)