     {
         "tok_embeddings": RowwiseParallel(
             input_layouts=Replicate(),
+            output_layouts=Shard(1),
         ),
+        "norm": SequenceParallel(),
         "output": ColwiseParallel(
             input_layouts=Shard(1),
             output_layouts=Replicate()
         ),
-        "norm": SequenceParallel(),
-        "layers.0": PrepareModuleInput(
-            input_layouts=(Replicate(), None),
-            desired_input_layouts=(Shard(1), None),
-            use_local_output=True,
-        ),
     }
 )

 for layer_id, transformer_block in enumerate(model.layers):
     layer_tp_plan = {
+        "attention_norm": SequenceParallel(),
         "attention": PrepareModuleInput(
             input_layouts=(Shard(1), None),
             desired_input_layouts=(Replicate(), None),
         ),
         "attention.wq": ColwiseParallel(),
         "attention.wk": ColwiseParallel(),
         "attention.wv": ColwiseParallel(),
         "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
-        "attention_norm": SequenceParallel(),
+        "ffn_norm": SequenceParallel(),
         "feed_forward": PrepareModuleInput(
             input_layouts=(Shard(1),),
             desired_input_layouts=(Replicate(),),
         ),
         "feed_forward.w1": ColwiseParallel(),
         "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)),
         "feed_forward.w3": ColwiseParallel(),
-        "ffn_norm": SequenceParallel(),
     }
|
     # Adjust attention module to use the local number of heads
|
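Context, not part of this diff: plan dicts like the ones above are consumed by torch.distributed.tensor.parallel.parallelize_module together with a tensor-parallel DeviceMesh. A minimal sketch of that call on a toy block; the mesh size (2) and the nn.Sequential module here are assumptions for illustration, not the model under review.

import os
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Sketch only: launch with torchrun, e.g. `torchrun --nproc-per-node=2 tp_sketch.py`.
# The 2-way mesh and the toy two-layer MLP stand in for the diffed Llama model.
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
tp_mesh = init_device_mesh("cuda", (2,))  # 1-D mesh: a single tensor-parallel group

block = nn.Sequential(nn.Linear(256, 1024), nn.ReLU(), nn.Linear(1024, 256))
block = parallelize_module(
    module=block,
    device_mesh=tp_mesh,
    parallelize_plan={
        "0": ColwiseParallel(),  # first Linear: shard column-wise, output stays sharded
        "2": RowwiseParallel(),  # second Linear: shard row-wise, output is replicated
    },
)

The "0"/"2" keys are the submodule names inside the nn.Sequential, mirroring how the diff keys its plans by module path ("attention.wq", "feed_forward.w1", and so on).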