 import torch.nn as nn
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.pipelining import PipelineStage
-
+from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.distributed.pipelining.schedules import (
     _PipelineSchedule,
     _PipelineScheduleRuntime,
@@ -47,7 +47,8 @@ class MmSeparateWeightGrad(torch.autograd.Function):
     @staticmethod
     def forward(ctx, i, w):
         ctx.save_for_backward(i)
-        return w
+        grad_shape = (i.shape[0], w.shape[1])
+        return torch.empty(grad_shape, dtype=w.dtype, device=w.device, requires_grad=True)

     @staticmethod
     def backward(ctx, grad_output):
@@ -59,56 +60,34 @@ class MmSeparateInputGrad(torch.autograd.Function):
     @staticmethod
     def forward(ctx, i, w):
         ctx.save_for_backward(w)
-        return i
+        grad_shape = (i.shape[0], w.shape[1])
+        return torch.empty(grad_shape, dtype=w.dtype, device=w.device, requires_grad=True)

     @staticmethod
     def backward(ctx, grad_output):
         (w,) = ctx.saved_tensors
-        """
-        A[m,k] @ B[k,n] -> O[m,n]
-        grad_o[m,n] @ B.t()[n,k] -> grad_a[m,k]
-        looks right..
-        getting
-        [rank4]:[rank4]:   File "/data/users/whc/torchtitan/torchtitan/distributed/pipeline_parallel.py", line 67, in backward
-        [rank4]:[rank4]:     grad_input = grad_output.mm(w.t())
-        [rank4]:[rank4]:   File "/data/users/whc/torchtitan/torchtitan/distributed/pipeline_parallel.py", line 88, in split_mm
-        [rank4]:[rank4]:     return MmPassThrough.apply(i1, w1)
-        [rank4]:[rank4]:   File "/data/users/whc/pytorch/torch/autograd/function.py", line 583, in apply
-        [rank4]:[rank4]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
-        [rank4]:[rank4]:   File "/data/users/whc/torchtitan/torchtitan/distributed/pipeline_parallel.py", line 74, in forward
-        [rank4]:[rank4]:     return torch.mm(x, y)
-        [rank4]:[rank4]: RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x2816 and 2048x2816)
-
-        [rank4]:[rank4]: RuntimeError:
-        [rank4]:[rank4]: Failed to run stage backward:
-        [rank4]:[rank4]: Stage output: ('Tensor(torch.Size([1, 4096, 2048]), grad=True, dtype=torch.bfloat16)',)
-        [rank4]:[rank4]: Output gradient: ('Tensor(torch.Size([1, 4096, 2048]), grad=False, dtype=torch.bfloat16)',)
-        [rank4]:[rank4]: Input: ['Tensor(torch.Size([1, 4096, 2048]), grad=True, dtype=torch.bfloat16)']
-        [rank4]:[rank4]:
-        """
-        logger.error(f"MmSeparateInputGrad backward: {grad_output.shape=}, {w.t().shape=}")
         grad_input = grad_output.mm(w.t())
         return grad_input, None

 class MmPassThrough(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, x, y):
+    def forward(ctx, x, y, fake_1, fake_2):
         with torch._C._AutoDispatchBelowAutograd():
             return torch.mm(x, y)

     @staticmethod
     def backward(ctx, gO):
-        # TODO(whc) - claude first wrote it this way and later tried to return None, None, i'm not sure which is correct
-        return gO, gO
+        return None, None, gO, gO

10482 def split_mm (i , w ):
10583 # Apply the pass-through node. y is passed to this node so that it can be
10684 # saved for backward, but detach because we don't want to actually build
10785 # this edge of the graph
108- logger .error (f"split_mm forward: { i .shape = } , { w .shape = } " )
109- w1 = MmSeparateWeightGrad .apply (i .detach (), w )
110- i1 = MmSeparateInputGrad .apply (i , w .detach ())
111- return MmPassThrough .apply (i1 , w1 )
86+ # logger.error(f"split_mm forward: {i.shape=}, {w.shape=}")
87+ fake_1 = MmSeparateWeightGrad .apply (i .detach (), w )
88+ fake_2 = MmSeparateInputGrad .apply (i , w .detach ())
89+
90+ return MmPassThrough .apply (i .detach (), w .detach (), fake_1 , fake_2 )
11291
11392 # addmm operator: out = beta * input + alpha * (mat1 @ mat2)
11493 class AddmmSeparateMat2Grad (torch .autograd .Function ):
@@ -237,7 +216,12 @@ class GroupedMmSeparateMat2Grad(torch.autograd.Function):
     def forward(ctx, input, mat2, offs, bias, out_dtype):
         ctx.save_for_backward(input)
         ctx.offs = offs
-        return mat2
+        with FakeTensorMode(allow_non_fake_inputs=True), torch._C._AutoDispatchBelowAutograd(), torch.no_grad():
+            fake_output = torch.ops.aten._grouped_mm.default(
+                input, mat2, offs=offs, bias=bias, out_dtype=out_dtype
+            )
+
+        return torch.empty((1,), dtype=input.dtype, device=input.device, requires_grad=True).expand_as(fake_output)

     @staticmethod
     def backward(ctx, grad_output):
@@ -253,7 +237,15 @@ class GroupedMmSeparateInputGrad(torch.autograd.Function):
     def forward(ctx, input, mat2, offs, bias, out_dtype):
         ctx.save_for_backward(mat2)
         ctx.offs = offs
-        return input
+        with FakeTensorMode(allow_non_fake_inputs=True), torch._C._AutoDispatchBelowAutograd(), torch.no_grad():
+            fake_output = torch.ops.aten._grouped_mm.default(
+                input, mat2, offs=offs, bias=bias, out_dtype=out_dtype
+            )
+
+        # grad_shape = fake_output.shape
+        # print(f"Shape: {fake_output.shape=}")
+        return torch.empty((1,), dtype=input.dtype, device=input.device, requires_grad=True).expand_as(fake_output)
+        # return torch.empty(grad_shape, dtype=input.dtype, device=input.device, requires_grad=True)

     @staticmethod
     def backward(ctx, grad_output):
@@ -266,24 +258,24 @@ def backward(ctx, grad_output):

 class GroupedMmPassThrough(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, input, mat2, offs, bias, out_dtype):
+    def forward(ctx, input, mat2, offs, bias, out_dtype, fake_1, fake_2):
         with torch._C._AutoDispatchBelowAutograd():
             return torch.ops.aten._grouped_mm.default(
                 input, mat2, offs=offs, bias=bias, out_dtype=out_dtype
             )

     @staticmethod
     def backward(ctx, gO):
-        return gO, gO, None, None, None
+        return None, None, None, None, None, gO, gO

 def split_grouped_mm(input, mat2, offs=None, bias=None, out_dtype=None):
-    mat2_1 = GroupedMmSeparateMat2Grad.apply(
+    fake_1 = GroupedMmSeparateMat2Grad.apply(
         input.detach(), mat2, offs, bias, out_dtype
     )
-    input_1 = GroupedMmSeparateInputGrad.apply(
+    fake_2 = GroupedMmSeparateInputGrad.apply(
         input, mat2.detach(), offs, bias, out_dtype
     )
-    return GroupedMmPassThrough.apply(input_1, mat2_1, offs, bias, out_dtype)
+    return GroupedMmPassThrough.apply(input.detach(), mat2.detach(), offs, bias, out_dtype, fake_1, fake_2)

 lib.impl("mm", split_mm, "Autograd")
 lib.impl("addmm", split_addmm, "Autograd")
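
For readers unfamiliar with the pattern: below is a minimal, self-contained sketch of the split-backward trick the mm hunks above implement. It is not code from this PR; names such as SplitWeightGrad, SplitInputGrad, PassThrough, and split_mm_demo are illustrative placeholders. The standalone version omits the _AutoDispatchBelowAutograd guard because it is not registered as the Autograd kernel for aten::mm, so calling torch.mm directly does not recurse. The dW and dX computations land on separate autograd nodes, which is what lets a pipeline schedule run them at different times.

import torch


class SplitWeightGrad(torch.autograd.Function):
    """Computes only dW in backward; its forward output is a placeholder."""

    @staticmethod
    def forward(ctx, i, w):
        ctx.save_for_backward(i)
        # Placeholder with the mm output shape, so the output grad can flow in here.
        return torch.empty((i.shape[0], w.shape[1]), dtype=w.dtype, device=w.device)

    @staticmethod
    def backward(ctx, grad_output):
        (i,) = ctx.saved_tensors
        return None, i.t().mm(grad_output)  # dW = X^T @ dY


class SplitInputGrad(torch.autograd.Function):
    """Computes only dX in backward; its forward output is a placeholder."""

    @staticmethod
    def forward(ctx, i, w):
        ctx.save_for_backward(w)
        return torch.empty((i.shape[0], w.shape[1]), dtype=i.dtype, device=i.device)

    @staticmethod
    def backward(ctx, grad_output):
        (w,) = ctx.saved_tensors
        return grad_output.mm(w.t()), None  # dX = dY @ W^T


class PassThrough(torch.autograd.Function):
    """Runs the real mm; routes the incoming output grad to both placeholders."""

    @staticmethod
    def forward(ctx, x, y, fake_dw, fake_dx):
        return torch.mm(x, y)

    @staticmethod
    def backward(ctx, gO):
        return None, None, gO, gO


def split_mm_demo(i, w):
    fake_dw = SplitWeightGrad.apply(i.detach(), w)  # graph edge that only reaches w
    fake_dx = SplitInputGrad.apply(i, w.detach())   # graph edge that only reaches i
    return PassThrough.apply(i.detach(), w.detach(), fake_dw, fake_dx)


x = torch.randn(4, 8, requires_grad=True)
w = torch.randn(8, 3, requires_grad=True)

# Reference gradients from ordinary autograd.
torch.mm(x, w).sum().backward()
ref_dx, ref_dw = x.grad.clone(), w.grad.clone()
x.grad, w.grad = None, None

# Gradients through the split graph should match the reference.
split_mm_demo(x, w).sum().backward()
assert torch.allclose(x.grad, ref_dx)
assert torch.allclose(w.grad, ref_dw)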