Commit a26de67: "testing infra"
1 parent 9322ff6

6 files changed: +388 -142 lines

torchtitan/models/moe/moe.py

Lines changed: 13 additions & 95 deletions
@@ -413,14 +413,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         bs, slen, dim = x.shape
         x = x.view(-1, dim)
 
-        # top_scores shape (bs*slen, top_k)
-        # selected_experts_indices shape (bs*slen, top_k)
+        # top_scores and selected_experts_indices shape (bs*slen, top_k)
         # num_tokens_per_expert shape (num_experts,)
         (
             top_scores,
             selected_experts_indices,
             num_tokens_per_expert,
         ) = self.router(x, self.expert_bias)
+        top_k = selected_experts_indices.shape[-1]
 
         # tokens_per_expert will be used to update the expert bias for load balancing.
         # and also to count the expert usage
@@ -461,22 +461,25 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # to "implicitly" overlap the shared expert compute with token combine communication
         out = self.shared_experts(x) if self.shared_experts is not None else None
 
+        # Unsort routed outputs
+        routed_output_unsorted = torch.zeros(
+            (bs * slen * top_k, dim),
+            dtype=routed_output.dtype,
+            device=routed_output.device,
+        )
+        routed_output_unsorted[token_indices_experts_sorted] = routed_output
+        routed_output_unsorted = routed_output_unsorted.reshape(-1, top_k, dim)
         if not self.score_before_experts:
-            # Unsort scores and routed outputs. Also save some allocations: store unsorted scores
-            # and outputs in top_scores and routed_input, respectively.
-            routed_input[token_indices_experts_sorted] = routed_output
-            routed_input = routed_input.reshape(-1, self.router.top_k, dim)
             out_experts = (
                 torch.bmm(
-                    top_scores.reshape(-1, 1, self.router.top_k), routed_input.float()
+                    top_scores.reshape(-1, 1, self.router.top_k),
+                    routed_output_unsorted.float(),
                 )
                 .to(x.dtype)
                 .squeeze(1)
             )
         else:
-            # Unsort routed outputs and save an allocation: store unsorted outputs in routed_input
-            routed_input[token_indices_experts_sorted] = routed_output
-            out_experts = routed_input.reshape(-1, self.router.top_k, dim).sum(dim=1)
+            out_experts = routed_output_unsorted.sum(dim=1)
 
         if out is None:
             return out_experts.reshape(bs, slen, dim)
@@ -500,88 +503,3 @@ def init_weights(
                 self.expert_bias = torch.zeros(
                     self.experts.num_experts, dtype=torch.float32
                 )
-
-
-# For testing
-class MoEOld(MoE):
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.
-
-        Returns:
-            out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
-        """
-        bs, slen, dim = x.shape
-        x = x.view(-1, dim)
-
-        # top_scores and selected_experts_indices shape (bs*slen*top_k,)
-        # num_tokens_per_expert shape (num_experts,)
-        (
-            top_scores,
-            selected_experts_indices,
-            num_tokens_per_expert,
-        ) = self.router(x, self.expert_bias)
-
-        # tokens_per_expert will be used to update the expert bias for load balancing.
-        # and also to count the expert usage
-        # TODO: Activation Checkpointing has the side effect of double counting tokens_per_expert --
-        # first in the forward pass, and then in the backward pass. However, this has no
-        # effect on the expert bias update thanks to the torch.sign() operator.
-        with torch.no_grad():
-            self.tokens_per_expert.add_(num_tokens_per_expert)
-
-        # top_scores and token_indices_experts_sorted shape (bs*slen*top_k,)
-        # num_tokens_per_expert shape (num_experts,)
-        # NOTE: the reason we need to compute num_tokens_per_expert again is:
-        #       1st computation in router is to update self.tokens_per_expert
-        #       which would be the same across all TP ranks.
-        #       2nd computation in reorderer is for the actual routing and experts computation
-        #       which would be sharded over TP ranks if expert_tensor_parallel_degree==1.
-        #       If tensor_paralllel_degree == expert_tensor_parallel_degree, they agree.
-        (
-            top_scores_experts_sorted,
-            token_indices_experts_sorted,
-            num_tokens_per_expert,
-        ) = self.reorderer(top_scores, selected_experts_indices)
-        # NOTE: @goon - adjust for redefined reorderer output and divide by top_k
-        token_indices_experts_sorted = (
-            token_indices_experts_sorted // self.reorderer.top_k
-        )
-
-        # shape (bs*slen*top_k, dim)
-        token_indices_experts_sorted = token_indices_experts_sorted.reshape(
-            -1, 1
-        ).expand(-1, dim)
-
-        # shape (bs*slen*top_k, dim)
-        routed_input = torch.gather(x, dim=0, index=token_indices_experts_sorted)
-
-        if self.score_before_experts:
-            routed_input = (
-                routed_input.to(torch.float32)
-                * top_scores_experts_sorted.reshape(-1, 1)
-            ).to(x.dtype)
-
-        # shape (bs*slen*top_k, dim)
-        routed_output = self.experts(routed_input, num_tokens_per_expert)
-
-        # shared expert
-        # Note: we execute the shared expert before scoring the output of the routed expert
-        # to "implicitly" overlap the shared expert compute with token combine communication
-        if self.shared_experts is not None:
-            out = self.shared_experts(x)
-        else:
-            out = torch.zeros_like(x)
-
-        if not self.score_before_experts:
-            routed_output = (
-                routed_output.to(torch.float32)
-                * top_scores_experts_sorted.reshape(-1, 1)
-            ).to(x.dtype)
-
-        out = out.scatter_add(
-            dim=0, index=token_indices_experts_sorted, src=routed_output
-        )
-        out = out.reshape(bs, slen, dim)
-        return out
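The new combine path above scatters the expert-sorted outputs into a freshly allocated buffer and then applies the routing scores with a batched matmul. The following standalone sketch (toy shapes and random stand-ins for the expert outputs, router scores, and sort permutation; not part of the commit) illustrates the pattern:

```python
import torch

# Toy sizes; in the real model these come from the batch and MoE config.
bs, slen, dim, top_k = 2, 3, 4, 2
num_slots = bs * slen * top_k

# Stand-ins for what the experts and reorderer produce: outputs sorted by expert,
# plus the permutation mapping each sorted position back to its original slot.
routed_output = torch.randn(num_slots, dim)
token_indices_experts_sorted = torch.randperm(num_slots)
top_scores = torch.rand(bs * slen, top_k)

# Unsort: entry i of the expert-sorted output is written back to slot
# token_indices_experts_sorted[i] of a zero-initialized buffer.
routed_output_unsorted = torch.zeros(num_slots, dim)
routed_output_unsorted[token_indices_experts_sorted] = routed_output
routed_output_unsorted = routed_output_unsorted.reshape(-1, top_k, dim)

# Combine the top_k expert outputs per token, weighted by the router scores:
# (N, 1, top_k) @ (N, top_k, dim) -> (N, 1, dim).
out_experts = torch.bmm(
    top_scores.reshape(-1, 1, top_k), routed_output_unsorted.float()
).squeeze(1)
assert out_experts.shape == (bs * slen, dim)
```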
Lines changed: 170 additions & 0 deletions (new file)
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import types

import torch
import torch.nn as nn

from torchtitan.models.moe.moe import MoE, TokenReorderer


# NOTE: @goon - torch.testing.assert_close is very strict and hard to pass. Use the more-lenient
# assert_close from FLA, slightly modified to remove their CI related code.
# https://github.com/fla-org/flash-linear-attention/blob/3ddba2a043100837a1f6499b5eb6692de71a477b/fla/utils.py?plain=1#L82
def get_abs_err(x, y):
    return (x.detach() - y.detach()).flatten().abs().max().item()


def get_err_ratio(x, y):
    err = (x.detach() - y.detach()).flatten().square().mean().sqrt().item()
    base = (x.detach()).flatten().square().mean().sqrt().item()
    return err / (base + 1e-8)


def assert_close(prefix, ref, tri, ratio, err_atol=1e-6):
    abs_atol = get_abs_err(ref, tri)
    msg = f"{prefix:>16} diff: {abs_atol:.6f} ratio: {get_err_ratio(ref, tri):.6f}"
    error_rate = get_err_ratio(ref, tri)
    if abs_atol <= err_atol:
        return
    assert error_rate < ratio, msg


# For testing: copy over old router and MoE impls and use these to monkey patch models in tests.
# Code copied from
# https://github.com/pytorch/torchtitan/blob/3819737fab042fdfd5443b1d99753b951b59696d/torchtitan/models/moe/moe.py?plain=1#L298
class MoEOld(MoE):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.

        Returns:
            out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
        """
        bs, slen, dim = x.shape
        x = x.view(-1, dim)

        # top_scores and selected_experts_indices shape (bs*slen*top_k,)
        # num_tokens_per_expert shape (num_experts,)
        (
            top_scores,
            selected_experts_indices,
            num_tokens_per_expert,
        ) = self.router(x, self.expert_bias)

        # tokens_per_expert will be used to update the expert bias for load balancing.
        # and also to count the expert usage
        # TODO: Activation Checkpointing has the side effect of double counting tokens_per_expert --
        # first in the forward pass, and then in the backward pass. However, this has no
        # effect on the expert bias update thanks to the torch.sign() operator.
        with torch.no_grad():
            self.tokens_per_expert.add_(num_tokens_per_expert)

        # top_scores and token_indices_experts_sorted shape (bs*slen*top_k,)
        # num_tokens_per_expert shape (num_experts,)
        # NOTE: the reason we need to compute num_tokens_per_expert again is:
        #       1st computation in router is to update self.tokens_per_expert
        #       which would be the same across all TP ranks.
        #       2nd computation in reorderer is for the actual routing and experts computation
        #       which would be sharded over TP ranks if expert_tensor_parallel_degree==1.
        #       If tensor_paralllel_degree == expert_tensor_parallel_degree, they agree.
        (
            top_scores_experts_sorted,
            token_indices_experts_sorted,
            num_tokens_per_expert,
        ) = self.reorderer(top_scores, selected_experts_indices)

        # shape (bs*slen*top_k, dim)
        token_indices_experts_sorted = token_indices_experts_sorted.reshape(
            -1, 1
        ).expand(-1, dim)

        # shape (bs*slen*top_k, dim)
        routed_input = torch.gather(x, dim=0, index=token_indices_experts_sorted)

        if self.score_before_experts:
            routed_input = (
                routed_input.to(torch.float32)
                * top_scores_experts_sorted.reshape(-1, 1)
            ).to(x.dtype)

        # shape (bs*slen*top_k, dim)
        routed_output = self.experts(routed_input, num_tokens_per_expert)

        # shared expert
        # Note: we execute the shared expert before scoring the output of the routed expert
        # to "implicitly" overlap the shared expert compute with token combine communication
        if self.shared_experts is not None:
            out = self.shared_experts(x)
        else:
            out = torch.zeros_like(x)

        if not self.score_before_experts:
            routed_output = (
                routed_output.to(torch.float32)
                * top_scores_experts_sorted.reshape(-1, 1)
            ).to(x.dtype)

        out = out.scatter_add(
            dim=0, index=token_indices_experts_sorted, src=routed_output
        )
        out = out.reshape(bs, slen, dim)
        return out


class TokenReordererOld(TokenReorderer):
    def forward(
        self,
        top_scores: torch.Tensor,
        selected_experts_indices: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Reorders token indices to match the order of experts for MoE routing.

        Args:
            top_scores (torch.Tensor): Routing scores for selected experts,
                shape (batch_size * seq_len, top_k)
            selected_experts_indices (torch.Tensor): Expert indices selected for each token,
                shape (batch_size * seq_len, top_k)

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
                - top_scores_experts_sorted: Scores reordered to match expert ordering
                - token_indices_experts_sorted: Token indices reordered to match expert ordering
                - num_tokens_per_expert: Number of tokens assigned to each expert
        """
        # group tokens together by expert indices from 0 to num_experts and pass that to experts forward
        num_tokens_per_expert = torch.histc(
            selected_experts_indices.view(-1),
            bins=self.num_experts,
            min=0,
            max=self.num_experts,
        )

        # Reorder the token indices to match the order of the experts
        # token_indices_experts_sorted shape (bs*slen*top_k,)
        token_indices_experts_sorted = torch.argsort(
            selected_experts_indices.view(-1), stable=True
        )

        top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted]
        token_indices_experts_sorted = token_indices_experts_sorted // self.top_k

        return (
            top_scores_experts_sorted,
            token_indices_experts_sorted,
            num_tokens_per_expert,
        )


def apply_old_moe_monkey_patches(module: nn.Module) -> None:
    for mod in module.modules():
        if isinstance(mod, MoE):
            mod.forward = types.MethodType(MoEOld.forward, mod)
        if isinstance(mod, TokenReorderer):
            mod.forward = types.MethodType(TokenReordererOld.forward, mod)
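These helpers are intended to be monkey patched into a model so the old and new MoE paths can be compared. A minimal parity-test sketch is shown below; the MoEArgs defaults, CPU execution, and importing assert_close from torchtitan.moe_bench_and_test are assumptions, so adjust to the actual package layout:

```python
import copy

import torch

from torchtitan.models.moe.moe import MoE, MoEArgs
from torchtitan.moe_bench_and_test import apply_old_moe_monkey_patches, assert_close


def test_new_moe_matches_old(dim: int = 256, hidden_dim: int = 512) -> None:
    torch.manual_seed(0)
    # Assumed: the remaining MoEArgs fields have usable defaults.
    moe = MoE(MoEArgs(use_grouped_mm=False), dim=dim, hidden_dim=hidden_dim)
    moe.init_weights(1 / dim**0.5, torch.device("cpu"))

    # Reference copy that runs the old forward implementations via monkey patching.
    moe_old = copy.deepcopy(moe)
    apply_old_moe_monkey_patches(moe_old)

    x = torch.randn(2, 16, dim)
    assert_close("moe fwd", moe_old(x), moe(x), ratio=5e-3)
```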

moe_bench_and_test/moe_memory.py renamed to torchtitan/moe_bench_and_test/moe_memory.py

Lines changed: 9 additions & 9 deletions
@@ -9,7 +9,9 @@
 import torch
 
 from torchtitan.components.metrics import DeviceMemoryMonitor
-from torchtitan.models.moe.moe import MoE, MoEArgs, MoEOld
+from torchtitan.models.moe.moe import MoE, MoEArgs
+
+from torchtitan.moe_bench_and_test import apply_old_moe_monkey_patches
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -43,17 +45,13 @@
         use_grouped_mm=use_grouped_mm,
     )
 
-    if args.cls == "moe":
-        cls = MoE
-    elif args.cls == "moe_old":
-        cls = MoEOld
-    else:
-        raise ValueError
-
     torch.manual_seed(42)
-    moe = cls(moe_args, dim=dim, hidden_dim=moe_inter_dim).to(
+    moe = MoE(moe_args, dim=dim, hidden_dim=moe_inter_dim).to(
         device=device, dtype=torch.bfloat16
     )
+    if args.cls == "moe_old":
+        apply_old_moe_monkey_patches(moe)
+
     moe.init_weights(1 / dim**0.5, device)
     inputs = torch.randn(
         args.bsz,
@@ -66,6 +64,8 @@
     for _ in range(args.iters):
        moe(inputs).sum().backward()
        moe.zero_grad()
+
+    torch.cuda.synchronize()
     print(f"\n{args=}")
     peak_stats = mem_monitor.get_peak_stats()
     print(f"{peak_stats.max_active_gib=}")
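The measurement loop in the script reduces to the pattern sketched below. This version substitutes plain torch.cuda counters for torchtitan's DeviceMemoryMonitor, so the reported numbers may differ slightly; it is a sketch, not the script's implementation:

```python
import torch


def peak_active_gib(model: torch.nn.Module, inputs: torch.Tensor, iters: int = 10) -> float:
    """Run fwd+bwd `iters` times and return the peak allocated CUDA memory in GiB."""
    torch.cuda.reset_peak_memory_stats()
    for _ in range(iters):
        model(inputs).sum().backward()
        model.zero_grad()
    # Wait for all queued kernels to finish before reading the counters, as the script does.
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated() / 2**30
```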

moe_bench_and_test/moe_timing.py renamed to torchtitan/moe_bench_and_test/moe_timing.py

Lines changed: 5 additions & 9 deletions
@@ -9,7 +9,8 @@
 import torch
 from triton.testing import do_bench
 
-from torchtitan.models.moe.moe import MoE, MoEArgs, MoEOld
+from torchtitan.models.moe.moe import MoE, MoEArgs
+from torchtitan.moe_bench_and_test import apply_old_moe_monkey_patches
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -43,17 +44,12 @@
         use_grouped_mm=use_grouped_mm,
     )
 
-    if args.cls == "moe":
-        cls = MoE
-    elif args.cls == "moe_old":
-        cls = MoEOld
-    else:
-        raise ValueError
-
     torch.manual_seed(42)
-    moe = cls(moe_args, dim=dim, hidden_dim=moe_inter_dim).to(
+    moe = MoE(moe_args, dim=dim, hidden_dim=moe_inter_dim).to(
         device=device, dtype=torch.bfloat16
     )
+    if args.cls == "moe_old":
+        apply_old_moe_monkey_patches(moe)
     moe.init_weights(1 / dim**0.5, device)
     inputs = torch.randn(
         args.bsz,
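The timing script drives triton.testing.do_bench, which returns a latency in milliseconds. A compact sketch of comparing the patched and unpatched forward passes (module construction omitted; built as in the memory script above) could look like this:

```python
import copy

from triton.testing import do_bench

from torchtitan.moe_bench_and_test import apply_old_moe_monkey_patches


def bench_forward(moe, inputs) -> tuple[float, float]:
    """Return (new_ms, old_ms) forward latencies for the given MoE module."""
    moe_old = copy.deepcopy(moe)
    apply_old_moe_monkey_patches(moe_old)
    new_ms = do_bench(lambda: moe(inputs))
    old_ms = do_bench(lambda: moe_old(inputs))
    return new_ms, old_ms
```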
