
Commit 2208a35

Adding ScatterMoE.
1 parent 8ac2b91 commit 2208a35

5 files changed: +20 −0 lines changed

src/transformers/dependency_versions_table.py (1 addition, 0 deletions)

@@ -46,6 +46,7 @@
     "opencv-python": "opencv-python",
     "optimum-benchmark": "optimum-benchmark>=0.3.0",
     "optuna": "optuna",
+    "optax": "optax>=0.08,<=0.1.4",
     "pandas": "pandas<2.3.0",
     "packaging": "packaging>=20.0",
     "parameterized": "parameterized>=0.9",

src/transformers/integrations/hub_kernels.py (10 additions, 0 deletions)

@@ -84,6 +84,16 @@
             )
         },
     },
+    "ScatterMoEGatedMLP": {
+        "cuda": {
+            Mode.TRAINING: LayerRepository(
+                repo_id="kernels-community/scattermoe", layer_name="ScatterMoEGatedMLP"
+            ),
+            Mode.INFERENCE: LayerRepository(
+                repo_id="kernels-community/scattermoe", layer_name="ScatterMoEGatedMLP"
+            ),
+        },
+    },
     "FastGELU": {
         "cuda": {
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
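
For readers unfamiliar with the kernels integration, here is a minimal sketch of how an entry like the one above is consumed: a layer class is tagged with use_kernel_forward_from_hub under the same name, and kernelize() later swaps its forward for the matching Hub kernel on the current device and mode, falling back to the eager forward otherwise. The ToyGatedMLP module and its sizes are illustrative assumptions, not code from this commit, and the real ScatterMoEGatedMLP kernel expects the attributes of the Granite MoE layers.

# Sketch only: illustrates the decorator/kernelize wiring, not the real MoE layer.
import torch
from torch import nn
from kernels import Mode, kernelize, use_kernel_forward_from_hub


@use_kernel_forward_from_hub("ScatterMoEGatedMLP")  # same name as the mapping key
class ToyGatedMLP(nn.Module):
    def __init__(self, hidden_size: int = 64):
        super().__init__()
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Eager fallback, used whenever no kernel matches the device/mode.
        return self.proj(hidden_states)


model = nn.Sequential(ToyGatedMLP())
# kernelize() walks the module tree and, for tagged layers, binds the Hub
# kernel's forward when a compatible entry exists for the model's device.
model = kernelize(model, mode=Mode.INFERENCE)
print(model(torch.randn(2, 64)).shape)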

src/transformers/models/granitemoe/modeling_granitemoe.py (3 additions, 0 deletions)

@@ -22,6 +22,7 @@
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast

@@ -129,6 +130,7 @@ def load_balancing_loss_func(


 # Copied from transformers.models.granite.modeling_granite.GraniteRMSNorm with Granite->GraniteMoe
+@use_kernel_forward_from_hub("RMSNorm")
 class GraniteMoeRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """

@@ -317,6 +319,7 @@ def forward(self, hidden_states):
         return index_sorted_experts, batch_index, batch_gates, expert_size, logits


+@use_kernel_forward_from_hub("ScatterMoEGatedMLP")
 class GraniteMoeMoE(nn.Module):
     """
     A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
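
Taken together with the mapping in hub_kernels.py, the decorators above let a GraniteMoE checkpoint pick up the Hub kernels at runtime. A hedged usage sketch follows; the checkpoint id and generation settings are assumptions, and on devices with no matching kernel kernelize() keeps the eager forwards.

# Usage sketch (assumes a CUDA machine, the `kernels` package, and that the
# checkpoint id below is a valid GraniteMoE model on the Hub).
import torch
from kernels import Mode, kernelize
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "ibm-granite/granite-3.0-1b-a400m-instruct"  # illustrative GraniteMoE checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")

# Swap the decorated layers (RMSNorm, ScatterMoEGatedMLP) for their Hub kernels.
model = kernelize(model, mode=Mode.INFERENCE)

inputs = tokenizer("ScatterMoE speeds up sparse MoE layers by", return_tensors="pt").to("cuda")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))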

src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py (3 additions, 0 deletions)

@@ -29,6 +29,7 @@

 from ...cache_utils import Cache
 from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast

@@ -934,6 +935,7 @@ class GraniteFlashAttentionKwargs(TypedDict, total=False):
     seq_idx: torch.IntTensor


+@use_kernel_forward_from_hub("RMSNorm")
 class GraniteMoeHybridRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """

@@ -1047,6 +1049,7 @@ def forward(self, hidden_states):
         return index_sorted_experts, batch_index, batch_gates, expert_size, logits


+@use_kernel_forward_from_hub("ScatterMoEGatedMLP")
 class GraniteMoeHybridMoE(nn.Module):
     """
     A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

src/transformers/models/granitemoeshared/modeling_granitemoeshared.py (3 additions, 0 deletions)

@@ -28,6 +28,7 @@
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast

@@ -99,6 +100,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states


+@use_kernel_forward_from_hub("RMSNorm")
 class GraniteMoeSharedRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """

@@ -212,6 +214,7 @@ def forward(self, hidden_states):
         return index_sorted_experts, batch_index, batch_gates, expert_size, logits


+@use_kernel_forward_from_hub("ScatterMoEGatedMLP")
 class GraniteMoeSharedMoE(nn.Module):
     """
     A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
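
Finally, the same decorator names can be re-pointed from user code, which is useful when experimenting with a fork of the kernel. A hedged sketch follows, assuming the kernels package's register_kernel_mapping helper accepts the same Mode-keyed mapping shape used above; the repository id is a placeholder, not an existing Hub repo.

# Sketch: extending/overriding the kernel mapping from user code.
from kernels import LayerRepository, Mode, register_kernel_mapping

register_kernel_mapping(
    {
        "ScatterMoEGatedMLP": {
            "cuda": {
                Mode.TRAINING: LayerRepository(
                    repo_id="my-org/my-scattermoe", layer_name="ScatterMoEGatedMLP"
                ),
                Mode.INFERENCE: LayerRepository(
                    repo_id="my-org/my-scattermoe", layer_name="ScatterMoEGatedMLP"
                ),
            },
        },
    }
)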
