From cdaa40d814b71e0e03133f7e4a3b8274834c1ba6 Mon Sep 17 00:00:00 2001 From: Shawn Tan Date: Fri, 10 Oct 2025 18:35:52 +0000 Subject: [PATCH 1/2] ScatterMoE --- src/transformers/integrations/hub_kernels.py | 6 ++++++ src/transformers/models/granitemoe/modeling_granitemoe.py | 1 + src/transformers/models/granitemoe/modular_granitemoe.py | 2 ++ .../models/granitemoehybrid/modeling_granitemoehybrid.py | 1 + .../models/granitemoeshared/modeling_granitemoeshared.py | 1 + 5 files changed, 11 insertions(+) diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 60c8797176e8..1525e538976e 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -115,6 +115,12 @@ def use_kernel_forward_from_hub(layer_name: str): ) }, }, + "ScatterMoEGatedMLP": { + "cuda": { + Mode.TRAINING: LayerRepository(repo_id="shawntan/scattermoe", layer_name="ScatterMoEGatedMLP"), + Mode.INFERENCE: LayerRepository(repo_id="shawntan/scattermoe", layer_name="ScatterMoEGatedMLP"), + }, + }, "FastGELU": { "cuda": { Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository( diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 0eefadc9a1b9..043afb42afd8 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -221,6 +221,7 @@ def forward(self, hidden_states): return index_sorted_experts, batch_index, batch_gates, expert_size, logits +@use_kernel_forward_from_hub("ScatterMoEGatedMLP") class GraniteMoeMoE(nn.Module): """ A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts. diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 3c5b73ebf899..71e85edadbe0 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -20,6 +20,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache +from ...integrations import use_kernel_forward_from_hub from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel @@ -49,6 +50,7 @@ class GraniteMoeTopKGating(JetMoeTopKGating): pass +@use_kernel_forward_from_hub("ScatterMoEGatedMLP") class GraniteMoeMoE(nn.Module): """ A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts. diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 947d250cd134..770a917789af 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1066,6 +1066,7 @@ def forward(self, hidden_states): return index_sorted_experts, batch_index, batch_gates, expert_size, logits +@use_kernel_forward_from_hub("ScatterMoEGatedMLP") class GraniteMoeHybridMoE(nn.Module): """ A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts. diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 8b1569722006..ad8558f24d8c 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -207,6 +207,7 @@ def forward(self, hidden_states): return index_sorted_experts, batch_index, batch_gates, expert_size, logits +@use_kernel_forward_from_hub("ScatterMoEGatedMLP") class GraniteMoeSharedMoE(nn.Module): """ A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts. From d4f8a6ca82d5054ca984a323addeaa3b640378d6 Mon Sep 17 00:00:00 2001 From: Shawn Tan Date: Thu, 16 Oct 2025 18:12:18 +0000 Subject: [PATCH 2/2] Change kernel repo_id --- src/transformers/integrations/hub_kernels.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 1525e538976e..02b967305663 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -117,8 +117,12 @@ def use_kernel_forward_from_hub(layer_name: str): }, "ScatterMoEGatedMLP": { "cuda": { - Mode.TRAINING: LayerRepository(repo_id="shawntan/scattermoe", layer_name="ScatterMoEGatedMLP"), - Mode.INFERENCE: LayerRepository(repo_id="shawntan/scattermoe", layer_name="ScatterMoEGatedMLP"), + Mode.TRAINING: LayerRepository( + repo_id="kernels-community/scattermoe", layer_name="ScatterMoEGatedMLP" + ), + Mode.INFERENCE: LayerRepository( + repo_id="kernels-community/scattermoe", layer_name="ScatterMoEGatedMLP" + ), }, }, "FastGELU": {