
Commit 7938e91

Authored by ArthurZucker, LysandreJik, vasqu, and Cyrilvallez
MoE + vllm = 😻 (#40132)
* update modeling mixtral * oups[13;2u * fix * better naming? * compute softmax and top_k inside the experts * update minamax as well * models that will need an update * more models that need a fix * stash * fix mixtral * update olmoe * update * update * current changes * nits * molmoe is now fixed * olmoe is good to go! * refactor qwen2_moe * fixes * fixed moe * fix qwen2 modular * nit * qwen2_moie test script works * tricky rope ! * fix qwen3 * DeepSeek v3 MoE Standardization (#40538) * DeepSeek-v3 Shared Shared * Dependents of DS3 * Standardize GLM4V MoE (#40539) * up * Standardize VitPose's MoE (#40549) * VitPose * outside * outside * outside * fix * update dbrx * dbrx... the magix * Refactor Ernie 4.5's MoE (#40547) * Isolate Ernie fixes * fix moe --------- Co-authored-by: Vasqu <antonprogamer@gmail.com> * fix style * style * fix copies * style * latest changes * fixes * had to stage * current updaters * up * another modular * modular graniteMoe * some update * draft another modular moe * updaters * up * fix nit * q3 nit * fix phi moe * we're going up up up up its our mooooment * fix switch transformers this time around * up * gptsan japanese is deprecated forget about it * fix mixtral to not be a linear (gives us more freedom) * update * fix copies gone wrong try catch nothing * fix mixtral * new refactor again * update aria as well * up dbrx and deepseekv3 * nit * fix phimoe? * fix deepseek v3 * nits * don't bother with this one please * up olmoe * ?? * fix olmoe * yups * fiupx * ish * hot patch * new qwen3 * updates * up * nit * fix copies * fix * nits * we're going up up up * nits * switch_transformesr edge case * lol modular gptsan? * fix deepseek * finally all modeling match modular * update * up * up * dang * up * up aria * fix dbrx * nits here and there * finish fixing dbrx * fix deepseek * upd * up * fix flex olmo * updated * update jamba * JAMBA is stil a bit todo * forward forward * fix dots11 * update * fix hunyuan * fix some other * update phimoe * fuck you phimoe you are now submitted * submit granitemoe as well * try to fix some other models, reduces some of the failures * fix olmoe and qwem2moe * up * up * fix qwen2_moe * update modular make it again, simpler * nits * up * up * fix * someswitch reductions * up * fix qwen3vl * some fixes to jetmo * these should be shipped to the modular to fix jetmoe * fix most of the nllb failures * more nllb fixes * fix the modular * remove nllb modular as it sucks for now * ? * fix granitemoe * granitemoehybrid don't have rope * use rope when rope, no rope when no rope * updates * finish fixing dumbgrainite * fix most of minimax * fix * update modular * ? * up * up jetmoe still broken * up * fix, now align the moe * fix jetmoe * fix styling and qwen3 repo consitency * updatge * up up * update ruff? * nits * modeling is goot now for switch * fix * more fixses to switch! * fix some siwtch test * ? * ? * up * fix switch modular! * nit? * uip * subtest * can't believe I wasted so much time on this... * fix * updates * nits * nit jamba is fucking annoying * ? * fix? * oups * good good * styling * up * make sure qwen2 sliding works! * fix dbrx small * lol * nits * fix one test * fix load balancing loss issue * fix jamba * fix nllbmoe * fix jamba consistency and doc? 
* up * thse are correct * up * up * up * some of the final cleanup * update * up * fix some revert in granimoe * bring back attention multipliers for the granite family we'll see later on if they need removal * small jamba fix docstring and typing * fix phimoe * yup * fix unk returndict in granitemoes * up * fix qwen config * fix phiemoe check quality * nits * update based on caught non relative imports! * fix dbrx * Apply suggestions from code review Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co> * fix copies * fiuxp * fix dot1 regression! * fix phimoe issue * fix phi moe * fix float() for some models * fix jamba regression * ui * more dtype issues * fix deepseek2 and 3? * proper update * fix modular deepseek! * jamba jambaaaaaa --------- Co-authored-by: Lysandre Debut <hi@lysand.re> Co-authored-by: Vasqu <antonprogamer@gmail.com> Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
1 parent e6a8e7d commit 7938e91

File tree

86 files changed: +8433 additions, −9185 deletions


docs/source/en/model_doc/switch_transformers.md

Lines changed: 0 additions & 1 deletion
@@ -105,7 +105,6 @@ print(tokenizer.decode(outputs[0]))
 ## SwitchTransformersTop1Router
 
 [[autodoc]] SwitchTransformersTop1Router
-    - _compute_router_probabilities
     - forward
 
 ## SwitchTransformersSparseMLP

src/transformers/modeling_outputs.py

Lines changed: 2 additions & 0 deletions
@@ -357,6 +357,7 @@ class MoEModelOutput(ModelOutput):
     hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
     attentions: Optional[tuple[torch.FloatTensor, ...]] = None
     router_probs: Optional[tuple[torch.FloatTensor]] = None
+    router_logits: Optional[tuple[torch.FloatTensor]] = None
 
 
 @dataclass
@@ -494,6 +495,7 @@ class MoEModelOutputWithPastAndCrossAttentions(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor, ...]] = None
     cross_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
     router_probs: Optional[tuple[torch.FloatTensor]] = None
+    router_logits: Optional[tuple[torch.FloatTensor]] = None
 
 
 @dataclass
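For readers skimming the diff: the new `router_logits` field simply sits next to the existing `router_probs` tuple, so callers can pick up the raw router scores per layer. A minimal, hypothetical sketch (shapes and values are invented for illustration, not taken from this commit):

import torch
from transformers.modeling_outputs import MoEModelOutput

num_tokens, num_experts = 4, 8
logits = torch.randn(num_tokens, num_experts)         # raw router scores for one MoE layer

output = MoEModelOutput(
    last_hidden_state=torch.randn(1, num_tokens, 16),
    router_logits=(logits,),                          # new field: unnormalized logits, one entry per layer
    router_probs=(logits.softmax(dim=-1),),           # existing field, kept alongside it
)
print(output.router_logits[0].shape)                  # torch.Size([4, 8])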

src/transformers/models/aria/modeling_aria.py

Lines changed: 31 additions & 83 deletions
@@ -309,117 +309,65 @@ def forward(self, input, tokens_per_expert):
         )
 
 
-class AriaGroupedExpertsMLP(nn.Module):
-    """
-    Grouped MLP module for Mixture of Experts.
-
-    Args:
-        config (`AriaTextConfig`):
-            Configuration object for the model.
-    """
-
+class AriaExperts(nn.Module):
     def __init__(self, config: AriaTextConfig) -> None:
         super().__init__()
         self.config = config
         self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts)
         self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts)
 
-    def forward(self, permuted_tokens, tokens_per_expert):
-        """
-        Forward pass of the Grouped MLP.
-
-        Args:
-            permuted_tokens (torch.Tensor): Permuted input tokens.
-            tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.
-
-        Returns:
-            torch.Tensor: Output tensor after passing through the MLP.
-        """
-        fc1_output = self.fc1(permuted_tokens, tokens_per_expert)
-        projection, gate = torch.chunk(fc1_output, 2, dim=-1)
-        fc1_output = nn.functional.silu(projection) * gate
-        fc2_output = self.fc2(fc1_output, tokens_per_expert)
-        return fc2_output
-
-
-# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587
-class AriaTextMoELayer(nn.Module):
-    """
-    Aria Text Mixture of Experts (MoE) Layer.
-
-    This layer applies a gating mechanism to route input tokens to different experts.
-
-    Args:
-        config (`AriaTextConfig`):
-            Configuration object for the text component of the model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
-        super().__init__()
-
-        self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False)
-        self.experts = AriaGroupedExpertsMLP(config)
-        self.shared_experts = AriaSharedExpertsMLP(config)
-        self.config = config
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the MoE Layer.
-
-        Args:
-            hidden_states (`torch.Tensor`):
-                Input tensor of shape (batch_size, sequence_length, hidden_size).
-
-        Returns:
-            torch.Tensor: Output tensor after passing through the MoE layer.
-
-        Process:
-        1. Route tokens to experts using the router.
-        2. Permute tokens based on routing decisions.
-        3. Process tokens through experts.
-        4. Unpermute and combine expert outputs.
-        5. Add shared expert output to the final result.
-        """
-        original_shape = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
-
-        # Top K Routing
-        logits = self.router(hidden_states)
-        top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1)
+    def route_tokens_to_experts(self, router_logits):
+        top_logits, top_indices = torch.topk(router_logits, k=self.config.moe_topk, dim=1)
         scores = nn.functional.softmax(top_logits, dim=-1)
+        return top_indices, scores
 
-        original_dtype = top_indices.dtype
-
+    def forward(self, hidden_states, router_logits) -> torch.Tensor:
+        top_k_index, top_k_weights = self.route_tokens_to_experts(router_logits)
+        original_dtype = top_k_index.dtype
         tokens_per_expert = torch.histc(
-            top_indices.flatten().to(torch.float32),
+            top_k_index.flatten().to(torch.float32),
             bins=self.config.moe_num_experts,
             min=0,
            max=self.config.moe_num_experts - 1,
         ).to(original_dtype)
-        indices = top_indices
+        indices = top_k_index
 
-        # Token permutation
         flatten_indices = indices.view(-1)
         sorted_indices = torch.argsort(flatten_indices)
         permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk)
 
-        # Process through experts
-        expert_output = self.experts(permuted_tokens, tokens_per_expert)
+        fc1_output = self.fc1(permuted_tokens, tokens_per_expert)
+        projection, gate = torch.chunk(fc1_output, 2, dim=-1)
+        fc1_output = nn.functional.silu(projection) * gate
+        expert_output = self.fc2(fc1_output, tokens_per_expert)
 
-        # Token unpermutation
         unpermuted_tokens = torch.zeros(
-            (scores.shape[0] * self.config.moe_topk, expert_output.size(1)),
+            (top_k_weights.shape[0] * self.config.moe_topk, expert_output.size(1)),
             dtype=expert_output.dtype,
             device=expert_output.device,
         )
         unpermuted_tokens.index_copy_(0, sorted_indices, expert_output)
         unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1))
 
-        output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape)
+        output = (unpermuted_tokens * top_k_weights.unsqueeze(-1)).sum(dim=1)
+        return output
 
-        # Add shared expert output
+
+class AriaTextMoELayer(nn.Module):
+    def __init__(self, config: AriaTextConfig):
+        super().__init__()
+        self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False)
+        self.experts = AriaExperts(config)
+        self.shared_experts = AriaSharedExpertsMLP(config)
+        self.config = config
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        original_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
+        router_logits = self.router(hidden_states)
+        expert_output = self.experts(hidden_states, router_logits).view(original_shape)
         shared_expert_output = self.shared_experts(hidden_states.view(original_shape))
-        return output + shared_expert_output
+        return expert_output + shared_expert_output
 
 
 def rotate_half(x):
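The key change above: the MoE layer now only produces raw router logits, while top-k selection and normalization live inside the experts module (route_tokens_to_experts). A standalone sketch of that routing step, with invented tensor sizes:

import torch
import torch.nn.functional as F

def route_tokens_to_experts(router_logits: torch.Tensor, top_k: int):
    # Keep the k highest-scoring experts per token, then softmax only over those k logits,
    # mirroring AriaExperts.route_tokens_to_experts in the diff above.
    top_logits, top_indices = torch.topk(router_logits, k=top_k, dim=1)
    scores = F.softmax(top_logits, dim=-1)
    return top_indices, scores

router_logits = torch.randn(5, 8)       # 5 tokens, 8 experts (illustrative)
indices, weights = route_tokens_to_experts(router_logits, top_k=2)
print(indices.shape, weights.shape)     # torch.Size([5, 2]) torch.Size([5, 2])
print(weights.sum(dim=-1))              # each row sums to 1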

src/transformers/models/aria/modular_aria.py

Lines changed: 31 additions & 83 deletions
@@ -1120,117 +1120,65 @@ def forward(self, input, tokens_per_expert):
         )
 
 
-class AriaGroupedExpertsMLP(nn.Module):
-    """
-    Grouped MLP module for Mixture of Experts.
-
-    Args:
-        config (`AriaTextConfig`):
-            Configuration object for the model.
-    """
-
+class AriaExperts(nn.Module):
     def __init__(self, config: AriaTextConfig) -> None:
         super().__init__()
         self.config = config
         self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts)
         self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts)
 
-    def forward(self, permuted_tokens, tokens_per_expert):
-        """
-        Forward pass of the Grouped MLP.
-
-        Args:
-            permuted_tokens (torch.Tensor): Permuted input tokens.
-            tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.
-
-        Returns:
-            torch.Tensor: Output tensor after passing through the MLP.
-        """
-        fc1_output = self.fc1(permuted_tokens, tokens_per_expert)
-        projection, gate = torch.chunk(fc1_output, 2, dim=-1)
-        fc1_output = nn.functional.silu(projection) * gate
-        fc2_output = self.fc2(fc1_output, tokens_per_expert)
-        return fc2_output
-
-
-# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587
-class AriaTextMoELayer(nn.Module):
-    """
-    Aria Text Mixture of Experts (MoE) Layer.
-
-    This layer applies a gating mechanism to route input tokens to different experts.
-
-    Args:
-        config (`AriaTextConfig`):
-            Configuration object for the text component of the model.
-    """
-
-    def __init__(self, config: AriaTextConfig):
-        super().__init__()
-
-        self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False)
-        self.experts = AriaGroupedExpertsMLP(config)
-        self.shared_experts = AriaSharedExpertsMLP(config)
-        self.config = config
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the MoE Layer.
-
-        Args:
-            hidden_states (`torch.Tensor`):
-                Input tensor of shape (batch_size, sequence_length, hidden_size).
-
-        Returns:
-            torch.Tensor: Output tensor after passing through the MoE layer.
-
-        Process:
-        1. Route tokens to experts using the router.
-        2. Permute tokens based on routing decisions.
-        3. Process tokens through experts.
-        4. Unpermute and combine expert outputs.
-        5. Add shared expert output to the final result.
-        """
-        original_shape = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
-
-        # Top K Routing
-        logits = self.router(hidden_states)
-        top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1)
+    def route_tokens_to_experts(self, router_logits):
+        top_logits, top_indices = torch.topk(router_logits, k=self.config.moe_topk, dim=1)
         scores = nn.functional.softmax(top_logits, dim=-1)
+        return top_indices, scores
 
-        original_dtype = top_indices.dtype
-
+    def forward(self, hidden_states, router_logits) -> torch.Tensor:
+        top_k_index, top_k_weights = self.route_tokens_to_experts(router_logits)
+        original_dtype = top_k_index.dtype
         tokens_per_expert = torch.histc(
-            top_indices.flatten().to(torch.float32),
+            top_k_index.flatten().to(torch.float32),
            bins=self.config.moe_num_experts,
            min=0,
            max=self.config.moe_num_experts - 1,
         ).to(original_dtype)
-        indices = top_indices
+        indices = top_k_index
 
-        # Token permutation
         flatten_indices = indices.view(-1)
         sorted_indices = torch.argsort(flatten_indices)
         permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk)
 
-        # Process through experts
-        expert_output = self.experts(permuted_tokens, tokens_per_expert)
+        fc1_output = self.fc1(permuted_tokens, tokens_per_expert)
+        projection, gate = torch.chunk(fc1_output, 2, dim=-1)
+        fc1_output = nn.functional.silu(projection) * gate
+        expert_output = self.fc2(fc1_output, tokens_per_expert)
 
-        # Token unpermutation
         unpermuted_tokens = torch.zeros(
-            (scores.shape[0] * self.config.moe_topk, expert_output.size(1)),
+            (top_k_weights.shape[0] * self.config.moe_topk, expert_output.size(1)),
             dtype=expert_output.dtype,
             device=expert_output.device,
         )
         unpermuted_tokens.index_copy_(0, sorted_indices, expert_output)
         unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1))
 
-        output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape)
+        output = (unpermuted_tokens * top_k_weights.unsqueeze(-1)).sum(dim=1)
+        return output
 
-        # Add shared expert output
+
+class AriaTextMoELayer(nn.Module):
+    def __init__(self, config: AriaTextConfig):
+        super().__init__()
+        self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False)
+        self.experts = AriaExperts(config)
+        self.shared_experts = AriaSharedExpertsMLP(config)
+        self.config = config
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        original_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_states.size(-1))
+        router_logits = self.router(hidden_states)
+        expert_output = self.experts(hidden_states, router_logits).view(original_shape)
         shared_expert_output = self.shared_experts(hidden_states.view(original_shape))
-        return output + shared_expert_output
+        return expert_output + shared_expert_output
 
 
 class AriaTextAttention(LlamaAttention):
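modular_aria.py mirrors the modeling file, so the same dispatch logic applies. For completeness, here is a self-contained sketch of just the permute/unpermute bookkeeping that AriaExperts now owns, with the grouped-GEMM expert replaced by an identity and all shapes invented for illustration:

import torch

def dispatch_and_combine(hidden_states, top_k_index, top_k_weights, num_experts, top_k):
    # Count how many token copies each expert receives (fed to the grouped GEMM in the real model).
    tokens_per_expert = torch.histc(
        top_k_index.flatten().float(), bins=num_experts, min=0, max=num_experts - 1
    ).to(top_k_index.dtype)
    # Sort token copies by assigned expert so each expert sees a contiguous block.
    sorted_indices = torch.argsort(top_k_index.view(-1))
    permuted_tokens = hidden_states.index_select(0, sorted_indices // top_k)

    expert_output = permuted_tokens  # stand-in for the grouped expert MLP

    # Scatter expert outputs back to their original slots and mix them with the routing weights.
    unpermuted = torch.zeros_like(permuted_tokens)
    unpermuted.index_copy_(0, sorted_indices, expert_output)
    unpermuted = unpermuted.view(-1, top_k, hidden_states.size(-1))
    return (unpermuted * top_k_weights.unsqueeze(-1)).sum(dim=1), tokens_per_expert

hidden = torch.randn(5, 16)             # 5 tokens, hidden size 16
idx = torch.randint(0, 8, (5, 2))       # top-2 expert ids out of 8 experts
w = torch.full((5, 2), 0.5)             # equal mixing weights
out, counts = dispatch_and_combine(hidden, idx, w, num_experts=8, top_k=2)
print(out.shape, counts)                # torch.Size([5, 16]) and per-expert token counts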

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 18 additions & 10 deletions
@@ -159,16 +159,24 @@ def update(
 
     def reorder_cache(self, beam_idx: torch.LongTensor):
         """Reorders the cache for beam search, given the selected beam indices."""
-        for layer_idx in range(len(self.key_cache)):
-            device = self.key_cache[layer_idx].device
-            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
-            device = self.value_cache[layer_idx].device
-            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
-
-            device = self.conv_states[layer_idx].device
-            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
-            device = self.ssm_states[layer_idx].device
-            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+        if self.get_seq_length() > 0:
+            for layer_idx in range(len(self.key_cache)):
+                device = self.key_cache[layer_idx].device
+                self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+                device = self.value_cache[layer_idx].device
+                self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+                device = self.conv_states[layer_idx].device
+                self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+                device = self.ssm_states[layer_idx].device
+                self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """Return the length and offset of the cache, used to generate the mask"""
+        kv_offset = 0
+        query_length = cache_position.shape[0]
+        kv_length = self.get_seq_length(layer_idx) + query_length
+        return kv_length, kv_offset
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         """Returns the sequence length of the cached states. A layer index can be optionally passed."""
