
Commit b605e1a

update mixtral

1 parent 08ad69b

File tree

src/transformers/models/mixtral/modeling_mixtral.py
src/transformers/models/mixtral/modular_mixtral.py

2 files changed: +2 -2 lines changed

src/transformers/models/mixtral/modeling_mixtral.py

Lines changed: 1 addition & 1 deletion

@@ -404,7 +404,7 @@ class MixtralPreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
     _supports_attention_backend = True
     _can_record_outputs = {
-        "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
+        "router_logits": OutputRecorder(MixtralTopKRouter, index=0),
         "hidden_states": MixtralDecoderLayer,
         "attentions": MixtralAttention,
     }

src/transformers/models/mixtral/modular_mixtral.py

Lines changed: 1 addition & 1 deletion

@@ -265,7 +265,7 @@ def forward(
 class MixtralPreTrainedModel(MistralPreTrainedModel):
     _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
     _can_record_outputs = {
-        "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
+        "router_logits": OutputRecorder(MixtralTopKRouter, index=0),
         "hidden_states": MixtralDecoderLayer,
         "attentions": MixtralAttention,
     }
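
Both hunks make the same swap: the "router_logits" recorder no longer matches the gating nn.Linear by layer_name="mlp.gate" but targets the dedicated MixtralTopKRouter module directly. As a hedged illustration (not part of the commit), the sketch below shows how the recorded router logits surface through the public Mixtral API; the checkpoint name, dtype, and device placement are illustrative assumptions.

# Minimal sketch, assuming a Mixtral checkpoint is available and fits in memory.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mixtral-8x7B-v0.1"  # illustrative checkpoint choice
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model(**inputs, output_router_logits=True)

# One router-logit tensor per decoder layer, holding each token's scores over the experts.
print(len(out.router_logits), out.router_logits[0].shape)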
