Commit eb00ffd

mgoin authored and devpatelio committed
[Bugfix] Disable shared expert overlap if Marlin MoE is used (vllm-project#28410)
Signed-off-by: mgoin <mgoin64@gmail.com>
1 parent 35beebc commit eb00ffd

File tree

6 files changed: +13 −5 lines changed

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 4 additions & 0 deletions
@@ -678,6 +678,10 @@ def use_flashinfer_cutlass_kernels(self):
             and self.moe_config.use_flashinfer_cutlass_kernels
         )
 
+    @property
+    def use_marlin_kernels(self):
+        return getattr(self.quant_method, "use_marlin", False)
+
     @property
     def use_dp_chunking(self) -> bool:
         return (

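The new property simply defers to the layer's quantization method: Marlin-backed methods set use_marlin = True in their constructors (see the changes below), while every other method never defines the attribute and falls through to the getattr default of False. A minimal sketch of that resolution, using made-up stand-in classes rather than vLLM's real quantization methods:

class _NonMarlinMethod:
    # Never sets use_marlin, so the property reports False.
    pass

class _MarlinMethod:
    def __init__(self):
        # Opts in, as the quantization methods changed below now do.
        self.use_marlin = True

for method in (_NonMarlinMethod(), _MarlinMethod()):
    # Mirrors FusedMoE.use_marlin_kernels: a missing attribute defaults to False.
    print(type(method).__name__, getattr(method, "use_marlin", False))
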
vllm/model_executor/layers/fused_moe/shared_fused_moe.py

Lines changed: 5 additions & 5 deletions
@@ -28,17 +28,17 @@ def __init__(
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
 
-        # Disable shared expert overlap if we are using eplb, because of
-        # correctness issues, or if using flashinfer with DP, since there
-        # is nothing to be gained in this case. Disabling the overlap
-        # optimization also prevents the shared experts from being hidden
-        # from torch.compile.
+        # Disable shared expert overlap if:
+        # - we are using eplb, because of correctness issues
+        # - we are using flashinfer with DP, since there is nothing to gain
+        # - we are using Marlin kernels
         self.use_overlapped = (
             use_overlapped
             and not (
                 # TODO(wentao): find the root cause and remove this condition
                 self.enable_eplb
                 or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+                or self.use_marlin_kernels
             )
             and self._shared_experts is not None
         )

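Taken together, the overlap decision after this change reduces to roughly the predicate below. This is a simplified sketch with the relevant flags pulled out as plain parameters, not the actual SharedFusedMoE code:

def should_overlap_shared_experts(
    use_overlapped: bool,
    enable_eplb: bool,
    use_flashinfer_cutlass_kernels: bool,
    dp_size: int,
    use_marlin_kernels: bool,
    has_shared_experts: bool,
) -> bool:
    # Any one of these conditions forces the shared experts onto the
    # regular (non-overlapped) execution path.
    blocked = (
        enable_eplb
        or (use_flashinfer_cutlass_kernels and dp_size > 1)
        or use_marlin_kernels
    )
    return use_overlapped and not blocked and has_shared_experts

# With Marlin MoE kernels active, overlap stays off even when requested.
assert not should_overlap_shared_experts(
    use_overlapped=True,
    enable_eplb=False,
    use_flashinfer_cutlass_kernels=False,
    dp_size=1,
    use_marlin_kernels=True,
    has_shared_experts=True,
)
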
vllm/model_executor/layers/quantization/awq_marlin.py

Lines changed: 1 addition & 0 deletions
@@ -424,6 +424,7 @@ def __init__(
         if self.quant_config.weight_bits != 4:
             raise ValueError("AWQMoEMethod only supports 4bit now.")
         self.quant_type = scalar_types.uint4
+        self.use_marlin = True
 
     def create_weights(
         self,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 1 addition & 0 deletions
@@ -1342,6 +1342,7 @@ def __init__(
                 f"{WNA16_SUPPORTED_BITS}",
             )
         self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
+        self.use_marlin = True
 
     def create_weights(
         self,

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 1 addition & 0 deletions
@@ -482,6 +482,7 @@ def __init__(
             self.quant_type = scalar_types.uint8b128
         else:
             raise ValueError("GPTQMarlinMoEMethod only supports int4 and int8 now.")
+        self.use_marlin = True
 
     def create_weights(
         self,

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 1 addition & 0 deletions
@@ -216,6 +216,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
         self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+        self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
         )
