@@ -20,24 +20,15 @@
     _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    MarlinExperts,
+    modular_marlin_fused_moe,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    TritonExperts,
+    modular_triton_fused_moe,
     try_get_optimal_moe_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
 )
-from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    UnfusedOAITritonExperts,
-)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel,
-)
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 
 from .utils import _get_lora_device
 
@@ -123,23 +114,15 @@ def _inject_lora_into_fused_moe(self): |
         self.base_layer.ensure_moe_quant_config_init()
         quant_config = self.base_layer.quant_method.moe_quant_config
 
-        prepare_finalize = MoEPrepareAndFinalizeNoEP()
-        m_fused_moe_fn = FusedMoEModularKernel(
-            prepare_finalize,
-            self.base_layer.quant_method.select_gemm_impl(
-                prepare_finalize, self.base_layer
-            ),
-            self.base_layer.shared_experts,
-            getattr(self.base_layer, "shared_experts_stream", None),
-        )
-        if quant_config.use_mxfp4_w4a16:
-            assert isinstance(
-                m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts)
+        m_fused_moe_fn = (
+            modular_triton_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
             )
-        else:
-            assert isinstance(
-                m_fused_moe_fn.fused_experts, (MarlinExperts, TritonExperts)
+            if not quant_config.use_mxfp4_w4a16
+            else modular_marlin_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
             )
+        )
 
         def fwd_decorator(layer, func):
             def wrapper(*args, **kwargs):
|
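Note on the new helpers: `modular_triton_fused_moe` and `modular_marlin_fused_moe` replace the hand-built `FusedMoEModularKernel` deleted above. The sketch below is only a reconstruction inferred from the removed lines; the factory signatures and, in particular, how `TritonExperts`/`MarlinExperts` are parameterized by `quant_config` are assumptions, not the actual vLLM implementation.

```python
# Hedged sketch only: mirrors the construction deleted in this diff rather than
# the real bodies of modular_triton_fused_moe / modular_marlin_fused_moe.
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import MarlinExperts
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.prepare_finalize import MoEPrepareAndFinalizeNoEP


def modular_triton_fused_moe(quant_config, shared_experts=None):
    """Bundle a no-EP prepare/finalize stage with the Triton experts kernel."""
    # Assumption: TritonExperts is configured directly from the layer's quant
    # config; the removed code instead obtained the experts implementation via
    # quant_method.select_gemm_impl(). The shared_experts_stream argument that
    # the old call forwarded is omitted here.
    return FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        TritonExperts(quant_config),
        shared_experts,
    )


def modular_marlin_fused_moe(quant_config, shared_experts=None):
    """Same wiring, but with the Marlin experts kernel (the mxfp4 w4a16 path)."""
    return FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        MarlinExperts(quant_config),
        shared_experts,
    )
```

The practical effect of the change is that the LoRA layer no longer wires the modular kernel together by hand or asserts on the concrete experts type; the only branching left at the call site is Triton vs. Marlin on `use_mxfp4_w4a16`. One visible difference is that the old construction forwarded `getattr(self.base_layer, "shared_experts_stream", None)` while the new calls pass only `quant_config` and `shared_experts`, so the factories presumably handle (or intentionally drop) the shared-experts stream internally.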