
Commit e44cafd

Merge pull request #75 from vkuzo/20251008_nvfp4_moe
extend nvfp4 logic to MoEs
2 parents b130677 + 56b29a1 commit e44cafd

File tree

2 files changed: +37, -11 lines changed


hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -84,7 +84,20 @@ def tokenize(sample):
 # * quantize the weights to fp4 with per group 16 via ptq
 # * calibrate a global_scale for activations, which will be used to
 # quantize activations to fp4 on the fly
-recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=[
+        "lm_head",
+        # for Qwen MoE, but ok to just hardcode here for now
+        # https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
+        "re:.*mlp.gate$",
+        "re:.*mlp.shared_expert_gate$",
+        # also skip attention and shared expert, to focus on MoE for now
+        "re:.*self_attn.*",
+        "re:.*shared_expert.*",
+    ],
+)
 
 # Apply quantization.
 oneshot(
```
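For context on how this recipe is consumed: it feeds llm-compressor's standard one-shot PTQ entry point. Below is a minimal sketch of that flow, assuming a Qwen MoE checkpoint such as Qwen/Qwen1.5-MoE-A2.7B and the built-in open_platypus calibration set; the model id, dataset, sample count, and output directory are illustrative and not taken from this script.

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Same shape as the recipe in the diff above: quantize Linear weights to
# NVFP4, but keep lm_head, the MoE router, attention, and the shared
# expert in the original precision.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "lm_head",
        "re:.*mlp.gate$",
        "re:.*mlp.shared_expert_gate$",
        "re:.*self_attn.*",
        "re:.*shared_expert.*",
    ],
)

# One-shot PTQ: loads the model, runs calibration to fit the NVFP4 global
# scales, and writes out a compressed-tensors checkpoint.
# Model id, dataset, sample count, and output dir are illustrative.
oneshot(
    model="Qwen/Qwen1.5-MoE-A2.7B",
    dataset="open_platypus",
    recipe=recipe,
    output_dir="Qwen1.5-MoE-A2.7B-NVFP4",
    max_seq_length=2048,
    num_calibration_samples=512,
)
```

The `re:`-prefixed entries in `ignore` are matched against module FQNs as regexes, so the MoE router (`mlp.gate`), the shared expert, and the attention projections stay in the original dtype while the routed expert projections are quantized.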

hf_torchao_vllm/quantize_hf_model_with_torchao.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -121,14 +121,21 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+        expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
+
         return TorchAoConfig(
             quant_type=module_fqn_to_config,
         )
@@ -162,12 +169,18 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+        expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
         return TorchAoConfig(
```
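The substantive change in both hunks is an inversion of the mapping: previously only the routed expert projections were explicitly assigned `single_config` and `_default` was `None` (everything else left unquantized); now `_default` carries `single_config` and the attention projections, MoE router, shared expert, shared expert gate, and `lm_head` are explicitly opted out with `None`. Below is a minimal, self-contained sketch of how such a mapping can be handed to transformers; `Float8DynamicActivationFloat8WeightConfig` is used only as a stand-in because the NVFP4 `single_config` the script actually builds is not visible in this diff, and the model id is illustrative.

```python
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, ModuleFqnToConfig

# Stand-in for the script's `single_config`; the real NVFP4 config is
# defined elsewhere in the file and is not part of this diff.
single_config = Float8DynamicActivationFloat8WeightConfig()

# Quantize everything by default, then opt specific modules out with None.
expert_fqn_to_config = {}
for layer_idx in range(24):  # Qwen1.5-MoE-A2.7B has 24 decoder layers
    for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
        expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.{proj}"] = None
    expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
    for proj in ("gate_proj", "up_proj", "down_proj"):
        expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.{proj}"] = None
    expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
expert_fqn_to_config["lm_head"] = None

quantization_config = TorchAoConfig(
    quant_type=ModuleFqnToConfig({"_default": single_config, **expert_fqn_to_config}),
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B",  # illustrative model id
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
```

The net effect mirrors the llm-compressor recipe above: the routed expert projections get quantized via the default entry, while everything involved in routing, attention, and the shared expert stays in high precision.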
