
Commit e44cafd

Merge pull request #75 from vkuzo/20251008_nvfp4_moe
extend nvfp4 logic to MoEs
2 parents b130677 + 56b29a1 commit e44cafd

File tree

2 files changed: +37, -11 lines changed


hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -84,7 +84,20 @@ def tokenize(sample):
 # * quantize the weights to fp4 with per group 16 via ptq
 # * calibrate a global_scale for activations, which will be used to
 # quantize activations to fp4 on the fly
-recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=[
+        "lm_head",
+        # for Qwen MoE, but ok to just hardcode here for now
+        # https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
+        "re:.*mlp.gate$",
+        "re:.*mlp.shared_expert_gate$",
+        # also skip attention and shared expert, to focus on MoE for now
+        "re:.*self_attn.*",
+        "re:.*shared_expert.*",
+    ],
+)
 
 # Apply quantization.
 oneshot(
```
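For context on how this recipe is consumed: it feeds llm-compressor's standard one-shot PTQ entry point. Below is a minimal sketch of that flow, assuming a Qwen MoE checkpoint such as Qwen/Qwen1.5-MoE-A2.7B and the built-in open_platypus calibration set; the model id, dataset, sample count, and output directory are illustrative and not taken from this script.

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Same shape as the recipe in the diff above: quantize Linear weights to
# NVFP4, but keep lm_head, the MoE router, attention, and the shared
# expert in the original precision.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "lm_head",
        "re:.*mlp.gate$",
        "re:.*mlp.shared_expert_gate$",
        "re:.*self_attn.*",
        "re:.*shared_expert.*",
    ],
)

# One-shot PTQ: loads the model, runs calibration to fit the NVFP4 global
# scales, and writes out a compressed-tensors checkpoint.
# Model id, dataset, sample count, and output dir are illustrative.
oneshot(
    model="Qwen/Qwen1.5-MoE-A2.7B",
    dataset="open_platypus",
    recipe=recipe,
    output_dir="Qwen1.5-MoE-A2.7B-NVFP4",
    max_seq_length=2048,
    num_calibration_samples=512,
)
```

The `re:`-prefixed entries in `ignore` are matched against module FQNs as regexes, so the MoE router (`mlp.gate`), the shared expert, and the attention projections stay in the original dtype while the routed expert projections are quantized.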

hf_torchao_vllm/quantize_hf_model_with_torchao.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -121,14 +121,21 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+        expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
+
         return TorchAoConfig(
             quant_type=module_fqn_to_config,
         )
@@ -162,12 +169,18 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+        expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
         return TorchAoConfig(
```
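The substantive change in both hunks is an inversion of the mapping: previously only the routed expert projections were explicitly assigned `single_config` and `_default` was `None` (everything else left unquantized); now `_default` carries `single_config` and the attention projections, MoE router, shared expert, shared expert gate, and `lm_head` are explicitly opted out with `None`. Below is a minimal, self-contained sketch of how such a mapping can be handed to transformers; `Float8DynamicActivationFloat8WeightConfig` is used only as a stand-in because the NVFP4 `single_config` the script actually builds is not visible in this diff, and the model id is illustrative.

```python
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, ModuleFqnToConfig

# Stand-in for the script's `single_config`; the real NVFP4 config is
# defined elsewhere in the file and is not part of this diff.
single_config = Float8DynamicActivationFloat8WeightConfig()

# Quantize everything by default, then opt specific modules out with None.
expert_fqn_to_config = {}
for layer_idx in range(24):  # Qwen1.5-MoE-A2.7B has 24 decoder layers
    for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
        expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.{proj}"] = None
    expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
    for proj in ("gate_proj", "up_proj", "down_proj"):
        expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.{proj}"] = None
    expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
expert_fqn_to_config["lm_head"] = None

quantization_config = TorchAoConfig(
    quant_type=ModuleFqnToConfig({"_default": single_config, **expert_fqn_to_config}),
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B",  # illustrative model id
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
```

The net effect mirrors the llm-compressor recipe above: the routed expert projections get quantized via the default entry, while everything involved in routing, attention, and the shared expert stays in high precision.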
