@@ -23,6 +23,18 @@ def run(
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
+    ignore_list = ["lm_head"]
+    if model_name == "Qwen1.5-MoE-A2.7B":
+        ignore_list.extend(
+            [
+                "re:.*mlp.gate$",
+                "re:.*mlp.shared_expert_gate$",
+                # also skip attention and shared expert, to focus on MoE for now
+                "re:.*self_attn.*",
+                "re:.*shared_expert.*",
+            ]
+        )
+
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.
         # In this case, we:
@@ -31,16 +43,7 @@ def run(
         recipe = QuantizationModifier(
             targets="Linear",
             scheme="FP8_DYNAMIC",
-            ignore=[
-                "lm_head",
-                # for Qwen MoE, but ok to just hardcode here for now
-                # https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
-                "re:.*mlp.gate$",
-                "re:.*mlp.shared_expert_gate$",
-                # also skip attention and shared expert, to focus on MoE for now
-                "re:.*self_attn.*",
-                "re:.*shared_expert.*",
-            ],
+            ignore=ignore_list,
         )
 
         # Apply quantization.
@@ -89,16 +92,7 @@ def tokenize(sample):
         recipe = QuantizationModifier(
             targets="Linear",
             scheme="NVFP4",
-            ignore=[
-                "lm_head",
-                # for Qwen MoE, but ok to just hardcode here for now
-                # https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
-                "re:.*mlp.gate$",
-                "re:.*mlp.shared_expert_gate$",
-                # also skip attention and shared expert, to focus on MoE for now
-                "re:.*self_attn.*",
-                "re:.*shared_expert.*",
-            ],
+            ignore=ignore_list,
         )
 
         # Apply quantization.
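
For reference (not part of this diff), a minimal sketch of the "Apply quantization" step that follows the recipe construction above, assuming llm-compressor's standard oneshot flow; names such as save_dir are illustrative, not taken from this commit:

    # Sketch only: assumes the standard llm-compressor one-shot entry point.
    from llmcompressor import oneshot

    # FP8_DYNAMIC needs no calibration data; NVFP4 is typically calibrated on a
    # small dataset passed via dataset= / num_calibration_samples=.
    oneshot(model=model, recipe=recipe)

    # Save the compressed checkpoint alongside the tokenizer (illustrative path).
    save_dir = model_name.split("/")[-1] + "-quantized"
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)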