Skip to content

Commit db8f2ff

Browse files
committed
small fix for llmcompressor dense example
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent e145bda commit db8f2ff

File tree

1 file changed

+14
-20
lines changed

1 file changed

+14
-20
lines changed

hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py

Lines changed: 14 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,18 @@ def run(
2323
print(model)
2424
tokenizer = AutoTokenizer.from_pretrained(model_name)
2525

26+
ignore_list = ["lm_head"]
27+
if model_name == "Qwen1.5-MoE-A2.7B":
28+
ignore_list.extend(
29+
[
30+
"re:.*mlp.gate$",
31+
"re:.*mlp.shared_expert_gate$",
32+
# also skip attention and shared expert, to focus on MoE for now
33+
"re:.*self_attn.*",
34+
"re:.*shared_expert.*",
35+
]
36+
)
37+
2638
if quant_type == "fp8":
2739
# Configure the quantization algorithm and scheme.
2840
# In this case, we:
@@ -31,16 +43,7 @@ def run(
3143
recipe = QuantizationModifier(
3244
targets="Linear",
3345
scheme="FP8_DYNAMIC",
34-
ignore=[
35-
"lm_head",
36-
# for Qwen MoE, but ok to just hardcode here for now
37-
# https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
38-
"re:.*mlp.gate$",
39-
"re:.*mlp.shared_expert_gate$",
40-
# also skip attention and shared expert, to focus on MoE for now
41-
"re:.*self_attn.*",
42-
"re:.*shared_expert.*",
43-
],
46+
ignore=ignore_list,
4447
)
4548

4649
# Apply quantization.
@@ -89,16 +92,7 @@ def tokenize(sample):
8992
recipe = QuantizationModifier(
9093
targets="Linear",
9194
scheme="NVFP4",
92-
ignore=[
93-
"lm_head",
94-
# for Qwen MoE, but ok to just hardcode here for now
95-
# https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
96-
"re:.*mlp.gate$",
97-
"re:.*mlp.shared_expert_gate$",
98-
# also skip attention and shared expert, to focus on MoE for now
99-
"re:.*self_attn.*",
100-
"re:.*shared_expert.*",
101-
],
95+
ignore=ignore_list,
10296
)
10397

10498
# Apply quantization.

0 commit comments

Comments (0)