
Commit 9de0416

Merge pull request #80 from vkuzo/llmc_llama4_moe
llm_compressor example of llama 4 scout
2 parents: 0e7b53e + 8fc7288

File tree

1 file changed: +11 additions, −4 deletions

hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py

Lines changed: 11 additions & 4 deletions
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.
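For context, the sketch below shows how an ignore list like the one added in this diff plausibly feeds into the FP8 recipe using llm-compressor's standard oneshot flow. It is not the repository's actual quantize_hf_model_with_llm_compressor.py: only the model name and the ignore patterns are taken from the diff above; the recipe construction, the dynamic FP8 scheme, and the save_dir output path are illustrative assumptions based on common llm-compressor usage.

# Minimal sketch, assuming the standard llm-compressor oneshot flow.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Modules kept in high precision, mirroring the patterns added in this commit:
# attention, the MoE router, and the vision tower / multimodal projector.
ignore_list = [
    "lm_head",
    "re:.*self_attn",
    "re:.*router",
    "re:.*vision_model.*",
    "re:.*multi_modal_projector.*",
    "Llama4TextAttention",
]

# FP8 dynamic quantization (per-channel weights, per-token activations)
# needs no calibration data, so oneshot can run without a dataset.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=ignore_list)
oneshot(model=model, recipe=recipe)

# Save in compressed-tensors format so vLLM can load the result directly.
save_dir = model_name.split("/")[-1] + "-FP8-Dynamic"  # hypothetical output path
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)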
