1 file changed, +11 -4 lines changed
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
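The hunks above drop the hard-coded torch.bfloat16 (and the now-unused torch import) in favor of torch_dtype="auto", which loads weights in whatever dtype the checkpoint's config.json records. A quick way to check what "auto" will resolve to, as a minimal sketch (the model id is a stand-in, and torch_dtype may be None for configs that do not record one):

    from transformers import AutoConfig

    # torch_dtype="auto" defers to the dtype stored in the checkpoint config,
    # so the script no longer needs to import torch just to name a dtype.
    cfg = AutoConfig.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")
    print(cfg.torch_dtype)  # typically torch.bfloat16 for recent checkpoints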
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.
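For context on where the new ignore patterns land: in the stock llm-compressor one-shot flow, ignore_list is handed to QuantizationModifier, so matched modules (attention, MoE router, vision tower, projector) are skipped by the quantizer and kept in the checkpoint dtype. A minimal sketch under that assumption; the FP8_DYNAMIC scheme is illustrative and this is not the exact script body from this PR:

    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier
    from transformers import AutoModelForCausalLM

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

    # "re:"-prefixed entries are regexes over module names; bare entries such
    # as Llama4TextAttention match module class names.
    ignore_list = [
        "re:.*self_attn",
        "re:.*router",
        "re:.*vision_model.*",
        "re:.*multi_modal_projector.*",
        "Llama4TextAttention",
    ]

    recipe = QuantizationModifier(
        targets="Linear",      # quantize every Linear module...
        scheme="FP8_DYNAMIC",  # ...to FP8 with dynamic activation scales
        ignore=ignore_list,    # ...except modules matching these patterns
    )
    oneshot(model=model, recipe=recipe)  # applies the recipe to the model in place

Skipping the router and attention keeps the numerically sensitive parts of an MoE checkpoint at full precision, while the bulk of the parameters (the expert MLPs) are still quantized.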