
Commit 9de0416

Merge pull request #80 from vkuzo/llmc_llama4_moe
llm_compressor example of llama 4 scout
2 parents: 0e7b53e + 8fc7288

File tree

1 file changed: +11 additions, −4 deletions

hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py

Lines changed: 11 additions & 4 deletions
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.
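For context, the sketch below shows how an ignore list like the one added in this diff plausibly feeds into the FP8 recipe using llm-compressor's standard oneshot flow. It is not the repository's actual quantize_hf_model_with_llm_compressor.py: only the model name and the ignore patterns are taken from the diff above; the recipe construction, the dynamic FP8 scheme, and the save_dir output path are illustrative assumptions based on common llm-compressor usage.

# Minimal sketch, assuming the standard llm-compressor oneshot flow.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Modules kept in high precision, mirroring the patterns added in this commit:
# attention, the MoE router, and the vision tower / multimodal projector.
ignore_list = [
    "lm_head",
    "re:.*self_attn",
    "re:.*router",
    "re:.*vision_model.*",
    "re:.*multi_modal_projector.*",
    "Llama4TextAttention",
]

# FP8 dynamic quantization (per-channel weights, per-token activations)
# needs no calibration data, so oneshot can run without a dataset.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=ignore_list)
oneshot(model=model, recipe=recipe)

# Save in compressed-tensors format so vLLM can load the result directly.
save_dir = model_name.split("/")[-1] + "-FP8-Dynamic"  # hypothetical output path
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)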
