Commit 4d43646

[wip] llama 4 scout expert quant
Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent 593a1d1 commit 4d43646

File tree

1 file changed: +64 additions, −2 deletions


hf_torchao_vllm/quantize_hf_model_with_torchao.py

Lines changed: 64 additions & 2 deletions
@@ -83,6 +83,7 @@ def get_quantization_config(args):
                 # TODO tool to find this (I used bisect on this tiny model).
                 activation_value_lb=1.0e-12,
             )
+
             if args.experts_only_qwen_1_5_moe_a_2_7b:
                 expert_fqn_to_config = {}
                 # TODO(future PR): this is annoying, I should be able to use a regex here
@@ -125,6 +126,32 @@ def get_quantization_config(args):
                 return TorchAoConfig(
                     quant_type=module_fqn_to_config,
                 )
+            elif args.ffn_only_llama_4_scout:
+                # TODO gate this properly
+                expert_3d_weight_single_config = Float8DynamicActivationFloat8WeightConfig(
+                    # the weights of this model are stored in (B, K, N) layout, and we
+                    # need to quantize rowwise across the K axis, which is `PerRow(1)`.
+                    granularity=[PerRow(), PerRow(1)],
+                    # the 125m model has a lot of activation zeroes for some
+                    # prompts, need to set a lower bound to prevent scales from
+                    # being 0.
+                    # TODO seems like torchao should do this for me.
+                    # TODO tool to find this (I used bisect on this tiny model).
+                    activation_value_lb=1.0e-12,
+                )
+                module_fqn_to_config = ModuleFqnToConfig(
+                    {
+                        r"re:.*\.feed_forward\.experts\.gate_up_proj": expert_3d_weight_single_config,
+                        r"re:.*\.feed_forward\.experts\.down_proj": expert_3d_weight_single_config,
+                        r"re:.*\.shared_expert\.down_proj": single_config,
+                        r"re:.*\.shared_expert\.up_proj": single_config,
+                        r"re:.*\.shared_expert\.gate_proj": single_config,
+                    }
+                )
+                return TorchAoConfig(
+                    quant_type=module_fqn_to_config,
+                )
+
             else:
                 return TorchAoConfig(single_config)
         case "int4_weight_only":
@@ -345,6 +372,7 @@ def main(
     device_map: str = "cuda",
     experts_only_qwen_1_5_moe_a_2_7b: bool = False,
     skip_gate_qwen_1_5_moe_a_2_7b: bool = False,
+    ffn_only_llama_4_scout: bool = False,
     save_model_to_disk: bool = True,
 ):
     """
@@ -363,6 +391,7 @@ def main(
         device_map: Device mapping strategy
         experts_only_qwen_1_5_moe_a_2_7b: if True, quantizes experts only for Qwen1.5-MoE-A2.7B model
         skip_gate_qwen_1_5_moe_a_2_7b: if True, skips gate quantization for Qwen1.5-MoE-A2.7B model
+        ffn_only_llama_4_scout: if True, quantizes FFN only for meta-llama/Llama-4-Scout-17B-16E-Instruct
         save_model_to_disk: if True, saves quantized model to local disk
     """
     # Test prompts
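
As a usage note, the new flag would presumably be passed straight through to main(); a hypothetical invocation is sketched below. Only ffn_only_llama_4_scout, save_model_to_disk, and model_name are visible in this diff; the quant type string and import path are assumptions and may not match the script exactly.

# Hypothetical invocation; argument names not shown in this diff are assumed.
from quantize_hf_model_with_torchao import main

main(
    model_name="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    quant_type="float8_dyn_act_float8_weight",  # assumed spelling of the float8 case
    ffn_only_llama_4_scout=True,
    save_model_to_disk=True,
)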
@@ -397,6 +426,7 @@ def main(
         experts_only_qwen_1_5_moe_a_2_7b=experts_only_qwen_1_5_moe_a_2_7b,
         save_model_to_disk=save_model_to_disk,
         skip_gate_qwen_1_5_moe_a_2_7b=skip_gate_qwen_1_5_moe_a_2_7b,
+        ffn_only_llama_4_scout=ffn_only_llama_4_scout,
     )
     print(f"{args=}")
 
@@ -415,6 +445,7 @@ def main(
     # Get quantization config
     quantization_config = get_quantization_config(args)
 
+    # TODO(before land): clean up the chat processor code
     if args.model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
         # TODO(future): maybe unify with the else branch, need to figure
         # out the right syntax for preparing inputs and running generation
@@ -430,6 +461,27 @@ def main(
             torch_dtype=torch.bfloat16,
             quantization_config=quantization_config,
         )
+        print(quantized_model)
+
+        print(
+            "quantized_model.language_model.model.layers[47].feed_forward.experts.down_proj",
+            type(
+                quantized_model.language_model.model.layers[
+                    47
+                ].feed_forward.experts.down_proj
+            ),
+        )
+        print(
+            "quantized_model.language_model.model.layers[47].feed_forward.experts.gate_up_proj",
+            type(
+                quantized_model.language_model.model.layers[
+                    47
+                ].feed_forward.experts.gate_up_proj
+            ),
+        )
+
+        # breakpoint()
+        # return
 
         messages = []
         for prompt in prompts[:1]:
@@ -462,6 +514,8 @@ def main(
         for response in responses:
             print(response)
 
+        return
+
     else:
         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model_name)
@@ -495,8 +549,16 @@ def main(
     if args.save_model_to_disk:
         # Save quantized model
         print(f"\nSaving quantized model to: {output_dir}")
-        quantized_model.save_pretrained(output_dir, safe_serialization=False)
-        tokenizer.save_pretrained(output_dir)
+        if args.model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+            quantized_model.save_pretrained(
+                output_dir, safe_serialization=False
+            )
+            processor.save_pretrained(output_dir)
+        else:
+            quantized_model.save_pretrained(
+                output_dir, safe_serialization=False
+            )
+            tokenizer.save_pretrained(output_dir)
 
     # Push to HuggingFace hub if requested
     if args.push_to_hub:
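
Since the Scout branch now saves the multimodal processor instead of a tokenizer, a round-trip reload of the saved checkpoint might look like the sketch below. The output path is a placeholder, and the model class is an assumption based on recent transformers releases where Llama 4 Scout loads as a conditional-generation model.

# Hypothetical reload of the saved checkpoint; output_dir is a placeholder and
# the model class is assumed, not confirmed by this diff.
import torch
from transformers import AutoProcessor, Llama4ForConditionalGeneration

output_dir = "data/llama-4-scout-float8-ffn"  # placeholder path
processor = AutoProcessor.from_pretrained(output_dir)
model = Llama4ForConditionalGeneration.from_pretrained(
    output_dir,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)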
