Commit 6f51b4c

Quant fallback to 8w per token + other quant improvements for multimodal (#154)

1 parent cebeb3d
File tree: 4 files changed, +164 −33 lines

optimum/commands/export/executorch.py

Lines changed: 28 additions & 2 deletions
@@ -76,12 +76,14 @@ def parse_args_executorch(parser):
     required_group.add_argument(
         "--qlinear",
         type=str,
-        choices=["8da4w", "4w", "8w"],
+        choices=["8da4w", "4w", "8w", "8da8w", "8da4w,8da8w"],
         required=False,
         help=(
             "Quantization config for decoder linear layers.\n\n"
             "Options:\n"
             " 8da4w - 8-bit dynamic activation, 4-bit weight\n"
+            " 8da8w - 8-bit dynamic activation, 8-bit weight\n"
+            " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight and 8-bit weight\n"
             " 4w - 4-bit weight only\n"
             " 8w - 8-bit weight only"
         ),
@@ -104,12 +106,14 @@ def parse_args_executorch(parser):
     required_group.add_argument(
         "--qlinear_encoder",
         type=str,
-        choices=["8da4w", "4w", "8w"],
+        choices=["8da4w", "4w", "8w", "8da8w", "8da4w,8da8w"],
         required=False,
         help=(
             "Quantization config for encoder linear layers.\n\n"
             "Options:\n"
             " 8da4w - 8-bit dynamic activation, 4-bit weight\n"
+            " 8da8w - 8-bit dynamic activation, 8-bit weight\n"
+            " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight; fallback to 8-bit dynamic activation, 8-bit weight per-channel where the group size doesn't divide the layer's hidden dimension cleanly\n"
             " 4w - 4-bit weight only\n"
             " 8w - 8-bit weight only"
         ),
@@ -144,6 +148,24 @@ def parse_args_executorch(parser):
     required_group.add_argument(
         "--qembedding_group_size", type=int, required=False, help="Group size for embedding quantization."
     )
+    required_group.add_argument(
+        "--qembedding_encoder",
+        type=str,
+        choices=["4w", "8w"],
+        required=False,
+        help=(
+            "Quantization config for the encoder embedding layer, for model architectures with an encoder.\n\n"
+            "Options:\n"
+            " 4w - 4-bit weight only\n"
+            " 8w - 8-bit weight only"
+        ),
+    )
+    required_group.add_argument(
+        "--qembedding_encoder_group_size",
+        type=int,
+        required=False,
+        help="Group size for encoder embedding quantization, for model architectures with an encoder.",
+    )
     required_group.add_argument(
         "--max_seq_len",
         type=int,
@@ -220,6 +242,10 @@ def run(self):
             kwargs["qembedding"] = self.args.qembedding
         if self.args.qembedding_group_size:
            kwargs["qembedding_group_size"] = self.args.qembedding_group_size
+        if self.args.qembedding_encoder:
+            kwargs["qembedding_encoder"] = self.args.qembedding_encoder
+        if self.args.qembedding_encoder_group_size:
+            kwargs["qembedding_encoder_group_size"] = self.args.qembedding_encoder_group_size
         if self.args.max_seq_len:
             kwargs["max_seq_len"] = self.args.max_seq_len
         if hasattr(self.args, "dtype") and self.args.dtype:
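Editor's note on the new "8da4w,8da8w" choice: it is a comma-separated pair, with the primary config first and an optional per-token fallback second. The sketch below is a hypothetical standalone helper (the name split_qlinear_config is not in the repo; the authoritative parsing lives in quantize_model_ in quantization.py) showing how such a value splits into the two keys.

from typing import Optional, Tuple


def split_qlinear_config(value: str) -> Tuple[str, Optional[str]]:
    """Hypothetical helper mirroring how quantize_model_ interprets --qlinear values."""
    parts = [part.strip() for part in value.split(",")]
    if any(not part for part in parts):
        raise ValueError("Linear quantization config entries must be non-empty.")
    if len(parts) > 2:
        raise ValueError("Expected at most one fallback linear quantization config.")
    # First entry is the primary config; the optional second entry is the per-token fallback.
    return parts[0], (parts[1] if len(parts) == 2 else None)


assert split_qlinear_config("8da4w,8da8w") == ("8da4w", "8da8w")
assert split_qlinear_config("8w") == ("8w", None)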

optimum/exporters/executorch/quantization.py

Lines changed: 82 additions & 25 deletions
@@ -44,7 +44,7 @@ def quantize_model_(
         if qlinear_config == "8w":
             assert (
                 qembedding_group_size == 0
-            ), "8-bit embedding quantization only supports per-channel at the moment, please use qembedding_group_size = 0."
+            ), "8-bit embedding quantization only supports per-token at the moment, please use qembedding_group_size = 0."
         if qembedding_group_size == 0:
             embedding_weight_granularity = PerAxis(0)
         else:
@@ -71,42 +71,99 @@ def quantize_model_(
         )

     if qlinear_config:
+
+        def build_linear_config(quant_config_key: str, granularity: str, packing_format: Optional[str] = None):
+            if quant_config_key == "8da4w":
+                return Int8DynamicActivationIntxWeightConfig(
+                    weight_dtype=torch.int4,
+                    weight_granularity=granularity,
+                )
+            if quant_config_key == "4w":
+                # Determine if we need to use Int4WeightOnlyConfig with int4_packing_format
+                if packing_format:
+                    return Int4WeightOnlyConfig(
+                        group_size=qlinear_group_size,
+                        int4_packing_format=packing_format,
+                        int4_choose_qparams_algorithm="hqq",
+                    )
+                else:
+                    return IntxWeightOnlyConfig(
+                        weight_dtype=torch.int4,
+                        granularity=granularity,
+                    )
+            if quant_config_key == "8w":
+                return IntxWeightOnlyConfig(
+                    weight_dtype=torch.int8,
+                    granularity=granularity,
+                )
+            if quant_config_key == "8da8w":
+                return Int8DynamicActivationIntxWeightConfig(
+                    weight_dtype=torch.int8,
+                    weight_granularity=PerAxis(0),
+                )
+            raise ValueError(f"Unsupported linear quantization config '{quant_config_key}'.")
+
+        qlinear_configs = [cfg.strip() for cfg in qlinear_config.split(",")]
+        if any(cfg == "" for cfg in qlinear_configs):
+            raise ValueError("Linear quantization config entries must be non-empty.")
+        if len(qlinear_configs) > 2:
+            raise ValueError("Expected at most one fallback linear quantization config, got more than one comma.")
+
+        primary_linear_config_key = qlinear_configs[0]
+        fallback_linear_config_key = qlinear_configs[1] if len(qlinear_configs) == 2 else None
+
         if qlinear_group_size == 0:
             linear_weight_granularity = PerAxis(0)
+            if fallback_linear_config_key is not None:
+                logging.warning(
+                    "qlinear_group_size is 0, fallback linear config will not be used as all layers will be quantized with per-axis granularity."
+                )
+                fallback_linear_config_key = None
         else:
-            assert qlinear_group_size % 2 == 0, "Linear quantization group size must be a multiple of 2."
+            assert (
+                qlinear_group_size % 2 == 0
+            ), f"Linear quantization group size must be a multiple of 2, got {qlinear_group_size}."
             linear_weight_granularity = PerGroup(qlinear_group_size)

         logging.info("Quantizing linear layers.")
+        primary_linear_config = build_linear_config(
+            primary_linear_config_key, linear_weight_granularity, qlinear_packing_format
+        )

-        # Determine if we need to use Int4WeightOnlyConfig with int4_packing_format
-        if qlinear_config == "4w" and qlinear_packing_format:
-            linear_config = Int4WeightOnlyConfig(
-                group_size=qlinear_group_size,
-                int4_packing_format=qlinear_packing_format,
-                int4_choose_qparams_algorithm="hqq",
-            )
-        else:
-            linear_config = {
-                "8da4w": Int8DynamicActivationIntxWeightConfig(
-                    weight_dtype=torch.int4,
-                    weight_granularity=linear_weight_granularity,
-                ),
-                "4w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int4,
-                    granularity=linear_weight_granularity,
-                ),
-                "8w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int8,
-                    granularity=linear_weight_granularity,
-                ),
-            }[qlinear_config]
+        # First, quantize layers that are compatible with group quantization
+        def per_group_filter(module, fqn):
+            if isinstance(module, torch.nn.Linear):
+                # Check if hidden dimension is divisible by group size
+                # For Linear layers, weight shape is [out_features, in_features]
+                # Group quantization typically applies to the in_features dimension (dim=1)
+                return qlinear_group_size == 0 or (module.weight.shape[1] % qlinear_group_size == 0)
+            return False

         quantize_(
             eager_model,
-            linear_config,
+            primary_linear_config,
+            filter_fn=per_group_filter,
         )

+        # Then, quantize incompatible layers using the fallback per-axis config
+        if fallback_linear_config_key is not None:
+            fallback_linear_config = build_linear_config(fallback_linear_config_key, PerAxis(0))
+
+            def per_token_filter(module, fqn):
+                if isinstance(module, torch.nn.Linear):
+                    return module.weight.shape[1] % qlinear_group_size != 0
+                return False
+
+            logging.info(
+                f"Applying fallback linear config '{fallback_linear_config_key}' (per-axis)"
+                f" to layers incompatible with group size {qlinear_group_size}."
+            )
+            quantize_(
+                eager_model,
+                fallback_linear_config,
+                filter_fn=per_token_filter,
+            )
+
     # TODO: remove after ExecuTorch dep on Torch >= 2.10.0.
     if parse(torch_version) < parse("2.10.0.dev20251104"):
         unwrap_tensor_subclass(eager_model)
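To make the primary/fallback routing above concrete: with a group size of 32, a linear layer whose in_features is divisible by 32 receives the grouped primary config, while a layer like Gemma3's 4304-wide projection routes to the per-token fallback. Below is a small standalone sketch of that divisibility check; the layer names and shapes are illustrative, not taken from the repo.

import torch

group_size = 32
layers = {
    "text.mlp.up_proj": torch.nn.Linear(4096, 11008),  # 4096 % 32 == 0 -> primary per-group config
    "vision.projection": torch.nn.Linear(4304, 1152),  # 4304 % 32 != 0 -> per-token fallback config
}
for name, layer in layers.items():
    in_features = layer.weight.shape[1]  # weight is [out_features, in_features]
    route = "primary (per-group)" if in_features % group_size == 0 else "fallback (per-token)"
    print(f"{name}: in_features={in_features} -> {route}")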

optimum/exporters/executorch/tasks/multimodal_text_to_text.py

Lines changed: 36 additions & 3 deletions
@@ -14,6 +14,7 @@


 import json
+import logging
 import os.path

 import torchao
@@ -202,8 +203,12 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
     qlinear_encoder_packing_format = kwargs.get("qlinear_encoder_packing_format", None)
     qembedding_config = kwargs.get("qembedding", None)
     qembedding_group_size = kwargs.get("qembedding_group_size", None)
+    qembedding_encoder_config = kwargs.get("qembedding_encoder", None)
+    qembedding_encoder_group_size = kwargs.get("qembedding_encoder_group_size", None)

     # Quantize decoder linear weights.
+    if qlinear_config:
+        logging.info("Quantizing decoder linears...")
     quantize_decoder_kwargs = {
         "eager_model": getattr(eager_model, decoder_name),
         "qlinear_config": qlinear_config,
@@ -214,7 +219,26 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
         quantize_decoder_kwargs["qlinear_packing_format"] = qlinear_packing_format
     quantize_model_(**quantize_decoder_kwargs)

+    # Quantize lm head, if it is separate from the decoder model.
+    # e.g. Sometimes the top-level model will have:
+    # def __init__(self, ...):
+    #     self.decoder = ...
+    #     self.lm_head = ...  # lm_head is not part of the decoder instance
+    #     ...
+    if not hasattr(getattr(eager_model, decoder_name), "lm_head"):
+        if not hasattr(eager_model, "lm_head"):
+            raise AttributeError(
+                f"Could not find `lm_head` for {model_name_or_path}, please double check if this is expected."
+            )
+        quantize_lm_head_kwargs = {
+            "eager_model": eager_model.lm_head,
+            "qlinear_config": qlinear_config,
+        }
+        quantize_model_(**quantize_lm_head_kwargs)
+
     # Quantize encoder linear weights.
+    if qlinear_encoder_config:
+        logging.info("Quantizing encoder linears...")
     quantize_encoder_kwargs = {
         "eager_model": getattr(eager_model, encoder_name),
         "qlinear_config": qlinear_encoder_config,
@@ -225,9 +249,9 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
         quantize_encoder_kwargs["qlinear_packing_format"] = qlinear_encoder_packing_format
     quantize_model_(**quantize_encoder_kwargs)

-    # TODO: quantize other parts of the model, e.g. MultimodalProjector?
-
     # Quantize decoder embeddings.
+    if qembedding_config:
+        logging.info("Quantizing embeddings...")
     quantize_decoder_embedding_kwargs = {
         "eager_model": getattr(eager_model, decoder_name),
         "qembedding_config": qembedding_config,
"qembedding_config": qembedding_config,
@@ -236,7 +260,16 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
236260
quantize_decoder_embedding_kwargs["qembedding_group_size"] = qembedding_group_size
237261
quantize_model_(**quantize_decoder_embedding_kwargs)
238262

239-
# TODO: quantize encoder embeddings.
263+
# Quantize encoder embeddings.
264+
if qembedding_encoder_config:
265+
logging.info("Quantizing embeddings...")
266+
quantize_encoder_embedding_kwargs = {
267+
"eager_model": getattr(eager_model, encoder_name),
268+
"qembedding_config": qembedding_encoder_config,
269+
}
270+
if qembedding_encoder_group_size is not None:
271+
quantize_encoder_embedding_kwargs["qembedding_group_size"] = qembedding_encoder_group_size
272+
quantize_model_(**quantize_encoder_embedding_kwargs)
240273

241274
return MultiModalTextToTextExportableModule(
242275
model=eager_model,
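As a reference for the lm_head handling above, here is a toy sketch of the lookup pattern (module names and shapes are illustrative, not the real model classes): use the decoder's own head when it has one, otherwise quantize the top-level lm_head, and raise if neither exists.

import torch


class ToyMultimodal(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = torch.nn.Identity()      # stands in for a decoder without its own lm_head
        self.lm_head = torch.nn.Linear(16, 32)  # head attached to the top-level model instead


model = ToyMultimodal()
if hasattr(model.decoder, "lm_head"):
    head_to_quantize = model.decoder.lm_head  # already covered by the decoder quantization pass
elif hasattr(model, "lm_head"):
    head_to_quantize = model.lm_head          # separate top-level head, quantized explicitly
else:
    raise AttributeError("Could not find `lm_head`, please double check if this is expected.")
print(type(head_to_quantize))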

tests/models/test_modeling_gemma3.py

Lines changed: 18 additions & 3 deletions
@@ -309,9 +309,24 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self):
             use_custom_kv_cache=True,
             qlinear="8da4w",
             qlinear_group_size=32,
-            # Can't quantize the encoder at the moment, hidden dim of 4304 doesn't fit ExecuTorch's
-            # XNNPack 32-group size quantized kernels. See https://github.com/pytorch/executorch/issues/14221.
-            qembedding_config="8w",
+            qlinear_encoder="8da4w,8da8w",
+            qlinear_encoder_group_size=32,
+            qembedding="8w",
+            qembedding_encoder="8w",
+        )
+
+        # Check file size is approximately 3GB (allow 1% tolerance)
+        file_size_bytes = os.path.getsize(os.path.join(model._temp_dir.name, "model.pte"))
+        file_size_gb = file_size_bytes / (1024**3)
+        expected_size_gb = 2.96
+        tolerance = 0.01  # 1% tolerance
+
+        logging.info(f"model.pte size: {file_size_gb:.2f} GB")
+        self.assertAlmostEqual(
+            file_size_gb,
+            expected_size_gb,
+            delta=expected_size_gb * tolerance,
+            msg=f"Expected file size ~{expected_size_gb}GB, but got {file_size_gb:.2f}GB",
         )

         # Generate
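A note on the tolerance check above: assertAlmostEqual with delta passes when the absolute difference is at most delta, so a 1% band around 2.96 GB accepts sizes between roughly 2.93 GB and 2.99 GB. A standalone illustration with hypothetical numbers:

import unittest


class FileSizeToleranceExample(unittest.TestCase):
    def test_size_within_one_percent(self):
        expected_size_gb = 2.96
        measured_size_gb = 2.94  # hypothetical value standing in for os.path.getsize(...) / 1024**3
        # Passes because abs(2.94 - 2.96) = 0.02 <= 2.96 * 0.01 = 0.0296
        self.assertAlmostEqual(measured_size_gb, expected_size_gb, delta=expected_size_gb * 0.01)


if __name__ == "__main__":
    unittest.main()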
