From a93e2b4b301acd922ddb7e113651aa5032f7b1fb Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Tue, 18 Nov 2025 00:53:17 +0000 Subject: [PATCH 01/10] Added support to export for BF16 weight and amax Signed-off-by: Kinjal Patel --- examples/vllm_serve/convert_amax_hf2vllm.py | 213 ------------------ examples/vllm_serve/fakequant_worker.py | 75 +++++- modelopt/torch/export/unified_export_hf.py | 45 +++- .../torch/export/unified_export_megatron.py | 126 ++++++++++- 4 files changed, 231 insertions(+), 228 deletions(-) delete mode 100644 examples/vllm_serve/convert_amax_hf2vllm.py diff --git a/examples/vllm_serve/convert_amax_hf2vllm.py b/examples/vllm_serve/convert_amax_hf2vllm.py deleted file mode 100644 index 6f0321a91..000000000 --- a/examples/vllm_serve/convert_amax_hf2vllm.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 - -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import re -from collections import defaultdict - -import torch - - -def convert_amax_hf2vllm( - hf_state_dict: dict[str, torch.Tensor], -) -> dict[str, torch.Tensor]: - """ - Convert amax values from HuggingFace format to vLLM format. 
- - This function merges: - - q_proj, k_proj, v_proj amax values into qkv_proj (taking max) - - gate_proj, up_proj amax values into gate_up_proj (taking max) - - Args: - hf_state_dict: HuggingFace state dict containing amax values - - Returns: - vLLM format state dict with merged amax values - """ - vllm_state_dict = {} - - # Group keys by their base pattern (without the specific projection name) - merge_groups = defaultdict(list) - - for key, value in hf_state_dict.items(): - if "_amax" not in key: - # Copy non-amax keys as-is - vllm_state_dict[key] = value - continue - - # Check if this is a q/k/v projection that needs merging - qkv_match = re.search(r"(.*\.)([qkv])_proj(\..+_amax)$", key) - if qkv_match: - base_pattern = qkv_match.group(1) + "qkv_proj" + qkv_match.group(3) - merge_groups[base_pattern].append((key, value)) - continue - - # Check if this is a gate/up projection that needs merging - gate_up_match = re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key) - if gate_up_match: - base_pattern = gate_up_match.group(1) + "gate_up_proj" + gate_up_match.group(3) - merge_groups[base_pattern].append((key, value)) - continue - - # Copy other amax keys as-is (like o_proj, down_proj) - vllm_state_dict[key] = value - - # Merge grouped amax values by taking the maximum - for merged_key, key_value_pairs in merge_groups.items(): - if len(key_value_pairs) > 1: - # Take the maximum across all values for this merged key - values = [value for _, value in key_value_pairs] - merged_value = torch.stack(values).max(dim=0)[0] - vllm_state_dict[merged_key] = merged_value - print(f"Merged {len(key_value_pairs)} keys into {merged_key}") - for orig_key, _ in key_value_pairs: - print(f" - {orig_key}") - else: - # Single key, just rename it - _, value = key_value_pairs[0] - vllm_state_dict[merged_key] = value - - return vllm_state_dict - - -def test_conversion(): - """Test the conversion logic with sample keys""" - import torch - - # Create sample HF state dict - sample_hf_keys = [ - "model.layers.0.self_attn.q_proj.input_quantizer._amax", - "model.layers.0.self_attn.k_proj.input_quantizer._amax", - "model.layers.0.self_attn.v_proj.input_quantizer._amax", - "model.layers.0.self_attn.q_proj.weight_quantizer._amax", - "model.layers.0.self_attn.k_proj.weight_quantizer._amax", - "model.layers.0.self_attn.v_proj.weight_quantizer._amax", - "model.layers.0.self_attn.o_proj.input_quantizer._amax", - "model.layers.0.self_attn.o_proj.weight_quantizer._amax", - "model.layers.0.mlp.gate_proj.input_quantizer._amax", - "model.layers.0.mlp.up_proj.input_quantizer._amax", - "model.layers.0.mlp.gate_proj.weight_quantizer._amax", - "model.layers.0.mlp.up_proj.weight_quantizer._amax", - "model.layers.0.mlp.down_proj.input_quantizer._amax", - "model.layers.0.mlp.down_proj.weight_quantizer._amax", - ] - - hf_state_dict = {} - for key in sample_hf_keys: - hf_state_dict[key] = torch.tensor([1.0, 2.0, 3.0]) # Sample values - - print("Testing conversion with sample keys...") - print(f"Input keys: {len(sample_hf_keys)}") - - vllm_state_dict = convert_amax_hf2vllm(hf_state_dict) - vllm_amax_keys = [k for k in vllm_state_dict if "_amax" in k] - - print(f"Output keys: {len(vllm_amax_keys)}") - print("\nExpected vLLM keys:") - expected_keys = [ - "model.layers.0.self_attn.qkv_proj.input_quantizer._amax", - "model.layers.0.self_attn.qkv_proj.weight_quantizer._amax", - "model.layers.0.self_attn.o_proj.input_quantizer._amax", - "model.layers.0.self_attn.o_proj.weight_quantizer._amax", - 
"model.layers.0.mlp.gate_up_proj.input_quantizer._amax", - "model.layers.0.mlp.gate_up_proj.weight_quantizer._amax", - "model.layers.0.mlp.down_proj.input_quantizer._amax", - "model.layers.0.mlp.down_proj.weight_quantizer._amax", - ] - - for key in expected_keys: - print(f" {key}") - - print("\nActual vLLM keys:") - for key in sorted(vllm_amax_keys): - print(f" {key}") - - # Check if all expected keys are present - missing_keys = set(expected_keys) - set(vllm_amax_keys) - extra_keys = set(vllm_amax_keys) - set(expected_keys) - - if missing_keys: - print(f"\nMissing keys: {missing_keys}") - if extra_keys: - print(f"\nExtra keys: {extra_keys}") - - if not missing_keys and not extra_keys: - print("\n✓ Test passed! All keys converted correctly.") - else: - print("\n✗ Test failed! Key mismatch detected.") - - -def main(): - parser = argparse.ArgumentParser( - description="Convert amax values from HuggingFace to vLLM format" - ) - parser.add_argument("--input", "-i", help="Input HuggingFace checkpoint path") - parser.add_argument("--output", "-o", help="Output vLLM checkpoint path") - parser.add_argument("--dry-run", action="store_true", help="Show conversion without saving") - parser.add_argument("--test", action="store_true", help="Run test with sample data") - - args = parser.parse_args() - - if args.test: - test_conversion() - return - - if not args.input or not args.output: - parser.error("--input and --output are required unless using --test") - - # Load HuggingFace checkpoint - print(f"Loading HuggingFace checkpoint from: {args.input}") - if os.path.isfile(args.input): - hf_state_dict = torch.load(args.input, map_location="cpu") - else: - raise Exception(f"File not found: {args.input}") - - print(f"Loaded {len(hf_state_dict)} keys from HuggingFace checkpoint") - - # Filter to only amax keys for analysis - amax_keys = [k for k in hf_state_dict if "_amax" in k] - print(f"Found {len(amax_keys)} amax keys") - - if args.dry_run: - print("\nAmax keys in HuggingFace format:") - for key in sorted(amax_keys): - print(f" {key}") - - # Convert to vLLM format - print("\nConverting to vLLM format...") - vllm_state_dict = convert_amax_hf2vllm(hf_state_dict) - - vllm_amax_keys = [k for k in vllm_state_dict if "_amax" in k] - print(f"Result: {len(vllm_amax_keys)} amax keys in vLLM format") - - if args.dry_run: - print("\nAmax keys in vLLM format:") - for key in sorted(vllm_amax_keys): - print(f" {key}") - print("\nDry run complete. No files saved.") - return - - # Save vLLM checkpoint - print(f"Saving vLLM checkpoint to: {args.output}") - os.makedirs(os.path.dirname(args.output), exist_ok=True) - torch.save(vllm_state_dict, args.output) - print("Conversion complete!") - - -if __name__ == "__main__": - main() diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index 8532c369f..5cfccb976 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -15,7 +15,9 @@ import dataclasses import os +import re import warnings +from collections import defaultdict from contextlib import contextmanager from typing import Any @@ -30,6 +32,68 @@ from modelopt.torch.utils.dataset_utils import get_dataset_dataloader +def convert_amax_hf2vllm( + hf_state_dict: dict[str, torch.Tensor], +) -> dict[str, torch.Tensor]: + """ + Convert amax values from HuggingFace format to vLLM format. 
+ + This function merges: + - q_proj, k_proj, v_proj amax values into qkv_proj (taking max) + - gate_proj, up_proj amax values into gate_up_proj (taking max) + + Args: + hf_state_dict: HuggingFace state dict containing amax values + + Returns: + vLLM format state dict with merged amax values + """ + vllm_state_dict = {} + + # Group keys by their base pattern (without the specific projection name) + merge_groups = defaultdict(list) + + for key, value in hf_state_dict.items(): + if "_amax" not in key: + # Copy non-amax keys as-is + vllm_state_dict[key] = value + continue + + # Check if this is a q/k/v projection that needs merging + qkv_match = re.search(r"(.*\.)([qkv])_proj(\..+_amax)$", key) + if qkv_match: + base_pattern = qkv_match.group(1) + "qkv_proj" + qkv_match.group(3) + merge_groups[base_pattern].append((key, value)) + continue + + # Check if this is a gate/up projection that needs merging + gate_up_match = "mixer" not in key and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key) + if gate_up_match: + base_pattern = gate_up_match.group(1) + "gate_up_proj" + gate_up_match.group(3) + merge_groups[base_pattern].append((key, value)) + continue + + # Copy other amax keys as-is (like o_proj, down_proj) + vllm_state_dict[key] = value + + # Merge grouped amax values by taking the maximum + for merged_key, key_value_pairs in merge_groups.items(): + if len(key_value_pairs) > 1: + # Take the maximum across all values for this merged key + values = [value for _, value in key_value_pairs] + merged_value = torch.stack(values).max(dim=0)[0] + vllm_state_dict[merged_key] = merged_value + print(f"Merged {len(key_value_pairs)} keys into {merged_key}") + for orig_key, _ in key_value_pairs: + print(f" - {orig_key}") + else: + # Single key, just rename it + _, value = key_value_pairs[0] + vllm_state_dict[merged_key] = value + + return vllm_state_dict + + @contextmanager def disable_compilation(model): do_not_compile = True @@ -154,8 +218,17 @@ def calibrate_loop(model: Any = None) -> None: if amax_file_path: print(f"Loading amax values from {amax_file_path}") saved_amax_dict = torch.load(amax_file_path) - current_state_dict = model.state_dict() + # convert amax keys to vLLM format + if hasattr(self.model_runner.model, "hf_to_vllm_mapper"): + saved_amax_dict = self.model_runner.model.hf_to_vllm_mapper.apply_dict(saved_amax_dict) + saved_amax_dict = { + key.replace("quantizer_amax", "quantizer._amax"): value + for key, value in saved_amax_dict.items() + if key.endswith("quantizer_amax") + } + saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict) + current_state_dict = model.state_dict() # Count amax keys in checkpoint and model checkpoint_amax_keys = [key for key in saved_amax_dict if key.endswith("_amax")] model_amax_keys = [key for key in current_state_dict if key.endswith("_amax")] diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 447338690..e1251120a 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -33,7 +33,11 @@ from modelopt.torch.quantization import set_quantizer_by_cfg_context from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer from modelopt.torch.quantization.qtensor import NVFP4QTensor -from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names +from modelopt.torch.quantization.utils import ( + fsdp2_aware_weight_update, + get_quantizer_state_dict, + quantizer_attr_names, +) from .convert_hf_config import 
convert_hf_quant_config_format from .layer_utils import ( @@ -74,7 +78,7 @@ to_quantized_weight, ) -__all__ = ["export_hf_checkpoint"] +__all__ = ["export_hf_bf16_weights_amax", "export_hf_checkpoint"] def _is_enabled_quantizer(quantizer): @@ -609,3 +613,40 @@ def export_hf_checkpoint( " can be saved with torch.save for further inspection." ) raise e + + +def export_hf_bf16_weights_amax( + model: nn.Module, + export_dir: Path | str = tempfile.gettempdir(), +): + """Exports the torch model weights and amax values separately which can be used for vLLM fakequant serve. + + This function: + 1. Extracts amax values for calibration + 2. Deletes all quantizer parameters from state dict to store only weights in original dtype + 3. Saves model checkpoint (with weights in original dtype) and amax values separately + + Args: + model: The quantized model to export + export_dir: Directory to save the model and artifacts + """ + export_dir = Path(export_dir) + export_dir.mkdir(parents=True, exist_ok=True) + + amax_dict = { + name + "._amax": param["_amax"].detach().clone().cpu() + for name, param in get_quantizer_state_dict(model).items() + if "_amax" in param + } + + # remove quantizer from model + for name, module in model.named_modules(): + if is_quantlinear(module): + delattr(module, "weight_quantizer") + delattr(module, "input_quantizer") + delattr(module, "output_quantizer") + module.export() + + # Save with model without quantizer parameters + model.save_pretrained(export_dir) + torch.save(amax_dict, f"{export_dir}/quant_amax.pth") diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index e31530109..18df91f39 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -20,6 +20,7 @@ import json import os +import shutil import tempfile from collections import OrderedDict from pathlib import Path @@ -29,11 +30,12 @@ import torch import torch.distributed import torch.nn as nn -from huggingface_hub import snapshot_download +from huggingface_hub import hf_hub_download, snapshot_download from safetensors.torch import safe_open, save_file from tqdm import tqdm from modelopt import __version__ +from modelopt.torch.quantization.utils import get_quantizer_state_dict from modelopt.torch.utils import import_plugin from .model_config import ( @@ -41,6 +43,7 @@ QUANTIZATION_FP8, QUANTIZATION_FP8_PB_REAL, QUANTIZATION_FP8_PB_WO, + QUANTIZATION_NONE, QUANTIZATION_NVFP4, ) from .plugins.mcore_common import all_mcore_hf_export_mapping @@ -77,7 +80,10 @@ has_mcore = True -__all__ = ["export_mcore_gpt_to_hf", "import_mcore_gpt_from_hf"] +__all__ = [ + "export_mcore_gpt_to_hf", + "import_mcore_gpt_from_hf", +] # This path uses output_quantizer for KV cache quantization. @@ -109,20 +115,24 @@ def get_kv_cache_scaling_factor(kv_module: nn.Module) -> torch.Tensor: def get_quantized_state( module: torch.nn.Module, - dtype: torch.dtype = torch.bfloat16, + dtype: torch.dtype = torch.float16, + export_bf16_weights_amax: bool = False, ) -> tuple[dict[str, torch.Tensor], str, int]: """Return a state_dict, quantization format, and block_size of the module. Args: module: The target module to perform real quantization. dtype: The default data type. + export_bf16_weights_amax: Whether to export the weights in bf16 and amax values. Returns: Tuple: state_dict, quantization format, and block_size of the module. 
""" name_to_value = {} - qformat: str = get_quantization_format(module) - block_size = get_weight_block_size(module) + qformat: str = ( + QUANTIZATION_NONE if export_bf16_weights_amax else get_quantization_format(module) + ) + block_size = 0 if export_bf16_weights_amax else get_weight_block_size(module) if hasattr(module, "weight") and module.weight is not None: weight = module.weight.to(dtype).cpu() @@ -136,6 +146,12 @@ def get_quantized_state( if hasattr(module, "expert_bias") and module.expert_bias is not None: name_to_value["expert_bias"] = module.expert_bias.to(dtype).cpu() + if export_bf16_weights_amax: + for name, param in get_quantizer_state_dict(module).items(): + if "_amax" in param: + name_to_value[name + "._amax"] = param["_amax"].to(dtype).cpu() + return name_to_value, qformat, block_size + # Getting the weight scales weight_scale = get_weight_scaling_factor(module) weight_scale_2 = get_weight_scaling_factor_2(module) @@ -187,6 +203,7 @@ def __init__( dtype=torch.bfloat16, trust_remote_code: bool = True, moe_router_dtype: torch.dtype | None = None, + export_bf16_weights_amax: bool = False, ): """Create a GPTModel exporter instance.""" if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)): @@ -222,6 +239,7 @@ def __init__( self.model = model.language_model if self.is_multimodal else model self.dtype = dtype self.trust_remote_code = trust_remote_code + self.export_bf16_weights_amax = export_bf16_weights_amax self.arch = self._hf_config.architectures[0] # TODO: May modify this later according to what quantization exported ckpt is, currently only support BF16. if self.arch == "GptOssForCausalLM": @@ -331,7 +349,11 @@ def save_pretrained( # Main export process state_dict = self.extra_state_dict if self.export_extra_modules else self.state_dict - quantization_format = get_quantization_format(self.model) + quantization_format = ( + get_quantization_format(self.model) + if not self.export_bf16_weights_amax + else QUANTIZATION_NONE + ) quantization = None kv_cache_quantization = None @@ -378,7 +400,7 @@ def save_pretrained( except (OSError, ValueError, ImportError): pass - if is_last_stage_main_rank: + if is_last_stage_main_rank and not self.export_bf16_weights_amax: hf_quant_config = { "producer": { "name": "modelopt", @@ -398,6 +420,9 @@ def save_pretrained( and self.is_multimodal and pretrained_model_name_or_path is not None ): + assert not self.export_bf16_weights_amax, ( + "Exporting weights in bf16 and amax values is not supported for multimodal models" + ) hf_checkpoint_path = Path(pretrained_model_name_or_path) if not hf_checkpoint_path.is_dir(): hf_checkpoint_path = tempfile.gettempdir() + "/" + pretrained_model_name_or_path @@ -466,6 +491,9 @@ def save_pretrained( torch.distributed.barrier() if self.export_extra_modules: + assert not self.export_bf16_weights_amax, ( + "Exporting weights in bf16 and amax values is not supported for extra modules" + ) if is_last_stage_main_rank: save_file( state_dict, save_directory + "/model.safetensors", metadata={"format": "pt"} @@ -473,6 +501,71 @@ def save_pretrained( torch.distributed.barrier() return + if self.export_bf16_weights_amax: + amax_state_dict = { + k: v.detach().clone().cpu() for k, v in state_dict.items() if k.endswith("_amax") + } + + # Gather all amax dicts to rank 0 + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + + if rank == 0: + # Rank 0 will collect all amax values + all_amax_dicts = [None] * world_size + torch.distributed.gather_object(amax_state_dict, all_amax_dicts, 
dst=0) + + # Merge all amax dicts into one + merged_amax_dict = {} + for amax_dict in all_amax_dicts: + if amax_dict is not None: + merged_amax_dict.update(amax_dict) + + print(f"Total amax entries from all ranks: {len(merged_amax_dict.keys())}") + torch.save(merged_amax_dict, save_directory + "/quant_amax.pth") + else: + # Other ranks just send their amax values + torch.distributed.gather_object(amax_state_dict, None, dst=0) + + torch.distributed.barrier() + + # remove amax values from state_dict + state_dict = {k: v for k, v in state_dict.items() if not k.endswith("_amax")} + + if ( + is_last_stage_main_rank + and self._hf_config is not None + and pretrained_model_name_or_path is not None + ): + # Nemotron-H model requires configuration and modeling files to run with vLLM + hf_checkpoint_path = Path(pretrained_model_name_or_path) + model_type = getattr(self._hf_config, "model_type", None) + + if hf_checkpoint_path.is_dir(): + # Local directory - files should be there + config_file = hf_checkpoint_path / f"configuration_{model_type}.py" + modeling_file = hf_checkpoint_path / f"modeling_{model_type}.py" + else: + # Remote model ID - download from HuggingFace Hub (cached automatically) + try: + config_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=f"configuration_{model_type}.py", + ) + except Exception: + config_file = "" + try: + modeling_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, filename=f"modeling_{model_type}.py" + ) + except Exception: + modeling_file = "" + + if config_file and os.path.exists(config_file): + shutil.copy(config_file, f"{save_directory}/configuration_{model_type}.py") + if modeling_file and os.path.exists(modeling_file): + shutil.copy(modeling_file, f"{save_directory}/modeling_{model_type}.py") + save_safetensors(state_dict, save_directory) @property @@ -544,7 +637,9 @@ def _name_remapping( self._state_dict[prefix] = module return - name_to_value, qformat, block_size = get_quantized_state(module, dtype) + name_to_value, qformat, block_size = get_quantized_state( + module, dtype, self.export_bf16_weights_amax + ) weight = name_to_value.pop("weight") weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat) @@ -576,7 +671,9 @@ def _name_remapping( def _gated_mlp_slicing( self, module, prefix, gate_proj_name="gate_proj", up_proj_name="up_proj" ): - name_to_value, qformat, block_size = get_quantized_state(module, self.dtype) + name_to_value, qformat, block_size = get_quantized_state( + module, self.dtype, self.export_bf16_weights_amax + ) weight = name_to_value.pop("weight") weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat) @@ -641,7 +738,9 @@ def _qkv_slicing( k_scale_name="k_scale", v_scale_name="v_scale", ): - name_to_value, qformat, block_size = get_quantized_state(module, self.dtype) + name_to_value, qformat, block_size = get_quantized_state( + module, self.dtype, self.export_bf16_weights_amax + ) q_proj_prefix = prefix + q_proj_name + "." k_proj_prefix = prefix + k_proj_name + "." 
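Note for reviewers: the `save_pretrained` hunk above gathers each rank's amax tensors to rank 0 with `torch.distributed.gather_object` and merges them before writing `quant_amax.pth`. Below is a minimal standalone sketch of that collective pattern; it assumes the process group has already been initialized by the export job, and the function name is illustrative only.

```python
import torch
import torch.distributed as dist


def gather_amax_to_rank0(local_amax: dict[str, torch.Tensor], out_path: str) -> None:
    """Collect per-rank amax dicts on rank 0 and save the merged result."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # gather_object collects one picklable object per rank on the destination rank.
    gathered: list | None = [None] * world_size if rank == 0 else None
    dist.gather_object(local_amax, gathered, dst=0)

    if rank == 0:
        merged: dict[str, torch.Tensor] = {}
        for amax_dict in gathered:
            if amax_dict:
                merged.update(amax_dict)
        torch.save(merged, out_path)

    dist.barrier()
```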
@@ -764,7 +863,7 @@ def _pack_name_remapping(self, module, prefix, layer_type=None): for expert in module: assert layer_type is not None, "layer_type is required for pack_name_remapping" name_to_value, qformat, block_size = get_quantized_state( - getattr(expert, layer_type), self.dtype + getattr(expert, layer_type), self.dtype, self.export_bf16_weights_amax ) weight = name_to_value.pop("weight") weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat) @@ -830,7 +929,7 @@ def _pack_name_remapping_gpt_oss(self, module, prefix, layer_type=None): for expert in module: assert layer_type is not None, "layer_type is required for pack_name_remapping" name_to_value, qformat, block_size = get_quantized_state( - getattr(expert, layer_type), self.dtype + getattr(expert, layer_type), self.dtype, self.export_bf16_weights_amax ) weight = name_to_value.pop("weight") bias = name_to_value.pop("bias", None) @@ -1170,6 +1269,7 @@ def export_mcore_gpt_to_hf( dtype: torch.dtype = torch.bfloat16, export_dir: Path | str = tempfile.gettempdir(), moe_router_dtype: torch.dtype | None = None, + export_bf16_weights_amax: bool = False, ): """Export Megatron Core GPTModel to unified checkpoint and save to export_dir. @@ -1183,6 +1283,7 @@ def export_mcore_gpt_to_hf( eagle_module. Otherwise, only export the base model. dtype: The weights data type to export the unquantized layers. export_dir: The target export path. + export_bf16_weights_amax: If True, export the weights in bf16 and amax values. """ exporter = GPTModelExporter( model, @@ -1190,6 +1291,7 @@ def export_mcore_gpt_to_hf( export_extra_modules=export_extra_modules, dtype=dtype, moe_router_dtype=moe_router_dtype, + export_bf16_weights_amax=export_bf16_weights_amax, ) exporter.save_pretrained(export_dir, pretrained_model_name_or_path) From 05cd5047a8f33995de12a2352369087f43748d2b Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Wed, 19 Nov 2025 22:08:46 +0000 Subject: [PATCH 02/10] Updated docs Signed-off-by: Kinjal Patel --- CHANGELOG.rst | 4 + examples/vllm_serve/README.md | 16 ++-- modelopt/torch/export/unified_export_hf.py | 91 +++++++++++----------- 3 files changed, 61 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ffe0acc53..f21d29793 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,11 @@ Model Optimizer Changelog (Linux) - Add FP8/NVFP4 KV cache quantization support for Megatron Core models. - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow. - Add support for PyTorch Geometric quantization. +<<<<<<< HEAD - Add per tensor and per channel MSE calibrator support. +======= +- Added support for QAT fakequant evaluation in vLLM. in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details. +>>>>>>> 560dfc7 (Updated docs) **Documentation** diff --git a/examples/vllm_serve/README.md b/examples/vllm_serve/README.md index 64a4147c2..7cd3dcf89 100644 --- a/examples/vllm_serve/README.md +++ b/examples/vllm_serve/README.md @@ -55,15 +55,18 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=, ## Load QAT/PTQ model and serve in vLLM (WIP) -Overwrite the calibrated amax value with prepared values from either PTQ/QAT. This is only tested for Llama3.1 +Overwrite the calibrated amax value with prepared values from either QAT/PTQ. 
-Step 1: convert amax to merged amax, using llama3.1 as an example: +Step 1: export the model with bf16 weights and amax values. -```bash -python convert_amax_hf2vllm.py -i -o -``` +- For HF model set `export_bf16_weights_amax` to export the model with function `modelopt.torch.export.unified_export_hf.export_hf_checkpoint`. +- For MCore model use `export_bf16_weights_amax` to export the model with function `modelopt.torch.export.unified_export_megatron.export_mcore_gpt_to_hf`. + +Step 2: configure from exported model using AMAX_FILE_PATH environment variable in step 1. For example: -Step 2: add `` to `quant_config` in `vllm_serve_fakequant.py` +``` +AMAX_FILE_PATH= QUANT_CFG= python vllm_serve_fakequant.py -tp 8 --host 0.0.0.0 --port 8000 +``` ## Important Notes @@ -85,3 +88,4 @@ torch.distributed.barrier() ## Known Problems 1. AWQ is not yet supported in vLLM. +2. PTQ/QAT checkpoint doesn't work with KV Cache quantization enabled. diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index e1251120a..1c5925802 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -78,7 +78,7 @@ to_quantized_weight, ) -__all__ = ["export_hf_bf16_weights_amax", "export_hf_checkpoint"] +__all__ = ["export_hf_checkpoint"] def _is_enabled_quantizer(quantizer): @@ -557,11 +557,44 @@ def _export_hf_checkpoint( return quantized_state_dict, quant_config +def _export_hf_bf16_weights_amax( + model: nn.Module, +) -> tuple[dict[str, torch.Tensor], dict[str, Any]]: + """Exports the torch model weights and amax values separately. + + This function: + 1. Extracts amax values for calibration + 2. Deletes all quantizer parameters from state dict to store only weights in original dtype + + Args: + model: The quantized model to export + + Returns: + post_state_dict: Dict containing quantized weights + amax_dict: Dict containing amax values + """ + amax_dict = { + name + "._amax": param["_amax"].detach().clone().cpu() + for name, param in get_quantizer_state_dict(model).items() + if "_amax" in param + } + + # remove quantizer from model + for name, module in model.named_modules(): + if is_quantlinear(module): + delattr(module, "weight_quantizer") + delattr(module, "input_quantizer") + delattr(module, "output_quantizer") + module.export() + return model.state_dict(), amax_dict + + def export_hf_checkpoint( model: nn.Module, dtype: torch.dtype | None = None, export_dir: Path | str = tempfile.gettempdir(), save_modelopt_state: bool = False, + export_bf16_weights_amax: bool = False, ): """Exports the torch model to unified checkpoint and saves to export_dir. 
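Note for reviewers: a hedged usage sketch of the HF export path added in the hunk above, following the README steps and the unit test added later in this series. The model path, export directory, and calibration loop are placeholders, and the flag is still named `export_bf16_weights_amax` at this point in the series (a later commit renames it to `export_vllm_fq_weights_qstate`).

```python
import torch
from transformers import AutoModelForCausalLM

import modelopt.torch.quantization as mtq
from modelopt.torch.export.unified_export_hf import export_hf_checkpoint

model = AutoModelForCausalLM.from_pretrained("<hf_model_or_path>").cuda().eval()


def forward_loop(m):
    # Toy calibration pass; real calibration should iterate over a dataset.
    input_ids = torch.randint(0, m.config.vocab_size, (1, 128)).cuda()
    with torch.no_grad():
        m(input_ids)


model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

# Saves original-dtype weights via save_pretrained plus a separate quant_amax.pth.
export_hf_checkpoint(model, export_dir="<export_dir>", export_bf16_weights_amax=True)
```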
@@ -583,13 +616,19 @@ def export_hf_checkpoint( return try: - post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) + if export_bf16_weights_amax: + post_state_dict, amax_dict = _export_hf_bf16_weights_amax(model) + hf_quant_config = None + torch.save(amax_dict, f"{export_dir}/quant_amax.pth") + else: + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) - # Save hf_quant_config.json for backward compatibility - with open(f"{export_dir}/hf_quant_config.json", "w") as file: - json.dump(hf_quant_config, file, indent=4) + if hf_quant_config is not None: + # Save hf_quant_config.json for backward compatibility + with open(f"{export_dir}/hf_quant_config.json", "w") as file: + json.dump(hf_quant_config, file, indent=4) - hf_quant_config = convert_hf_quant_config_format(hf_quant_config) + hf_quant_config = convert_hf_quant_config_format(hf_quant_config) # Save model model.save_pretrained( @@ -602,7 +641,8 @@ def export_hf_checkpoint( with open(original_config) as file: config_data = json.load(file) - config_data["quantization_config"] = hf_quant_config + if hf_quant_config is not None: + config_data["quantization_config"] = hf_quant_config with open(original_config, "w") as file: json.dump(config_data, file, indent=4) @@ -613,40 +653,3 @@ def export_hf_checkpoint( " can be saved with torch.save for further inspection." ) raise e - - -def export_hf_bf16_weights_amax( - model: nn.Module, - export_dir: Path | str = tempfile.gettempdir(), -): - """Exports the torch model weights and amax values separately which can be used for vLLM fakequant serve. - - This function: - 1. Extracts amax values for calibration - 2. Deletes all quantizer parameters from state dict to store only weights in original dtype - 3. Saves model checkpoint (with weights in original dtype) and amax values separately - - Args: - model: The quantized model to export - export_dir: Directory to save the model and artifacts - """ - export_dir = Path(export_dir) - export_dir.mkdir(parents=True, exist_ok=True) - - amax_dict = { - name + "._amax": param["_amax"].detach().clone().cpu() - for name, param in get_quantizer_state_dict(model).items() - if "_amax" in param - } - - # remove quantizer from model - for name, module in model.named_modules(): - if is_quantlinear(module): - delattr(module, "weight_quantizer") - delattr(module, "input_quantizer") - delattr(module, "output_quantizer") - module.export() - - # Save with model without quantizer parameters - model.save_pretrained(export_dir) - torch.save(amax_dict, f"{export_dir}/quant_amax.pth") From b6efc6ec51ce9bd72d14249a34317c70f0a649e6 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Wed, 19 Nov 2025 22:17:37 +0000 Subject: [PATCH 03/10] minor Signed-off-by: Kinjal Patel --- examples/vllm_serve/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/vllm_serve/README.md b/examples/vllm_serve/README.md index 7cd3dcf89..239b97be0 100644 --- a/examples/vllm_serve/README.md +++ b/examples/vllm_serve/README.md @@ -64,7 +64,7 @@ Step 1: export the model with bf16 weights and amax values. Step 2: configure from exported model using AMAX_FILE_PATH environment variable in step 1. 
For example: -``` +```bash AMAX_FILE_PATH= QUANT_CFG= python vllm_serve_fakequant.py -tp 8 --host 0.0.0.0 --port 8000 ``` From 4daf5cec81b9b450c02af25cd66488d57a3b0b76 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Wed, 19 Nov 2025 23:12:43 +0000 Subject: [PATCH 04/10] minor Signed-off-by: Kinjal Patel --- modelopt/torch/export/unified_export_megatron.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index 18df91f39..71f8a3c55 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -537,7 +537,9 @@ def save_pretrained( and self._hf_config is not None and pretrained_model_name_or_path is not None ): - # Nemotron-H model requires configuration and modeling files to run with vLLM + # For models that keep configuration and modeling files as part of the checkpoint, + # we need to copy them to the export directory for seamless integration with inference + # frameworks. hf_checkpoint_path = Path(pretrained_model_name_or_path) model_type = getattr(self._hf_config, "model_type", None) From 2f6c0c0dd931452c545f594a0d96736cef138131 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 21 Nov 2025 18:49:46 +0000 Subject: [PATCH 05/10] minor Signed-off-by: Kinjal Patel --- examples/vllm_serve/README.md | 1 + modelopt/torch/export/unified_export_hf.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/vllm_serve/README.md b/examples/vllm_serve/README.md index 239b97be0..90c053d5b 100644 --- a/examples/vllm_serve/README.md +++ b/examples/vllm_serve/README.md @@ -89,3 +89,4 @@ torch.distributed.barrier() 1. AWQ is not yet supported in vLLM. 2. PTQ/QAT checkpoint doesn't work with KV Cache quantization enabled. +3. Mixed precision checkpoint doesn't work currently. diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 1c5925802..2e9925924 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -603,6 +603,8 @@ def export_hf_checkpoint( dtype: the weights data type to export the unquantized layers or the default model data type if None. export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. + export_bf16_weights_amax: whether to export the bf16 weights and amax values separately. This can be used for + vLLM fakequant serving. """ export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) From bc85b5cce730e033d7ee5d990b6802cee9e3f1dc Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 21 Nov 2025 21:42:20 +0000 Subject: [PATCH 06/10] added seperate file for vLLM for export Signed-off-by: Kinjal Patel --- .../torch/export/plugins/vllm_fakequant.py | 125 ++++++++++++++++++ modelopt/torch/export/unified_export_hf.py | 52 ++------ .../torch/export/unified_export_megatron.py | 82 ++++-------- 3 files changed, 159 insertions(+), 100 deletions(-) create mode 100644 modelopt/torch/export/plugins/vllm_fakequant.py diff --git a/modelopt/torch/export/plugins/vllm_fakequant.py b/modelopt/torch/export/plugins/vllm_fakequant.py new file mode 100644 index 000000000..370d886b7 --- /dev/null +++ b/modelopt/torch/export/plugins/vllm_fakequant.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Export functions for vLLM fakequant.""" + +import os +from pathlib import Path + +import torch +import torch.nn as nn + +from modelopt.torch.export.layer_utils import is_quantlinear +from modelopt.torch.export.model_config import QUANTIZATION_NONE +from modelopt.torch.quantization.utils import get_quantizer_state_dict + + +def export_hf_vllm_fq_checkpoint( + model: nn.Module, + export_dir: Path | str, +) -> dict[str, torch.Tensor]: + """Exports the torch model weights and amax values separately. + + This function: + 1. Extracts amax values for calibration + 2. Deletes all quantizer parameters from state dict to store only weights in original dtype + + Args: + model: The quantized model to export + export_dir: Directory to save the amax values + + Returns: + post_state_dict: Dict containing quantized weights + """ + amax_dict = { + name + "._amax": param["_amax"].detach().clone().cpu() + for name, param in get_quantizer_state_dict(model).items() + if "_amax" in param + } + + # remove quantizer from model + for _, module in model.named_modules(): + if is_quantlinear(module): + delattr(module, "weight_quantizer") + delattr(module, "input_quantizer") + delattr(module, "output_quantizer") + module.export() + torch.save(amax_dict, f"{export_dir}/quant_amax.pth") + return model.state_dict() + + +def get_mcore_vllm_fq_quantized_state( + module: torch.nn.Module, name_to_value: dict, dtype: torch.dtype = torch.bfloat16 +): + """Return a state_dict, quantization format, and block_size of the quantized module. + + Args: + module: The target module to perform real quantization. + name_to_value: The dictionary to store the quantized state. + dtype: The default data type. + + Returns: + Tuple: state dict, quantization format, and block_size of the quantized module. + + """ + qformat: str = QUANTIZATION_NONE + block_size = 0 + + for name, param in get_quantizer_state_dict(module).items(): + if "_amax" in param: + name_to_value[name + "._amax"] = param["_amax"].to(dtype).cpu() + return name_to_value, qformat, block_size + + +def gather_mcore_vllm_fq_quantized_state_dict( + state_dict: dict[str, torch.Tensor], save_directory: str | os.PathLike +): + """Gather all quantized state dict from all ranks and save them to a file. + + Args: + state_dict: The state dictionary of the module. + save_directory: The directory to save the quantized state dict. + + Returns: + The state dictionary of the module without quantized state. 
+ """ + amax_state_dict = { + k: v.detach().clone().cpu() for k, v in state_dict.items() if k.endswith("_amax") + } + + # Gather all amax dicts to rank 0 + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + + if rank == 0: + # Rank 0 will collect all amax values + all_amax_dicts = [None] * world_size + torch.distributed.gather_object(amax_state_dict, all_amax_dicts, dst=0) + + # Merge all amax dicts into one + merged_amax_dict = {} + for amax_dict in all_amax_dicts: + if amax_dict is not None: + merged_amax_dict.update(amax_dict) + + print(f"Total amax entries from all ranks: {len(merged_amax_dict.keys())}") + torch.save(merged_amax_dict, save_directory + "/quant_amax.pth") + else: + # Other ranks just send their amax values + torch.distributed.gather_object(amax_state_dict, None, dst=0) + + torch.distributed.barrier() + + # remove amax values from state_dict + return {k: v for k, v in state_dict.items() if not k.endswith("_amax")} diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 2e9925924..6deb479f1 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -33,11 +33,7 @@ from modelopt.torch.quantization import set_quantizer_by_cfg_context from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer from modelopt.torch.quantization.qtensor import NVFP4QTensor -from modelopt.torch.quantization.utils import ( - fsdp2_aware_weight_update, - get_quantizer_state_dict, - quantizer_attr_names, -) +from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names from .convert_hf_config import convert_hf_quant_config_format from .layer_utils import ( @@ -63,6 +59,7 @@ ) from .model_utils import get_language_model_from_vl, is_multimodal_model from .plugins import export_spec_ckpt_config, export_spec_ckpt_state_dict, spec_opt_only +from .plugins.vllm_fakequant import export_hf_vllm_fq_checkpoint from .quant_utils import ( fuse_prequant_layernorm, fuse_prequant_to_linear, @@ -557,44 +554,12 @@ def _export_hf_checkpoint( return quantized_state_dict, quant_config -def _export_hf_bf16_weights_amax( - model: nn.Module, -) -> tuple[dict[str, torch.Tensor], dict[str, Any]]: - """Exports the torch model weights and amax values separately. - - This function: - 1. Extracts amax values for calibration - 2. Deletes all quantizer parameters from state dict to store only weights in original dtype - - Args: - model: The quantized model to export - - Returns: - post_state_dict: Dict containing quantized weights - amax_dict: Dict containing amax values - """ - amax_dict = { - name + "._amax": param["_amax"].detach().clone().cpu() - for name, param in get_quantizer_state_dict(model).items() - if "_amax" in param - } - - # remove quantizer from model - for name, module in model.named_modules(): - if is_quantlinear(module): - delattr(module, "weight_quantizer") - delattr(module, "input_quantizer") - delattr(module, "output_quantizer") - module.export() - return model.state_dict(), amax_dict - - def export_hf_checkpoint( model: nn.Module, dtype: torch.dtype | None = None, export_dir: Path | str = tempfile.gettempdir(), save_modelopt_state: bool = False, - export_bf16_weights_amax: bool = False, + export_vllm_fq_weights_qstate: bool = False, ): """Exports the torch model to unified checkpoint and saves to export_dir. 
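Note for reviewers: once a checkpoint has been exported with `export_vllm_fq_weights_qstate=True`, the serving side (`examples/vllm_serve/fakequant_worker.py`, earlier in this series) reads `quant_amax.pth` back and overwrites the calibrated amax buffers. The sketch below is a simplified assumption of that load path: the `model` argument stands in for the vLLM module held by the worker, and the real worker additionally remaps names via `hf_to_vllm_mapper` and the `quantizer_amax` suffix before merging.

```python
import torch
import torch.nn as nn

# Assumes the script runs from examples/vllm_serve where the helper lives.
from fakequant_worker import convert_amax_hf2vllm


def load_amax_into_vllm_model(model: nn.Module, amax_file: str) -> None:
    """Overwrite the model's amax buffers with exported calibration values."""
    saved_amax = torch.load(amax_file, map_location="cpu")
    # Fuse q/k/v and gate/up amax entries to match vLLM's merged projections.
    saved_amax = convert_amax_hf2vllm(saved_amax)

    state_dict = model.state_dict()
    for key, value in saved_amax.items():
        if key in state_dict:  # skip entries the fused model does not expose
            state_dict[key] = value.to(dtype=state_dict[key].dtype)
    model.load_state_dict(state_dict)
```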
@@ -603,8 +568,8 @@ def export_hf_checkpoint( dtype: the weights data type to export the unquantized layers or the default model data type if None. export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. - export_bf16_weights_amax: whether to export the bf16 weights and amax values separately. This can be used for - vLLM fakequant serving. + export_vllm_fq_weights_qstate: whether to export the weights and quantization state separately for vLLM + fakequant serving. """ export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) @@ -618,15 +583,14 @@ def export_hf_checkpoint( return try: - if export_bf16_weights_amax: - post_state_dict, amax_dict = _export_hf_bf16_weights_amax(model) + if export_vllm_fq_weights_qstate: + post_state_dict = export_hf_vllm_fq_checkpoint(model, export_dir) hf_quant_config = None - torch.save(amax_dict, f"{export_dir}/quant_amax.pth") else: post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) if hf_quant_config is not None: - # Save hf_quant_config.json for backward compatibility + # Save hf_quant_config.json for\ backward compatibility with open(f"{export_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index 71f8a3c55..ba0d76d4f 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -35,7 +35,6 @@ from tqdm import tqdm from modelopt import __version__ -from modelopt.torch.quantization.utils import get_quantizer_state_dict from modelopt.torch.utils import import_plugin from .model_config import ( @@ -49,6 +48,10 @@ from .plugins.mcore_common import all_mcore_hf_export_mapping from .plugins.mcore_custom import CustomModuleMapping, save_safetensors from .plugins.megatron_importer import GPTModelImporter +from .plugins.vllm_fakequant import ( + gather_mcore_vllm_fq_quantized_state_dict, + get_mcore_vllm_fq_quantized_state, +) from .quant_utils import ( get_activation_scaling_factor, get_kv_cache_dtype, @@ -116,23 +119,21 @@ def get_kv_cache_scaling_factor(kv_module: nn.Module) -> torch.Tensor: def get_quantized_state( module: torch.nn.Module, dtype: torch.dtype = torch.float16, - export_bf16_weights_amax: bool = False, + export_vllm_fq_weights_qstate: bool = False, ) -> tuple[dict[str, torch.Tensor], str, int]: """Return a state_dict, quantization format, and block_size of the module. Args: module: The target module to perform real quantization. dtype: The default data type. - export_bf16_weights_amax: Whether to export the weights in bf16 and amax values. + export_vllm_fq_weights_qstate: Whether to export the weights in bf16 and amax values. Returns: Tuple: state_dict, quantization format, and block_size of the module. 
""" name_to_value = {} - qformat: str = ( - QUANTIZATION_NONE if export_bf16_weights_amax else get_quantization_format(module) - ) - block_size = 0 if export_bf16_weights_amax else get_weight_block_size(module) + qformat: str = get_quantization_format(module) + block_size = get_weight_block_size(module) if hasattr(module, "weight") and module.weight is not None: weight = module.weight.to(dtype).cpu() @@ -146,11 +147,8 @@ def get_quantized_state( if hasattr(module, "expert_bias") and module.expert_bias is not None: name_to_value["expert_bias"] = module.expert_bias.to(dtype).cpu() - if export_bf16_weights_amax: - for name, param in get_quantizer_state_dict(module).items(): - if "_amax" in param: - name_to_value[name + "._amax"] = param["_amax"].to(dtype).cpu() - return name_to_value, qformat, block_size + if export_vllm_fq_weights_qstate: + return get_mcore_vllm_fq_quantized_state(module, name_to_value, dtype) # Getting the weight scales weight_scale = get_weight_scaling_factor(module) @@ -203,7 +201,7 @@ def __init__( dtype=torch.bfloat16, trust_remote_code: bool = True, moe_router_dtype: torch.dtype | None = None, - export_bf16_weights_amax: bool = False, + export_vllm_fq_weights_qstate: bool = False, ): """Create a GPTModel exporter instance.""" if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)): @@ -239,7 +237,7 @@ def __init__( self.model = model.language_model if self.is_multimodal else model self.dtype = dtype self.trust_remote_code = trust_remote_code - self.export_bf16_weights_amax = export_bf16_weights_amax + self.export_vllm_fq_weights_qstate = export_vllm_fq_weights_qstate self.arch = self._hf_config.architectures[0] # TODO: May modify this later according to what quantization exported ckpt is, currently only support BF16. if self.arch == "GptOssForCausalLM": @@ -351,7 +349,7 @@ def save_pretrained( state_dict = self.extra_state_dict if self.export_extra_modules else self.state_dict quantization_format = ( get_quantization_format(self.model) - if not self.export_bf16_weights_amax + if not self.export_vllm_fq_weights_qstate else QUANTIZATION_NONE ) quantization = None @@ -400,7 +398,7 @@ def save_pretrained( except (OSError, ValueError, ImportError): pass - if is_last_stage_main_rank and not self.export_bf16_weights_amax: + if is_last_stage_main_rank and not self.export_vllm_fq_weights_qstate: hf_quant_config = { "producer": { "name": "modelopt", @@ -420,7 +418,7 @@ def save_pretrained( and self.is_multimodal and pretrained_model_name_or_path is not None ): - assert not self.export_bf16_weights_amax, ( + assert not self.export_vllm_fq_weights_qstate, ( "Exporting weights in bf16 and amax values is not supported for multimodal models" ) hf_checkpoint_path = Path(pretrained_model_name_or_path) @@ -491,7 +489,7 @@ def save_pretrained( torch.distributed.barrier() if self.export_extra_modules: - assert not self.export_bf16_weights_amax, ( + assert not self.export_vllm_fq_weights_qstate, ( "Exporting weights in bf16 and amax values is not supported for extra modules" ) if is_last_stage_main_rank: @@ -501,36 +499,8 @@ def save_pretrained( torch.distributed.barrier() return - if self.export_bf16_weights_amax: - amax_state_dict = { - k: v.detach().clone().cpu() for k, v in state_dict.items() if k.endswith("_amax") - } - - # Gather all amax dicts to rank 0 - world_size = torch.distributed.get_world_size() - rank = torch.distributed.get_rank() - - if rank == 0: - # Rank 0 will collect all amax values - all_amax_dicts = [None] * world_size - 
torch.distributed.gather_object(amax_state_dict, all_amax_dicts, dst=0) - - # Merge all amax dicts into one - merged_amax_dict = {} - for amax_dict in all_amax_dicts: - if amax_dict is not None: - merged_amax_dict.update(amax_dict) - - print(f"Total amax entries from all ranks: {len(merged_amax_dict.keys())}") - torch.save(merged_amax_dict, save_directory + "/quant_amax.pth") - else: - # Other ranks just send their amax values - torch.distributed.gather_object(amax_state_dict, None, dst=0) - - torch.distributed.barrier() - - # remove amax values from state_dict - state_dict = {k: v for k, v in state_dict.items() if not k.endswith("_amax")} + if self.export_vllm_fq_weights_qstate: + state_dict = gather_mcore_vllm_fq_quantized_state_dict(state_dict, save_directory) if ( is_last_stage_main_rank @@ -640,7 +610,7 @@ def _name_remapping( return name_to_value, qformat, block_size = get_quantized_state( - module, dtype, self.export_bf16_weights_amax + module, dtype, self.export_vllm_fq_weights_qstate ) weight = name_to_value.pop("weight") @@ -674,7 +644,7 @@ def _gated_mlp_slicing( self, module, prefix, gate_proj_name="gate_proj", up_proj_name="up_proj" ): name_to_value, qformat, block_size = get_quantized_state( - module, self.dtype, self.export_bf16_weights_amax + module, self.dtype, self.export_vllm_fq_weights_qstate ) weight = name_to_value.pop("weight") @@ -741,7 +711,7 @@ def _qkv_slicing( v_scale_name="v_scale", ): name_to_value, qformat, block_size = get_quantized_state( - module, self.dtype, self.export_bf16_weights_amax + module, self.dtype, self.export_vllm_fq_weights_qstate ) q_proj_prefix = prefix + q_proj_name + "." @@ -865,7 +835,7 @@ def _pack_name_remapping(self, module, prefix, layer_type=None): for expert in module: assert layer_type is not None, "layer_type is required for pack_name_remapping" name_to_value, qformat, block_size = get_quantized_state( - getattr(expert, layer_type), self.dtype, self.export_bf16_weights_amax + getattr(expert, layer_type), self.dtype, self.export_vllm_fq_weights_qstate ) weight = name_to_value.pop("weight") weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat) @@ -931,7 +901,7 @@ def _pack_name_remapping_gpt_oss(self, module, prefix, layer_type=None): for expert in module: assert layer_type is not None, "layer_type is required for pack_name_remapping" name_to_value, qformat, block_size = get_quantized_state( - getattr(expert, layer_type), self.dtype, self.export_bf16_weights_amax + getattr(expert, layer_type), self.dtype, self.export_vllm_fq_weights_qstate ) weight = name_to_value.pop("weight") bias = name_to_value.pop("bias", None) @@ -1271,7 +1241,7 @@ def export_mcore_gpt_to_hf( dtype: torch.dtype = torch.bfloat16, export_dir: Path | str = tempfile.gettempdir(), moe_router_dtype: torch.dtype | None = None, - export_bf16_weights_amax: bool = False, + export_vllm_fq_weights_qstate: bool = False, ): """Export Megatron Core GPTModel to unified checkpoint and save to export_dir. @@ -1285,7 +1255,7 @@ def export_mcore_gpt_to_hf( eagle_module. Otherwise, only export the base model. dtype: The weights data type to export the unquantized layers. export_dir: The target export path. - export_bf16_weights_amax: If True, export the weights in bf16 and amax values. + export_vllm_fq_weights_qstate: If True, export the weights in bf16 and amax values. 
""" exporter = GPTModelExporter( model, @@ -1293,7 +1263,7 @@ def export_mcore_gpt_to_hf( export_extra_modules=export_extra_modules, dtype=dtype, moe_router_dtype=moe_router_dtype, - export_bf16_weights_amax=export_bf16_weights_amax, + export_vllm_fq_weights_qstate=export_vllm_fq_weights_qstate, ) exporter.save_pretrained(export_dir, pretrained_model_name_or_path) From b0f78c8079c2f5078009b10ef6cebca2fb40406a Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 21 Nov 2025 21:43:05 +0000 Subject: [PATCH 07/10] added test for vllm fq export Signed-off-by: Kinjal Patel --- .../export/test_vllm_fakequant_export.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 tests/gpu/torch/export/test_vllm_fakequant_export.py diff --git a/tests/gpu/torch/export/test_vllm_fakequant_export.py b/tests/gpu/torch/export/test_vllm_fakequant_export.py new file mode 100644 index 000000000..08e77ac8d --- /dev/null +++ b/tests/gpu/torch/export/test_vllm_fakequant_export.py @@ -0,0 +1,194 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch +from copy import deepcopy +from functools import partial +import modelopt.torch.quantization as mtq +from modelopt.torch.export.unified_export_hf import export_hf_checkpoint +from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf +from _test_utils.torch.transformers_models import create_tiny_llama_dir +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from _test_utils.torch.megatron.models import get_mcore_gpt_model +from _test_utils.import_helper import skip_if_no_megatron +from transformers import AutoModelForCausalLM + +import os +import json + +skip_if_no_megatron(apex_or_te_required=True) + +@pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG]) +def test_hf_vllm_export(tmp_path, quant_cfg): + """Test HuggingFace model export for vLLM with fake quantization. + + This test verifies: + 1. Model weights match before and after export + 2. quant_amax.pth file is created, huggingface config file does not exist + 3. 
Amax values are correctly extracted and saved in quant_amax.pth file + """ + + # Create a tiny LLaMA model for testing + tiny_model_dir = create_tiny_llama_dir(tmp_path, with_tokenizer=True, num_hidden_layers=2) + + # Load the model + model = AutoModelForCausalLM.from_pretrained(tiny_model_dir) + model = model.cuda() + model.eval() + + # Quantize the model + def forward_loop(model): + input_ids = torch.randint(0, model.config.vocab_size, (1, 128)).cuda() + with torch.no_grad(): + model(input_ids) + + model = mtq.quantize(model, quant_cfg, forward_loop) + + model_state_dict = deepcopy(model.state_dict()) + + # Export directory + export_dir = tmp_path / "vllm_export" + export_dir.mkdir(exist_ok=True) + + # Export for vLLM + export_hf_checkpoint(model, export_dir=export_dir, export_vllm_fq_weights_qstate=True) + + # check if quant_amax.pth file exists + quant_amax_file = export_dir / "quant_amax.pth" + assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}" + + # make sure hf_quant_config.json file does not exist + hf_quant_config_file = export_dir / "hf_quant_config.json" + assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}" + + # check weights match before and after export + model_after = AutoModelForCausalLM.from_pretrained(export_dir) + model_after = model_after.cuda() + model_after.eval() + model_after_state_dict = model_after.state_dict() + amax_state_dict = {} + for key in model_state_dict.keys(): + if key.endswith("_amax"): + amax_state_dict[key] = model_state_dict[key] + continue + + assert torch.allclose(model_state_dict[key], model_after_state_dict[key], atol=1e-6), ( + f"Weight mismatch for {key}: " + f"before shape={model_state_dict[key].shape}, after shape={model_after_state_dict[key].shape}, " + f"max diff={torch.abs(model_state_dict[key] - model_after_state_dict[key]).max()}" + ) + + # Verify amax values are correct + amax_dict = torch.load(quant_amax_file) + assert len(amax_dict) > 0, "amax_dict should not be empty" + assert amax_dict.keys() == amax_state_dict.keys(), f"amax keys mismatch between before and after export" + + +def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): + """Test megatron-core model export for vLLM with fake quantization. 
+ + """ + # Create a tiny mcore GPT model + num_layers = 2 + hidden_size = 64 + num_attention_heads = 8 + num_query_groups = size + ffn_hidden_size = 128 + max_sequence_length = 32 + vocab_size = 64 + + model = get_mcore_gpt_model( + tensor_model_parallel_size=size, + pipeline_model_parallel_size=1, + initialize_megatron=True, + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_query_groups=num_query_groups, + ffn_hidden_size=ffn_hidden_size, + max_sequence_length=max_sequence_length, + vocab_size=vocab_size, + activation_func="swiglu", + normalization="RMSNorm", + transformer_impl="modelopt", + ).cuda() + model.eval() + + # Quantize the model + def forward_loop(model): + batch_size = 1 + seq_len = 32 + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)).cuda() + position_ids = torch.arange(seq_len).unsqueeze(0).cuda() + # Create causal attention mask + attention_mask = torch.tril(torch.ones((1, 1, seq_len, seq_len))).cuda() + attention_mask = attention_mask < 0.5 # Convert to boolean mask + with torch.no_grad(): + model(input_ids, position_ids, attention_mask) + + model = mtq.quantize(model, quant_cfg, forward_loop) + + model_state_dict = deepcopy(model.state_dict()) + + # Create HF config for export + pretrained_config = { + "architectures": ["LlamaForCausalLM"], + "attention_bias": False, + "hidden_size": hidden_size, + "intermediate_size": ffn_hidden_size, + "max_position_embeddings": max_sequence_length, + "model_type": "llama", + "num_attention_heads": num_attention_heads, + "num_hidden_layers": num_layers, + "num_key_value_heads": num_query_groups, + "torch_dtype": "bfloat16", + } + + with open(tmp_path / "config.json", "w") as f: + json.dump(pretrained_config, f) + + # Export directory + export_dir = tmp_path / "vllm_export" + export_dir.mkdir(exist_ok=True) + + # Export for vLLM + export_mcore_gpt_to_hf( + model, + pretrained_model_name_or_path=tmp_path, + dtype=torch.bfloat16, + export_dir=str(export_dir), + export_vllm_fq_weights_qstate=True, + ) + + # check if quant_amax.pth file exists + quant_amax_file = export_dir / "quant_amax.pth" + assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}" + + # make sure hf_quant_config.json file does not exist + hf_quant_config_file = export_dir / "hf_quant_config.json" + assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}" + + +@pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG]) +def test_mcore_vllm_export(tmp_path, quant_cfg): + """Wrapper test function for mcore vLLM export.""" + spawn_multiprocess_job( + size=1, + job=partial(_test_mcore_vllm_export, tmp_path, quant_cfg), + backend="nccl", + ) + + From f46e41d2d2fb8c1cb21cbf6cb5a2e79375c7f643 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Sat, 22 Nov 2025 01:53:09 +0000 Subject: [PATCH 08/10] Added support for Qwen3-MoE Signed-off-by: Kinjal Patel --- examples/vllm_serve/fakequant_worker.py | 39 ++++++++- modelopt/torch/quantization/plugins/vllm.py | 17 ++-- .../export/test_vllm_fakequant_export.py | 80 +++++++++---------- 3 files changed, 87 insertions(+), 49 deletions(-) diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index 5cfccb976..d08e62340 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -33,7 +33,7 @@ def convert_amax_hf2vllm( - hf_state_dict: dict[str, torch.Tensor], + hf_state_dict: dict[str, torch.Tensor], 
fuse_experts: bool = False ) -> dict[str, torch.Tensor]: """ Convert amax values from HuggingFace format to vLLM format. @@ -66,13 +66,44 @@ def convert_amax_hf2vllm( merge_groups[base_pattern].append((key, value)) continue - # Check if this is a gate/up projection that needs merging - gate_up_match = "mixer" not in key and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key) + # Check if this is an expert gate/up projection + # Pattern: model.layers.0.mlp.experts.*.gate_proj.input_quantizer._amax and + # model.layers.0.mlp.experts.*.up_proj.input_quantizer._amax + # Maps to: model.layers.0.mlp.experts.w13_input_quantizer._amax + expert_gate_up_match = ( + "mixer" not in key + and fuse_experts + and re.search(r"(.*\.experts)\.\d+\.(gate|up)_proj\.([^.]+_quantizer\._amax)$", key) + ) + if expert_gate_up_match: + base_pattern = expert_gate_up_match.group(1) + ".w13_" + expert_gate_up_match.group(3) + merge_groups[base_pattern].append((key, value)) + continue + + # Check if this is a non-expert gate/up projection that needs merging + gate_up_match = ( + "mixer" not in key + and "experts" not in key + and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key) + ) if gate_up_match: base_pattern = gate_up_match.group(1) + "gate_up_proj" + gate_up_match.group(3) merge_groups[base_pattern].append((key, value)) continue + # Check if this is an expert down_proj + # Pattern: model.layers.0.mlp.experts.*.down_proj.input_quantizer._amax + # Maps to: model.layers.0.mlp.experts.w2_input_quantizer._amax + expert_down_match = ( + "mixer" not in key + and fuse_experts + and re.search(r"(.*\.experts)\.\d+\.down_proj\.([^.]+_quantizer\._amax)$", key) + ) + if expert_down_match: + base_pattern = expert_down_match.group(1) + ".w2_" + expert_down_match.group(2) + merge_groups[base_pattern].append((key, value)) + continue + # Copy other amax keys as-is (like o_proj, down_proj) vllm_state_dict[key] = value @@ -226,7 +257,7 @@ def calibrate_loop(model: Any = None) -> None: for key, value in saved_amax_dict.items() if key.endswith("quantizer_amax") } - saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict) + saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict, fuse_experts=True) current_state_dict = model.state_dict() # Count amax keys in checkpoint and model diff --git a/modelopt/torch/quantization/plugins/vllm.py b/modelopt/torch/quantization/plugins/vllm.py index c35f7760b..9676d2c89 100644 --- a/modelopt/torch/quantization/plugins/vllm.py +++ b/modelopt/torch/quantization/plugins/vllm.py @@ -21,14 +21,21 @@ import vllm.model_executor.layers.fused_moe.layer as vllm_fused_moe_layer import vllm.model_executor.layers.linear as vllm_linear -try: - import vllm.model_executor.layers.fused_moe.shared_fused_moe as vllm_shared_fused_moe_layer -except ImportError: - vllm_shared_fused_moe_layer = None - from ...utils.distributed import ParallelState from ..nn import QuantLinearConvBase, QuantModule, QuantModuleRegistry, TensorQuantizer +# Try multiple import paths for vLLM compatibility across versions +vllm_shared_fused_moe_layer = None +for module_path in [ + "vllm.model_executor.layers.fused_moe.shared_fused_moe", # 0.11.0+ + "vllm.model_executor.layers.shared_fused_moe.shared_fused_moe", # 0.10.2 +]: + try: + vllm_shared_fused_moe_layer = importlib.import_module(module_path) + break + except ImportError: + continue + vllm_fused_moe_package = importlib.import_module("vllm.model_executor.layers.fused_moe.fused_moe") diff --git a/tests/gpu/torch/export/test_vllm_fakequant_export.py 
b/tests/gpu/torch/export/test_vllm_fakequant_export.py index 08e77ac8d..127e0f57e 100644 --- a/tests/gpu/torch/export/test_vllm_fakequant_export.py +++ b/tests/gpu/torch/export/test_vllm_fakequant_export.py @@ -13,66 +13,69 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import torch +import json from copy import deepcopy from functools import partial -import modelopt.torch.quantization as mtq -from modelopt.torch.export.unified_export_hf import export_hf_checkpoint -from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf -from _test_utils.torch.transformers_models import create_tiny_llama_dir + +import pytest +import torch +from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model -from _test_utils.import_helper import skip_if_no_megatron +from _test_utils.torch.transformers_models import create_tiny_llama_dir from transformers import AutoModelForCausalLM -import os -import json +import modelopt.torch.quantization as mtq +from modelopt.torch.export.unified_export_hf import export_hf_checkpoint +from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf skip_if_no_megatron(apex_or_te_required=True) + @pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG]) def test_hf_vllm_export(tmp_path, quant_cfg): """Test HuggingFace model export for vLLM with fake quantization. - + This test verifies: 1. Model weights match before and after export 2. quant_amax.pth file is created, huggingface config file does not exist 3. Amax values are correctly extracted and saved in quant_amax.pth file """ - + # Create a tiny LLaMA model for testing tiny_model_dir = create_tiny_llama_dir(tmp_path, with_tokenizer=True, num_hidden_layers=2) - + # Load the model model = AutoModelForCausalLM.from_pretrained(tiny_model_dir) model = model.cuda() model.eval() - + # Quantize the model def forward_loop(model): input_ids = torch.randint(0, model.config.vocab_size, (1, 128)).cuda() with torch.no_grad(): model(input_ids) - + model = mtq.quantize(model, quant_cfg, forward_loop) - + model_state_dict = deepcopy(model.state_dict()) # Export directory export_dir = tmp_path / "vllm_export" export_dir.mkdir(exist_ok=True) - + # Export for vLLM export_hf_checkpoint(model, export_dir=export_dir, export_vllm_fq_weights_qstate=True) # check if quant_amax.pth file exists quant_amax_file = export_dir / "quant_amax.pth" assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}" - + # make sure hf_quant_config.json file does not exist hf_quant_config_file = export_dir / "hf_quant_config.json" - assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}" + assert not hf_quant_config_file.exists(), ( + f"hf_quant_config.json file should not be created in {export_dir}" + ) # check weights match before and after export model_after = AutoModelForCausalLM.from_pretrained(export_dir) @@ -80,27 +83,27 @@ def forward_loop(model): model_after.eval() model_after_state_dict = model_after.state_dict() amax_state_dict = {} - for key in model_state_dict.keys(): + for key, param in model_state_dict.items(): if key.endswith("_amax"): - amax_state_dict[key] = model_state_dict[key] + amax_state_dict[key] = param continue - - assert torch.allclose(model_state_dict[key], model_after_state_dict[key], atol=1e-6), ( + + assert 
torch.allclose(param, model_after_state_dict[key], atol=1e-6), ( f"Weight mismatch for {key}: " - f"before shape={model_state_dict[key].shape}, after shape={model_after_state_dict[key].shape}, " - f"max diff={torch.abs(model_state_dict[key] - model_after_state_dict[key]).max()}" + f"before shape={param.shape}, after shape={model_after_state_dict[key].shape}, " + f"max diff={torch.abs(param - model_after_state_dict[key]).max()}" ) # Verify amax values are correct amax_dict = torch.load(quant_amax_file) assert len(amax_dict) > 0, "amax_dict should not be empty" - assert amax_dict.keys() == amax_state_dict.keys(), f"amax keys mismatch between before and after export" + assert amax_dict.keys() == amax_state_dict.keys(), ( + "amax keys mismatch between before and after export" + ) def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): - """Test megatron-core model export for vLLM with fake quantization. - - """ + """Test megatron-core model export for vLLM with fake quantization.""" # Create a tiny mcore GPT model num_layers = 2 hidden_size = 64 @@ -109,7 +112,7 @@ def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): ffn_hidden_size = 128 max_sequence_length = 32 vocab_size = 64 - + model = get_mcore_gpt_model( tensor_model_parallel_size=size, pipeline_model_parallel_size=1, @@ -126,7 +129,7 @@ def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): transformer_impl="modelopt", ).cuda() model.eval() - + # Quantize the model def forward_loop(model): batch_size = 1 @@ -138,11 +141,8 @@ def forward_loop(model): attention_mask = attention_mask < 0.5 # Convert to boolean mask with torch.no_grad(): model(input_ids, position_ids, attention_mask) - - model = mtq.quantize(model, quant_cfg, forward_loop) - - model_state_dict = deepcopy(model.state_dict()) + model = mtq.quantize(model, quant_cfg, forward_loop) # Create HF config for export pretrained_config = { "architectures": ["LlamaForCausalLM"], @@ -156,14 +156,14 @@ def forward_loop(model): "num_key_value_heads": num_query_groups, "torch_dtype": "bfloat16", } - + with open(tmp_path / "config.json", "w") as f: json.dump(pretrained_config, f) # Export directory export_dir = tmp_path / "vllm_export" export_dir.mkdir(exist_ok=True) - + # Export for vLLM export_mcore_gpt_to_hf( model, @@ -176,10 +176,12 @@ def forward_loop(model): # check if quant_amax.pth file exists quant_amax_file = export_dir / "quant_amax.pth" assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}" - + # make sure hf_quant_config.json file does not exist hf_quant_config_file = export_dir / "hf_quant_config.json" - assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}" + assert not hf_quant_config_file.exists(), ( + f"hf_quant_config.json file should not be created in {export_dir}" + ) @pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG]) @@ -190,5 +192,3 @@ def test_mcore_vllm_export(tmp_path, quant_cfg): job=partial(_test_mcore_vllm_export, tmp_path, quant_cfg), backend="nccl", ) - - From d8652d19bd21a3b6ded0df58e4cab59b6df74026 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Sat, 22 Nov 2025 02:00:42 +0000 Subject: [PATCH 09/10] minor Signed-off-by: Kinjal Patel --- CHANGELOG.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f21d29793..dbc6ba899 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,11 +16,8 @@ Model Optimizer Changelog (Linux) - Add FP8/NVFP4 KV cache quantization support for Megatron Core models. 
- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow. - Add support for PyTorch Geometric quantization. -<<<<<<< HEAD - Add per tensor and per channel MSE calibrator support. -======= - Added support for QAT fakequant evaluation in vLLM. in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details. ->>>>>>> 560dfc7 (Updated docs) **Documentation** From 10926994280d1181ed146b24e3ae1a87c0157e02 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Mon, 24 Nov 2025 20:54:29 +0000 Subject: [PATCH 10/10] minor Signed-off-by: Kinjal Patel --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dbc6ba899..899b14009 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,7 +17,7 @@ Model Optimizer Changelog (Linux) - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow. - Add support for PyTorch Geometric quantization. - Add per tensor and per channel MSE calibrator support. -- Added support for QAT fakequant evaluation in vLLM. in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details. +- Added support for PTQ/QAT checkpoint export and loading for running fakequant evaluation in vLLM. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details. **Documentation**
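
Usage note (illustrative sketch, not part of the patches above): the end-to-end flow this
series enables is to quantize a HuggingFace model with ModelOpt, export BF16 weights plus a
quant_amax.pth sidecar, and let the vLLM worker remap the saved amax keys onto its fused
layers. The model id, export directory name, and random calibration batch below are
placeholders; mtq.quantize, export_hf_checkpoint(..., export_vllm_fq_weights_qstate=True)
and convert_amax_hf2vllm(..., fuse_experts=True) are the calls introduced in the diffs above.

    # Minimal sketch under the assumptions stated above; adapt the model id and
    # calibration data to your setup.
    import torch
    from transformers import AutoModelForCausalLM

    import modelopt.torch.quantization as mtq
    from modelopt.torch.export.unified_export_hf import export_hf_checkpoint

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B").cuda().eval()

    def forward_loop(m):
        # Calibration pass: replace the random ids with real calibration batches.
        input_ids = torch.randint(0, m.config.vocab_size, (1, 128)).cuda()
        with torch.no_grad():
            m(input_ids)

    model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

    # Writes BF16 weights plus a quant_amax.pth sidecar; no hf_quant_config.json is emitted.
    export_hf_checkpoint(model, export_dir="llama_fq_export", export_vllm_fq_weights_qstate=True)

    # On the vLLM side, examples/vllm_serve/fakequant_worker.py loads the sidecar and merges
    # q/k/v, gate/up and per-expert amax values into the fused layer names, e.g.:
    #   saved = torch.load("llama_fq_export/quant_amax.pth")
    #   saved = convert_amax_hf2vllm(saved, fuse_experts=True)

The point of the BF16-weights-plus-amax layout is that vLLM can rebuild the fake quantizers
at load time and evaluate an arbitrary quantization format, rather than requiring a true
low-precision checkpoint.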