Commit 404194a

Merge branch 'main' into shengliangx/fix-extra-args

2 parents: 03a9148 + a5025a2
File tree: 12 files changed, +1005 / -30 lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ Model Optimizer Changelog (Linux)
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
 - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
 - Add support for PyTorch Geometric quantization.
+- Add per-tensor and per-channel MSE calibrator support.

 **Documentation**

modelopt/torch/export/quant_utils.py

Lines changed: 30 additions & 1 deletion

@@ -779,7 +779,36 @@ def to_quantized_weight(
         )[0]._quantized_data

     if quantization == QUANTIZATION_FP8_PC_PT:
-        return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
+        if weight.dim() == 3:
+            # Handle different scale tensor shapes
+            if weights_scaling_factor.dim() == 1:
+                # Per-expert scaling only: (num_experts,) -> (num_experts, 1, 1)
+                return (weight / weights_scaling_factor[:, None, None]).to(torch.float8_e4m3fn)
+            elif weights_scaling_factor.dim() == 2:
+                # Per-channel scaling: check which dimension matches
+                if weights_scaling_factor.shape[0] != weight.shape[0]:
+                    raise ValueError(
+                        f"First dimension (num_experts) mismatch for FP8_PC_PT quantization. "
+                        f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}"
+                    )
+                if weight.shape[-1] == weight.shape[-2]:
+                    raise ValueError(
+                        f"Ambiguous scaling dimension for FP8_PC_PT quantization with square weight matrix. "
+                        f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}. "
+                        f"Cannot determine if scaling should be applied to input_dim or output_dim."
+                    )
+                if weights_scaling_factor.shape[-1] == weight.shape[-1]:
+                    # (num_experts, input_dim) -> (num_experts, 1, input_dim), BMM-style
+                    return (weight / weights_scaling_factor.unsqueeze(-2)).to(torch.float8_e4m3fn)
+                elif weights_scaling_factor.shape[-1] == weight.shape[-2]:
+                    # (num_experts, output_dim) -> (num_experts, output_dim, 1), standard MoE case
+                    return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
+                else:
+                    raise ValueError(
+                        f"Cannot determine correct unsqueeze dimension for FP8_PC_PT quantization. "
+                        f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}"
+                    )
+        return (weight / weights_scaling_factor[:, None]).to(torch.float8_e4m3fn)

     if quantization in [QUANTIZATION_INT4_AWQ, QUANTIZATION_W4A8_AWQ]:
         return pack_int4_in_uint8(weight, weights_scaling_factor)
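For intuition, here is a minimal sketch (made-up shapes, not part of the commit) of the three broadcasting branches the new FP8_PC_PT code distinguishes for a 3-D expert weight. The square-matrix guard above exists precisely because the per-input-channel and per-output-channel cases below would be indistinguishable when output_dim == input_dim:

import torch

E, O, I = 4, 128, 256                  # num_experts, output_dim, input_dim
weight = torch.randn(E, O, I)

s_expert = torch.rand(E) + 0.5         # per-expert scalar scale
s_in = torch.rand(E, I) + 0.5          # per input channel (BMM-style)
s_out = torch.rand(E, O) + 0.5         # per output channel (standard MoE)

q1 = weight / s_expert[:, None, None]  # (E, 1, 1) broadcasts over both matrix dims
q2 = weight / s_in.unsqueeze(-2)       # (E, 1, I) matches weight.shape[-1]
q3 = weight / s_out.unsqueeze(-1)      # (E, O, 1) matches weight.shape[-2]

assert q1.shape == q2.shape == q3.shape == weight.shape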

modelopt/torch/export/unified_export_hf.py

Lines changed: 25 additions & 4 deletions

@@ -50,6 +50,7 @@
     KV_CACHE_NVFP4_AFFINE,
     QUANTIZATION_FP8,
     QUANTIZATION_FP8_PB_REAL,
+    QUANTIZATION_FP8_PC_PT,
     QUANTIZATION_NONE,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
@@ -327,13 +328,15 @@ def _export_quantized_weight(
     weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None)

     # Transpose weight for bmm-style expert quantization (llama4, gpt-oss)
+    # Check if this is a BMM-style expert weight that needs transposition
+    is_bmm_expert_weight = weight.dim() == 3 and any(
+        expert_type in type(sub_module).__name__
+        for expert_type in ["Llama4TextExperts", "GptOssExperts"]
+    )
+
     if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
         # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization
-        is_bmm_expert_weight = weight.dim() == 3 and any(
-            expert_type in type(sub_module).__name__
-            for expert_type in ["Llama4TextExperts", "GptOssExperts"]
-        )
         weight, _ = maybe_transpose_expert_weight_dimensions(
             weight, is_bmm_expert_weight=is_bmm_expert_weight
         )
@@ -354,6 +357,24 @@ def _export_quantized_weight(
         quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions(
             quantized_weight, weight_scale, is_bmm_expert_weight=is_bmm_expert_weight
         )
+    elif quantization_format == QUANTIZATION_FP8_PC_PT and is_bmm_expert_weight:
+        # For FP8_PC_PT with BMM-style experts, transpose only the weight (not weight_scale)
+        weight, _ = maybe_transpose_expert_weight_dimensions(
+            weight, is_bmm_expert_weight=is_bmm_expert_weight
+        )
+
+        quantized_weight = to_quantized_weight(
+            weight.to(dtype),
+            weight_scale,
+            quantization_format,
+            weight_scale_2,
+            block_size,
+        )
+
+        # Transpose back to original BMM format
+        quantized_weight, _ = maybe_transpose_expert_weight_dimensions(
+            quantized_weight, is_bmm_expert_weight=is_bmm_expert_weight
+        )
     else:
         quantized_weight = to_quantized_weight(
             weight.to(dtype),
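To see why the scale is not transposed along with the weight, here is a minimal sketch with made-up shapes; transpose_bmm is a stand-in for maybe_transpose_expert_weight_dimensions, and the real path additionally casts the result to torch.float8_e4m3fn:

import torch

def transpose_bmm(w: torch.Tensor) -> torch.Tensor:
    # Swap (num_experts, input_dim, output_dim) <-> (num_experts, output_dim, input_dim)
    return w.transpose(-2, -1).contiguous()

E, I, O = 4, 256, 128
weight = torch.randn(E, I, O)           # BMM-style expert storage
weight_scale = torch.rand(E, O) + 0.5   # one scale per output channel per expert

w_t = transpose_bmm(weight)             # (E, O, I): scale dim O now sits at dim -2
q_t = w_t / weight_scale.unsqueeze(-1)  # hits the unsqueeze(-1) branch in to_quantized_weight
quantized = transpose_bmm(q_t)          # back to the original (E, I, O) layout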

modelopt/torch/quantization/calib/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -23,3 +23,4 @@
 from .calibrator import *
 from .histogram import *
 from .max import *
+from .mse import *
modelopt/torch/quantization/calib/mse.py (new file)

Lines changed: 177 additions & 0 deletions

@@ -0,0 +1,177 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Calibrator that returns the MSE amax of all collected tensors."""
+
+from collections.abc import Callable
+
+import torch
+import torch.nn.functional as F
+
+from .. import utils as quant_utils
+from .calibrator import _Calibrator
+
+__all__ = ["MseCalibrator"]
+
+
+class MseCalibrator(_Calibrator):
+    """Per-tensor and per-channel MSE amax search that minimizes error between x and quantized x."""
+
+    def __init__(
+        self,
+        amax: torch.Tensor,
+        axis: int | tuple | list | None = None,
+        num_steps: int = 10,
+        start_multiplier: float = 0.25,
+        stop_multiplier: float = 4.0,
+        quant_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        error_func: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+    ):
+        """Initialize MSE calibrator.
+
+        Args:
+            amax: Initial amax value (required).
+            axis: Quantization axis. None means per-tensor quantization.
+            num_steps: Number of amax candidates to try.
+            start_multiplier: Starting multiplier for amax search.
+            stop_multiplier: Ending multiplier for amax search.
+            quant_func: Function that quantizes input tensor given an amax value.
+                Should have signature: quant_func(x, amax) -> quantized_x.
+            error_func: Function to compute error between x and xq.
+                Default is F.mse_loss(x, xq, reduction='none').
+        """
+        super().__init__(num_bits=None, axis=axis, unsigned=None)
+        self._initial_amax = amax
+        self._num_steps = num_steps
+        self._start_multiplier = start_multiplier
+        self._stop_multiplier = stop_multiplier
+        self._quant_func = quant_func
+        self._error_func = error_func
+        self._losses_sum = [None] * num_steps
+        self._candidate_amaxs = [None] * num_steps
+
+        self._amax = None
+
+    @torch.no_grad()
+    def collect(self, x: torch.Tensor):
+        """Collect input tensor statistics and compute losses for MSE calibration.
+
+        Args:
+            x: Input tensor.
+        """
+        if self._quant_func is None:
+            raise RuntimeError(
+                "Quantization function not set. MseCalibrator requires a quant_func to be provided."
+            )
+
+        x = x.detach().to(dtype=torch.float32)
+
+        device = x.device
+        multipliers = torch.linspace(
+            self._start_multiplier, self._stop_multiplier, steps=self._num_steps, device=device
+        )
+
+        # Get reduce axis for per-channel quantization
+        reduce_axis = quant_utils.convert_quantization_axis_to_reduce_axis(x, self._axis)
+
+        for step, multiplier in enumerate(multipliers):
+            candidate_amax = self._initial_amax * multiplier
+            xq = self._quant_func(x, candidate_amax)
+
+            if self._error_func is not None:
+                error = self._error_func(x, xq)
+            else:
+                error = F.mse_loss(x, xq, reduction="none")
+
+            loss = quant_utils.reduce_sum(error, axis=reduce_axis, keepdims=False)
+
+            if self._candidate_amaxs[step] is None:
+                self._candidate_amaxs[step] = candidate_amax
+
+            if self._losses_sum[step] is None:
+                self._losses_sum[step] = loss.clone()
+            else:
+                self._losses_sum[step] += loss
+
+    def reset(self):
+        """Reset the stored losses and amax value."""
+        self._losses_sum = [None] * self._num_steps
+        self._candidate_amaxs = [None] * self._num_steps
+        self._amax = None
+
+    @torch.no_grad()
+    def compute_amax(self, verbose: bool = False):
+        """Return the amax value that minimizes quantization error.
+
+        Args:
+            verbose: If True, print the ratio of best_amax to initial_amax.
+        """
+        if not any(loss_sum is not None for loss_sum in self._losses_sum):
+            return None
+
+        # Check if this is per-tensor or per-channel based on the first loss
+        first_loss_sum = None
+        for loss_sum in self._losses_sum:
+            if loss_sum is not None:
+                first_loss_sum = loss_sum
+                break
+
+        if first_loss_sum is None:
+            return None
+
+        # Collect losses for all steps
+        losses_per_step = []
+        for step in range(self._num_steps):
+            if self._losses_sum[step] is not None:
+                losses_per_step.append(self._losses_sum[step])
+            # No data for this step, use inf
+            elif first_loss_sum.ndim == 0:
+                losses_per_step.append(torch.tensor(float("inf"), device=first_loss_sum.device))
+            else:
+                losses_per_step.append(torch.full_like(first_loss_sum, float("inf")))
+
+        # Stack to get [num_steps] for per-tensor or [num_steps, num_channels] for per-channel
+        losses_per_step = torch.stack(losses_per_step)

+        # Find best step(s): scalar for per-tensor, [num_channels] for per-channel
+        best_steps = torch.argmin(losses_per_step, dim=0)
+
+        # Stack candidate amaxs and select based on best_steps
+        candidate_amaxs = torch.stack(self._candidate_amaxs)
+
+        if first_loss_sum.ndim == 0:
+            # Per-tensor case: best_steps is a scalar
+            self._amax = self._candidate_amaxs[best_steps.item()]
+        else:
+            # Per-channel case: best_steps is a tensor
+            num_channels = best_steps.shape[0]
+            self._amax = candidate_amaxs[
+                best_steps, torch.arange(num_channels, device=best_steps.device)
+            ]
+            self._amax = self._amax.reshape(self._initial_amax.shape)
+
+        if verbose:
+            ratio = self._amax / self._initial_amax
+            if ratio.ndim == 0:
+                print(f"MSE Calibrator: best_amax/initial_amax ratio = {ratio.item():.4f}")
+            else:
+                print(
+                    f"MSE Calibrator: best_amax/initial_amax ratio - "
+                    f"mean: {ratio.mean().item():.4f}, "
+                    f"min: {ratio.min().item():.4f}, "
+                    f"max: {ratio.max().item():.4f}"
+                )
+
+        return self._amax
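A minimal usage sketch for the new calibrator; fake_quant below is a stand-in quant_func (the library wires in its own quantization function during calibration), and the shapes are made up. For a per-channel search, pass axis together with a per-channel initial amax:

import torch

from modelopt.torch.quantization.calib import MseCalibrator

def fake_quant(x: torch.Tensor, amax: torch.Tensor) -> torch.Tensor:
    # Symmetric 8-bit fake quantization given an amax.
    scale = amax / 127.0
    return (x / scale).round().clamp(-128, 127) * scale

x = torch.randn(512, 256)
calibrator = MseCalibrator(amax=x.abs().amax(), quant_func=fake_quant)
calibrator.collect(x)  # call once per calibration batch
best_amax = calibrator.compute_amax(verbose=True)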

modelopt/torch/quantization/config.py

Lines changed: 39 additions & 0 deletions

@@ -981,6 +981,45 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
     )


+class MseCalibConfig(QuantizeAlgorithmConfig):
+    """Configuration for per-tensor and per-channel MSE calibration.
+
+    Finds a scale s (via amax a, with s = a / q_max) that minimizes the
+    reconstruction error of a tensor after uniform Q→DQ:
+
+        s* = argmin_s E[(X - DQ(Q(X; s)))^2], X ∈ {weights | activations}
+    """
+
+    method: Literal["mse"] = ModeloptField("mse")
+
+    num_steps: int | None = ModeloptField(
+        default=10,
+        ge=1,
+        title="Number of amax candidates to try.",
+        description="Number of amax candidates to search over for MSE minimization.",
+    )
+
+    start_multiplier: float | None = ModeloptField(
+        default=0.25,
+        gt=0.0,
+        title="Starting multiplier for amax search.",
+        description="Starting multiplier for amax search range (multiplies initial amax).",
+    )
+
+    stop_multiplier: float | None = ModeloptField(
+        default=4.0,
+        gt=0.0,
+        title="Ending multiplier for amax search.",
+        description="Ending multiplier for amax search range (multiplies initial amax).",
+    )
+
+    distributed_sync: bool | None = ModeloptField(
+        default=True,
+        title="Whether to sync the amax across the distributed processes.",
+        description="If True, the amax will be synced across the distributed processes.",
+    )
+
+
 class SmoothQuantCalibConfig(QuantizeAlgorithmConfig):
     """The config for ``smoothquant`` algorithm (SmoothQuant).
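A hypothetical end-to-end sketch of how this config would be selected, assuming the new method plugs into the existing "algorithm" field of a quantize config the same way "max" and "smoothquant" do (the model and calibration loop are placeholders):

import torch
import torch.nn as nn

import modelopt.torch.quantization as mtq

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))

def calibration_loop(m: nn.Module) -> None:
    # Feed a few batches so the calibrator can accumulate per-step losses.
    for _ in range(8):
        m(torch.randn(32, 64))

config = dict(mtq.FP8_DEFAULT_CFG)
config["algorithm"] = {
    "method": "mse",          # routes to MseCalibConfig / mse_calibrate via the mode registry
    "num_steps": 10,          # candidates swept from 0.25x to 4.0x of the initial amax
    "start_multiplier": 0.25,
    "stop_multiplier": 4.0,
}

model = mtq.quantize(model, config, forward_loop=calibration_loop)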

modelopt/torch/quantization/mode.py

Lines changed: 14 additions & 1 deletion

@@ -38,6 +38,7 @@
     AWQLiteCalibConfig,
     CompressConfig,
     MaxCalibConfig,
+    MseCalibConfig,
     QuantizeAlgoCfgType,
     QuantizeAlgorithmConfig,
     QuantizeConfig,
@@ -54,7 +55,7 @@
     restore_svdquant_model,
     update_quantize_metadata,
 )
-from .model_calib import awq, max_calibrate, smoothquant, svdquant
+from .model_calib import awq, max_calibrate, mse_calibrate, smoothquant, svdquant

 __all__ = ["BaseCalibrateModeDescriptor"]

@@ -363,6 +364,18 @@ def config_class(self) -> type[QuantizeAlgorithmConfig]:
     _calib_func = max_calibrate


+@CalibrateModeRegistry.register_mode
+class MseCalibrateModeDescriptor(BaseCalibrateModeDescriptor):
+    """Mode for mse calibration algorithm."""
+
+    @property
+    def config_class(self) -> type[QuantizeAlgorithmConfig]:
+        """Specifies the config class for the mode."""
+        return MseCalibConfig
+
+    _calib_func = mse_calibrate
+
+
 @CalibrateModeRegistry.register_mode
 class SmoothQuantModeDescriptor(BaseCalibrateModeDescriptor):
     """Mode for smoothquant calibration algorithm."""
