From 3ba4f00f440e483f15e7ce9bb3f3322c828ad1d6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 27 Oct 2025 12:00:40 -0400 Subject: [PATCH 01/10] apply patch Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/llama4.py | 6 +-- src/llmcompressor/modeling/prepare.py | 39 ++++++----------- src/llmcompressor/modeling/qwen3_vl_moe.py | 43 +++++++++++++++---- .../modeling/test_calib_qwen3_vl_moe.py | 21 +++++---- 4 files changed, 59 insertions(+), 50 deletions(-) diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index 2b49a652af..bfaa8e21c7 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -38,10 +38,8 @@ def __init__( calibrate_all_experts: bool = True, ): super().__init__() - # Extract text config from multimodal config if needed - text_config = ( - config.get_text_config() if hasattr(config, "get_text_config") else config - ) + # Extract text config from multimodal config + text_config: Llama4TextConfig = config.get_text_config() self.top_k = text_config.num_experts_per_tok self.hidden_dim = text_config.hidden_size self.num_experts = text_config.num_local_experts diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index 42173bb8b5..10b92f2603 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -10,33 +10,20 @@ from compressed_tensors.utils import deprecated, replace_module from transformers import PreTrainedModel -# Import MoE calibration modules to trigger registration -from llmcompressor.modeling.deepseek_v3 import ( # noqa: F401 - CalibrationDeepseekV3MoE, -) -from llmcompressor.modeling.deepseek_v3 import ( - replace as replace_deepseekv3, -) -from llmcompressor.modeling.llama4 import ( # noqa: F401 - SequentialLlama4TextMoe, -) -from llmcompressor.modeling.llama4 import ( - replace as replace_llama4, -) -from llmcompressor.modeling.moe_context import ( # noqa: F401 - 
moe_calibration_context, -) -from llmcompressor.modeling.qwen3_moe import ( # noqa: F401 - CalibrationQwen3MoeSparseMoeBlock, -) -from llmcompressor.modeling.qwen3_next_moe import ( # noqa: F401 - CalibrationQwen3NextSparseMoeBlock, -) -from llmcompressor.modeling.qwen3_vl_moe import ( - replace as replace_Qwen3VLMoE, -) +# deprecated replacement functions +from llmcompressor.modeling.deepseek_v3 import replace as replace_deepseekv3 +from llmcompressor.modeling.llama4 import replace as replace_llama4 +from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE + +# trigger registration +from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 +from .llama4 import SequentialLlama4TextMoe # noqa: F401 +from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 +from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 + +# TODO: add granite4, Qwen3Next -__all__ = ["moe_calibration_context", "replace_modules_for_calibration"] +__all__ = ["replace_modules_for_calibration"] # ---------------------- module replacements; permanent ------------------------- replacements = { diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 5af6b7abf3..9eeba6dfc1 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -1,19 +1,40 @@ import torch - +from transformers import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, +) + +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -class LinearQwen3VLMoeTextSparseMoeBlock(torch.nn.Module): - def __init__(self, config, original, calibrate_all_experts): +@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +class 
CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): + """ + Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all + experts. + """ + + def __init__( + self, + original: OriginalQwen3VLMoeTextSparseMoeBlock, + config: Qwen3VLMoeConfig, + calibrate_all_experts: bool, + ): super().__init__() - self.hidden_size = config.hidden_size - self.num_experts = config.num_experts + text_config: Qwen3VLMoeTextConfig = config.get_text_config() + + self.hidden_size = text_config.hidden_size + self.num_experts = text_config.num_experts self.top_k = original.top_k # Note: gate was changed to be a Linear layer in transformers==4.57.0 # https://github.com/JJJYmmm/transformers/commit/f5dea1c694af8c994c769170813a8702332119ee self.gate = original.gate self.calibrate_all_experts = calibrate_all_experts - self.experts = SequentialQwen3VLMoeTextExperts(config, original.experts) + self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, sequence_length, hidden_dim = hidden_states.shape @@ -91,9 +112,13 @@ def __init__(self, config, original): self[i].down_proj.weight.data = down.t().clone().contiguous() -def replace(config, module, calibrate_all_experts): - return LinearQwen3VLMoeTextSparseMoeBlock( +def replace( + config: Qwen3VLMoeConfig, + original: OriginalQwen3VLMoeTextSparseMoeBlock, + calibrate_all_experts: bool, +): + return CalibrateQwen3VLMoeTextSparseMoeBlock( config=config.get_text_config(), - original=module, + original=original, calibrate_all_experts=calibrate_all_experts, ) diff --git a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py index 513fb8cd73..c34951755b 100644 --- a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py +++ b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py @@ -1,17 +1,16 @@ import torch +from transformers import Qwen3VLMoeTextConfig +from 
transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextSparseMoeBlock, +) -from llmcompressor.modeling.qwen3_vl_moe import LinearQwen3VLMoeTextSparseMoeBlock +from llmcompressor.modeling.qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock from llmcompressor.utils.helpers import calibration_forward_context from tests.testing_utils import requires_gpu @requires_gpu -def test_calib_qwen3_vl_moe_module(): - from transformers import Qwen3VLMoeTextConfig - from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - Qwen3VLMoeTextSparseMoeBlock, - ) - +def test_calib_qwen3_moe_module(): config = Qwen3VLMoeTextConfig() with torch.device("cuda"): original = Qwen3VLMoeTextSparseMoeBlock(config).eval() @@ -29,16 +28,16 @@ def test_calib_qwen3_vl_moe_module(): with calibration_forward_context(original): true_output = original(sample) - module = LinearQwen3VLMoeTextSparseMoeBlock( - config, original, calibrate_all_experts=True + module = CalibrateQwen3VLMoeTextSparseMoeBlock( + original, config, calibrate_all_experts=True ) with calibration_forward_context(module): output = module(sample) assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10 assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10 - module = LinearQwen3VLMoeTextSparseMoeBlock( - config, original, calibrate_all_experts=False + module = CalibrateQwen3VLMoeTextSparseMoeBlock( + original, config, calibrate_all_experts=False ) with calibration_forward_context(module): output = module(sample) From 2cf2aa9b12c13a234144cd3d631f9fb76a39cd5d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:01:19 -0400 Subject: [PATCH 02/10] change log level Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/moe_context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 232b271c01..a25b2fa0ec 100644 --- 
a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -118,7 +118,7 @@ def moe_calibration_context( replaced = {} # Step 1: Collect all MoE modules that need replacement - logger.info("Entering MoE calibration context") + logger.debug("Entering MoE calibration context") modules_to_replace = [] for name, module in model.named_modules(): class_name = module.__class__.__name__ @@ -161,7 +161,7 @@ def moe_calibration_context( yield finally: # Step 2: Restore non-permanent modules - for name, (original, replacement) in replaced.items(): + for name, (original, replacement) in replaced.items(): if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) From fc34db696930dfc99220d60505280b06706d2c40 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:08:11 -0400 Subject: [PATCH 03/10] move registration Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/moe_context.py | 7 +++++++ src/llmcompressor/modeling/prepare.py | 8 -------- tests/llmcompressor/modeling/test_calib_granite4.py | 0 3 files changed, 7 insertions(+), 8 deletions(-) create mode 100644 tests/llmcompressor/modeling/test_calib_granite4.py diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index a25b2fa0ec..0216b651ae 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -115,6 +115,13 @@ def moe_calibration_context( model(**batch) # Model is now restored (unless permanent) """ + # trigger registration + from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 + from .llama4 import SequentialLlama4TextMoe # noqa: F401 + from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 + from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 + # TODO: add granite4, Qwen3Next + replaced = {} # Step 1: Collect all MoE modules that need replacement diff --git 
a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index 10b92f2603..af9920d1b8 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -15,14 +15,6 @@ from llmcompressor.modeling.llama4 import replace as replace_llama4 from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE -# trigger registration -from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 -from .llama4 import SequentialLlama4TextMoe # noqa: F401 -from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 -from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 - -# TODO: add granite4, Qwen3Next - __all__ = ["replace_modules_for_calibration"] # ---------------------- module replacements; permanent ------------------------- diff --git a/tests/llmcompressor/modeling/test_calib_granite4.py b/tests/llmcompressor/modeling/test_calib_granite4.py new file mode 100644 index 0000000000..e69de29bb2 From 4e78ae46fce829304d787c147b41d1ff1ee4a627 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:14:27 -0400 Subject: [PATCH 04/10] use registry mixin Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/deepseek_v3.py | 7 ++-- src/llmcompressor/modeling/llama4.py | 7 ++-- src/llmcompressor/modeling/moe_context.py | 39 ++++------------------ src/llmcompressor/modeling/qwen3_moe.py | 7 ++-- src/llmcompressor/modeling/qwen3_vl_moe.py | 7 ++-- 5 files changed, 14 insertions(+), 53 deletions(-) diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index c2dd8f4b69..4618d15b68 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,13 +4,10 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule 
-@register_moe_calibration("DeepseekV3MoE") +@MoECalibrationModule.register("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index bfaa8e21c7..1f2ef9b77f 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,14 +11,11 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("Llama4TextMoe") +@MoECalibrationModule.register("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 0216b651ae..fb8bd86732 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -14,22 +14,20 @@ import contextlib from abc import ABC -from typing import Dict, Type import torch +from compressed_tensors.registry import RegistryMixin from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", - "MOE_CALIBRATION_MODULES", - "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module): +class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): """ Abstract base class for MoE calibration modules. 
@@ -62,32 +60,6 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) -# Registry: module class name -> calibration module class -MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} - - -def register_moe_calibration(module_class_name: str): - """ - Decorator to register a MoE calibration module. - - Usage: - @register_moe_calibration("DeepseekV3MoE") - class CalibrationDeepseekV3MoE(MoECalibrationModule): - ... - - Args: - module_class_name: The class name of the original module to replace - """ - - def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: - if not issubclass(cls, MoECalibrationModule): - raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") - MOE_CALIBRATION_MODULES[module_class_name] = cls - return cls - - return decorator - - @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -127,9 +99,10 @@ def moe_calibration_context( # Step 1: Collect all MoE modules that need replacement logger.debug("Entering MoE calibration context") modules_to_replace = [] + moe_class_names = MoECalibrationModule.registered_names() for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in MOE_CALIBRATION_MODULES: + if class_name in moe_class_names: modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -138,8 +111,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - calibration_cls = MOE_CALIBRATION_MODULES[class_name] - replacement = calibration_cls( + replacement = MoECalibrationModule.load_from_registry( + class_name, module, model.config, calibrate_all_experts=calibrate_all_experts, diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 5432b731bb..678e32f10c 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ 
b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,13 +20,10 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3MoeSparseMoeBlock") +@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 9eeba6dfc1..97cc221758 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,14 +4,11 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +@MoECalibrationModule.register("CalibrationQwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all From 78f08a308c1a8b433549161da13a11a70b8e805b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:17:23 -0400 Subject: [PATCH 05/10] Revert "use registry mixin" This reverts commit 6dd03202f2e262a726125b955098e6769dec1f85. 
--- src/llmcompressor/modeling/deepseek_v3.py | 7 ++-- src/llmcompressor/modeling/llama4.py | 7 ++-- src/llmcompressor/modeling/moe_context.py | 39 ++++++++++++++++++---- src/llmcompressor/modeling/qwen3_moe.py | 7 ++-- src/llmcompressor/modeling/qwen3_vl_moe.py | 7 ++-- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index 4618d15b68..c2dd8f4b69 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,10 +4,13 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) -@MoECalibrationModule.register("DeepseekV3MoE") +@register_moe_calibration("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index 1f2ef9b77f..bfaa8e21c7 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,11 +11,14 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -@MoECalibrationModule.register("Llama4TextMoe") +@register_moe_calibration("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. 
diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index fb8bd86732..0216b651ae 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -14,20 +14,22 @@ import contextlib from abc import ABC +from typing import Dict, Type import torch -from compressed_tensors.registry import RegistryMixin from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", + "MOE_CALIBRATION_MODULES", + "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): +class MoECalibrationModule(ABC, torch.nn.Module): """ Abstract base class for MoE calibration modules. @@ -60,6 +62,32 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) +# Registry: module class name -> calibration module class +MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} + + +def register_moe_calibration(module_class_name: str): + """ + Decorator to register a MoE calibration module. + + Usage: + @register_moe_calibration("DeepseekV3MoE") + class CalibrationDeepseekV3MoE(MoECalibrationModule): + ... 
+ + Args: + module_class_name: The class name of the original module to replace + """ + + def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: + if not issubclass(cls, MoECalibrationModule): + raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") + MOE_CALIBRATION_MODULES[module_class_name] = cls + return cls + + return decorator + + @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -99,10 +127,9 @@ def moe_calibration_context( # Step 1: Collect all MoE modules that need replacement logger.debug("Entering MoE calibration context") modules_to_replace = [] - moe_class_names = MoECalibrationModule.registered_names() for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in moe_class_names: + if class_name in MOE_CALIBRATION_MODULES: modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -111,8 +138,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - replacement = MoECalibrationModule.load_from_registry( - class_name, + calibration_cls = MOE_CALIBRATION_MODULES[class_name] + replacement = calibration_cls( module, model.config, calibrate_all_experts=calibrate_all_experts, diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 678e32f10c..5432b731bb 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,10 +20,13 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) -@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") +@register_moe_calibration("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ 
Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 97cc221758..9eeba6dfc1 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,11 +4,14 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -@MoECalibrationModule.register("CalibrationQwen3VLMoeTextSparseMoeBlock") +@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all From bb39dacad1c71d7947bb85d541b559a7e4e40230 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:18:15 -0400 Subject: [PATCH 06/10] remove file Signed-off-by: Kyle Sayers --- tests/llmcompressor/modeling/test_calib_granite4.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/llmcompressor/modeling/test_calib_granite4.py diff --git a/tests/llmcompressor/modeling/test_calib_granite4.py b/tests/llmcompressor/modeling/test_calib_granite4.py deleted file mode 100644 index e69de29bb2..0000000000 From 8091a43681f5de274798eb8d14c2cb932f2c3901 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 30 Oct 2025 14:47:45 +0000 Subject: [PATCH 07/10] fix issues, use registry Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/__init__.py | 7 ++ src/llmcompressor/modeling/deepseek_v3.py | 7 +- src/llmcompressor/modeling/llama4.py | 7 +- src/llmcompressor/modeling/moe_context.py | 49 +++----------- src/llmcompressor/modeling/qwen3_moe.py | 7 +- src/llmcompressor/modeling/qwen3_vl_moe.py | 9 +-- 
.../modeling/test_calib_qwen3_vl_moe.py | 10 +-- .../pipelines/test_ptq_weights.py | 66 +++++++++++++++++++ 8 files changed, 96 insertions(+), 66 deletions(-) create mode 100644 tests/llmcompressor/pipelines/test_ptq_weights.py diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py index 9d105d823a..b1aba40835 100644 --- a/src/llmcompressor/modeling/__init__.py +++ b/src/llmcompressor/modeling/__init__.py @@ -9,5 +9,12 @@ needed for efficient compression. """ +# trigger registration +from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 +from .llama4 import SequentialLlama4TextMoe # noqa: F401 +from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 +from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 +# TODO: add granite4, Qwen3Next + from .fuse import * from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index c2dd8f4b69..4618d15b68 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,13 +4,10 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("DeepseekV3MoE") +@MoECalibrationModule.register("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. 
diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index bfaa8e21c7..1f2ef9b77f 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,14 +11,11 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("Llama4TextMoe") +@MoECalibrationModule.register("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 0216b651ae..fea0e057c8 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -8,28 +8,25 @@ Key components: - MoECalibrationModule: Abstract base class for calibration modules -- MOE_CALIBRATION_MODULES: Registry mapping module class names to calibration classes - moe_calibration_context: Context manager that applies calibration to a model """ import contextlib from abc import ABC -from typing import Dict, Type import torch +from compressed_tensors.registry import RegistryMixin, standardize_lookup_name from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", - "MOE_CALIBRATION_MODULES", - "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module): +class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): """ Abstract base class for MoE calibration modules. 
@@ -62,32 +59,6 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) -# Registry: module class name -> calibration module class -MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} - - -def register_moe_calibration(module_class_name: str): - """ - Decorator to register a MoE calibration module. - - Usage: - @register_moe_calibration("DeepseekV3MoE") - class CalibrationDeepseekV3MoE(MoECalibrationModule): - ... - - Args: - module_class_name: The class name of the original module to replace - """ - - def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: - if not issubclass(cls, MoECalibrationModule): - raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") - MOE_CALIBRATION_MODULES[module_class_name] = cls - return cls - - return decorator - - @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -115,12 +86,6 @@ def moe_calibration_context( model(**batch) # Model is now restored (unless permanent) """ - # trigger registration - from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 - from .llama4 import SequentialLlama4TextMoe # noqa: F401 - from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 - from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 - # TODO: add granite4, Qwen3Next replaced = {} @@ -129,7 +94,7 @@ def moe_calibration_context( modules_to_replace = [] for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in MOE_CALIBRATION_MODULES: + if _is_registered(class_name, MoECalibrationModule): modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -138,8 +103,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - calibration_cls = MOE_CALIBRATION_MODULES[class_name] - replacement = calibration_cls( + replacement = 
MoECalibrationModule.load_from_registry( + class_name, module, model.config, calibrate_all_experts=calibrate_all_experts, @@ -172,3 +137,7 @@ def moe_calibration_context( if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) + + +def _is_registered(name: str, subclass: RegistryMixin): + return standardize_lookup_name(name) in subclass.registered_names() diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 5432b731bb..678e32f10c 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,13 +20,10 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3MoeSparseMoeBlock") +@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. 
diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 9eeba6dfc1..3637525954 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,14 +4,11 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +@MoECalibrationModule.register("Qwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all @@ -118,7 +115,7 @@ def replace( calibrate_all_experts: bool, ): return CalibrateQwen3VLMoeTextSparseMoeBlock( - config=config.get_text_config(), original=original, + config=config, calibrate_all_experts=calibrate_all_experts, ) diff --git a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py index c34951755b..46694a38db 100644 --- a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py +++ b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py @@ -1,5 +1,5 @@ import torch -from transformers import Qwen3VLMoeTextConfig +from transformers import Qwen3VLMoeConfig from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( Qwen3VLMoeTextSparseMoeBlock, ) @@ -10,10 +10,10 @@ @requires_gpu -def test_calib_qwen3_moe_module(): - config = Qwen3VLMoeTextConfig() +def test_calib_qwen3_vl_moe_module(): + config = Qwen3VLMoeConfig() with torch.device("cuda"): - original = Qwen3VLMoeTextSparseMoeBlock(config).eval() + original = Qwen3VLMoeTextSparseMoeBlock(config.get_text_config()).eval() # these are initialized as empty / all 0s which results in outputs # from the experts 
being all 0 # update to use a small random value @@ -21,7 +21,7 @@ def test_calib_qwen3_moe_module(): original.experts.down_proj.data.normal_(mean=0.0, std=0.02) # Create dummy input tensor that simulates hidden_states - hidden_dim = config.hidden_size + hidden_dim = config.get_text_config().hidden_size batch, seq_len = 4, 32 sample = torch.randn(batch, seq_len, hidden_dim, device="cuda") diff --git a/tests/llmcompressor/pipelines/test_ptq_weights.py b/tests/llmcompressor/pipelines/test_ptq_weights.py new file mode 100644 index 0000000000..1e13a75d6d --- /dev/null +++ b/tests/llmcompressor/pipelines/test_ptq_weights.py @@ -0,0 +1,66 @@ +import os + +import pytest +import torch +from safetensors.torch import load_file + +from llmcompressor import oneshot, ptq_weights +from llmcompressor.modifiers.quantization import QuantizationModifier +from tests.testing_utils import requires_gpu + + +@requires_gpu +@pytest.mark.parametrize("scheme", ["FP8_dynamic", "NVFP4A16"]) +def test_weights_ptq_e2e(scheme, tmp_path): + model = "nm-testing/tinysmokellama-3.2" + ptq_ignore = ["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"] + oneshot_ignore = ["lm_head"] + device = "cuda:0" + + ptq_outdir = tmp_path / "weights_out" + oneshot_outdir = tmp_path / "oneshot_out" + + ptq_weights( + model, + ptq_outdir, + scheme=scheme, + max_workers=2, + device=device, + ignore=ptq_ignore, + ) + + oneshot( + model=model, + recipe=QuantizationModifier( + targets="Linear", scheme=scheme, ignore=oneshot_ignore + ), + output_dir=oneshot_outdir, + ) + + ptq_st_files = _get_safetensors_files(ptq_outdir) + oneshot_st_files = _get_safetensors_files(oneshot_outdir) + assert set(ptq_st_files) == set(oneshot_st_files) + + for file_name in ptq_st_files: + _assert_safetensors_equal(ptq_outdir / file_name, oneshot_outdir / file_name) + + +def _get_safetensors_files(dir_path: str) -> list[str]: + return [ + file_name + for file_name in os.listdir(dir_path) + if file_name.endswith("safetensors") 
+ ] + + +def _assert_safetensors_equal(a_path: str, b_path: str) -> bool: + a = load_file(a_path) + b = load_file(b_path) + + assert a.keys() == b.keys(), (a.keys() - b.keys(), b.keys() - a.keys()) + + for key in a.keys(): + value_equal = torch.equal(a[key].to(torch.bfloat16), b[key].to(torch.bfloat16)) + dtype_equal = a[key].dtype == b[key].dtype + + assert value_equal and dtype_equal, (key, value_equal, dtype_equal) From d1a8ee2fbfd98dbbd2d716c7195482ca51107084 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 30 Oct 2025 18:28:35 +0000 Subject: [PATCH 08/10] remove accidentally added file Signed-off-by: Kyle Sayers --- .../pipelines/test_ptq_weights.py | 66 ------------------- 1 file changed, 66 deletions(-) delete mode 100644 tests/llmcompressor/pipelines/test_ptq_weights.py diff --git a/tests/llmcompressor/pipelines/test_ptq_weights.py b/tests/llmcompressor/pipelines/test_ptq_weights.py deleted file mode 100644 index 1e13a75d6d..0000000000 --- a/tests/llmcompressor/pipelines/test_ptq_weights.py +++ /dev/null @@ -1,66 +0,0 @@ -import os - -import pytest -import torch -from safetensors.torch import load_file - -from llmcompressor import oneshot, ptq_weights -from llmcompressor.modifiers.quantization import QuantizationModifier -from tests.testing_utils import requires_gpu - - -@requires_gpu -@pytest.mark.parametrize("scheme", ["FP8_dynamic", "NVFP4A16"]) -def test_weights_ptq_e2e(scheme, tmp_path): - model = "nm-testing/tinysmokellama-3.2" - ptq_ignore = ["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"] - oneshot_ignore = ["lm_head"] - device = "cuda:0" - - ptq_outdir = tmp_path / "weights_out" - oneshot_outdir = tmp_path / "oneshot_out" - - ptq_weights( - model, - ptq_outdir, - scheme=scheme, - max_workers=2, - device=device, - ignore=ptq_ignore, - ) - - oneshot( - model=model, - recipe=QuantizationModifier( - targets="Linear", scheme=scheme, ignore=oneshot_ignore - ), - output_dir=oneshot_outdir, - ) - - ptq_st_files = 
_get_safetensors_files(ptq_outdir) - oneshot_st_files = _get_safetensors_files(oneshot_outdir) - assert set(ptq_st_files) == set(oneshot_st_files) - - for file_name in ptq_st_files: - _assert_safetensors_equal(ptq_outdir / file_name, oneshot_outdir / file_name) - - -def _get_safetensors_files(dir_path: str) -> list[str]: - return [ - file_name - for file_name in os.listdir(dir_path) - if file_name.endswith("safetensors") - ] - - -def _assert_safetensors_equal(a_path: str, b_path: str) -> bool: - a = load_file(a_path) - b = load_file(b_path) - - assert a.keys() == b.keys(), (a.keys() - b.keys(), b.keys() - a.keys()) - - for key in a.keys(): - value_equal = torch.equal(a[key].to(torch.bfloat16), b[key].to(torch.bfloat16)) - dtype_equal = a[key].dtype == b[key].dtype - - assert value_equal and dtype_equal, (key, value_equal, dtype_equal) From 215b95cd17ce187aaa6b5dd5cedcd2fdaa5dbc13 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 7 Nov 2025 19:35:21 +0000 Subject: [PATCH 09/10] rebase Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/__init__.py | 1 + src/llmcompressor/modeling/moe_context.py | 2 +- src/llmcompressor/modeling/qwen3_vl_moe.py | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py index b1aba40835..1aeec42b06 100644 --- a/src/llmcompressor/modeling/__init__.py +++ b/src/llmcompressor/modeling/__init__.py @@ -14,6 +14,7 @@ from .llama4 import SequentialLlama4TextMoe # noqa: F401 from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 +from .qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock # noqa: F401 # TODO: add granite4, Qwen3Next from .fuse import * diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index fea0e057c8..6e96c4e2df 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ 
b/src/llmcompressor/modeling/moe_context.py @@ -133,7 +133,7 @@ def moe_calibration_context( yield finally: # Step 2: Restore non-permanent modules - for name, (_original, replacement) in replaced.items(): + for name, (original, replacement) in replaced.items(): if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 3637525954..c162c6d0c8 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -15,6 +15,8 @@ class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): experts. """ + is_permanent = True + def __init__( self, original: OriginalQwen3VLMoeTextSparseMoeBlock, @@ -82,6 +84,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: next_states = next_states.reshape(batch_size, sequence_length, hidden_dim) return next_states, router_logits + def restore(self, original: torch.nn.Module) -> torch.nn.Module: + return original + class SequentialQwen3VLMoeTextExperts(torch.nn.ModuleList): def __init__(self, config, original): From 822c5a6caa1f3fb86c504e4dd8af098edeb6674e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 7 Nov 2025 20:11:34 +0000 Subject: [PATCH 10/10] remove register fn Signed-off-by: Kyle Sayers --- examples/multimodal_vision/llama4_example.py | 2 +- src/llmcompressor/modeling/qwen3_next_moe.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/multimodal_vision/llama4_example.py b/examples/multimodal_vision/llama4_example.py index 53b98621f3..c17ec01a1f 100644 --- a/examples/multimodal_vision/llama4_example.py +++ b/examples/multimodal_vision/llama4_example.py @@ -19,7 +19,7 @@ # NOTE: This restructuring is specifically required for vLLM compatibility. 
# To define custom calibration logic, create a new calibration module in # modeling/llama4.py that inherits from `MoECalibrationModule`, and register -# it using the `@register_moe_calibration` decorator with the appropriate +# it using the `@MoECalibrationModule.register` decorator with the appropriate # module class name (e.g., "Llama4TextMoe"). DATASET_ID = "neuralmagic/calibration" diff --git a/src/llmcompressor/modeling/qwen3_next_moe.py b/src/llmcompressor/modeling/qwen3_next_moe.py index cf11a84d08..823ca779b0 100644 --- a/src/llmcompressor/modeling/qwen3_next_moe.py +++ b/src/llmcompressor/modeling/qwen3_next_moe.py @@ -16,13 +16,10 @@ import torch -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3NextSparseMoeBlock") +@MoECalibrationModule.register("Qwen3NextSparseMoeBlock") class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule): from transformers import Qwen3NextConfig from transformers.models.qwen3_next.modeling_qwen3_next import (