Commit 8091a43

fix issues, use registry

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

1 parent: bb39dac

8 files changed: +96, -66 lines

src/llmcompressor/modeling/__init__.py
Lines changed: 7 additions & 0 deletions

@@ -9,5 +9,12 @@
 needed for efficient compression.
 """
 
+# trigger registration
+from .deepseek_v3 import CalibrationDeepseekV3MoE  # noqa: F401
+from .llama4 import SequentialLlama4TextMoe  # noqa: F401
+from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock  # noqa: F401
+from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock  # noqa: F401
+# TODO: add granite4, Qwen3Next
+
 from .fuse import *
 from .prepare import *

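These package-level imports exist for their side effect: each model file applies @MoECalibrationModule.register(...) when it is imported, so the registry is populated as soon as llmcompressor.modeling is imported rather than lazily inside moe_calibration_context (where the same imports used to live, see moe_context.py below). A minimal sketch of the pattern, with toy names (Base, ExampleCalibration) that are not part of the codebase; only the RegistryMixin calls mirror the diff:

from compressed_tensors.registry import RegistryMixin


class Base(RegistryMixin):
    """Toy registry parent, standing in for MoECalibrationModule."""


# The decorator runs when the class definition executes, i.e. when the defining
# module is imported -- which is exactly what the __init__ imports trigger.
@Base.register("ExampleBlock")
class ExampleCalibration(Base):
    """Toy calibration class, registered under the original module's class name."""
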
src/llmcompressor/modeling/deepseek_v3.py
Lines changed: 2 additions & 5 deletions

@@ -4,13 +4,10 @@
     DeepseekV3MoE as OriginalDeepseekV3MoE,
 )
 
-from llmcompressor.modeling.moe_context import (
-    MoECalibrationModule,
-    register_moe_calibration,
-)
+from llmcompressor.modeling.moe_context import MoECalibrationModule
 
 
-@register_moe_calibration("DeepseekV3MoE")
+@MoECalibrationModule.register("DeepseekV3MoE")
 class CalibrationDeepseekV3MoE(MoECalibrationModule):
     """
     Calibration version of DeepseekV3MoE that sends all tokens to all experts.

src/llmcompressor/modeling/llama4.py
Lines changed: 2 additions & 5 deletions

@@ -11,14 +11,11 @@
     Llama4TextMoe,
 )
 
-from llmcompressor.modeling.moe_context import (
-    MoECalibrationModule,
-    register_moe_calibration,
-)
+from llmcompressor.modeling.moe_context import MoECalibrationModule
 from llmcompressor.utils.dev import skip_weights_initialize
 
 
-@register_moe_calibration("Llama4TextMoe")
+@MoECalibrationModule.register("Llama4TextMoe")
 class SequentialLlama4TextMoe(MoECalibrationModule):
     """
     Calibration version of Llama4TextMoe that unpacks experts for sequential processing.

src/llmcompressor/modeling/moe_context.py
Lines changed: 9 additions & 40 deletions

@@ -8,28 +8,25 @@
 
 Key components:
 - MoECalibrationModule: Abstract base class for calibration modules
-- MOE_CALIBRATION_MODULES: Registry mapping module class names to calibration classes
 - moe_calibration_context: Context manager that applies calibration to a model
 """
 
 import contextlib
 from abc import ABC
-from typing import Dict, Type
 
 import torch
+from compressed_tensors.registry import RegistryMixin, standardize_lookup_name
 from loguru import logger
 from tqdm import tqdm
 from transformers import PreTrainedModel
 
 __all__ = [
     "MoECalibrationModule",
-    "MOE_CALIBRATION_MODULES",
-    "register_moe_calibration",
     "moe_calibration_context",
 ]
 
 
-class MoECalibrationModule(ABC, torch.nn.Module):
+class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin):
     """
     Abstract base class for MoE calibration modules.
 
@@ -62,32 +59,6 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module:
         )
 
 
-# Registry: module class name -> calibration module class
-MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {}
-
-
-def register_moe_calibration(module_class_name: str):
-    """
-    Decorator to register a MoE calibration module.
-
-    Usage:
-        @register_moe_calibration("DeepseekV3MoE")
-        class CalibrationDeepseekV3MoE(MoECalibrationModule):
-            ...
-
-    Args:
-        module_class_name: The class name of the original module to replace
-    """
-
-    def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]:
-        if not issubclass(cls, MoECalibrationModule):
-            raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule")
-        MOE_CALIBRATION_MODULES[module_class_name] = cls
-        return cls
-
-    return decorator
-
-
 @contextlib.contextmanager
 def moe_calibration_context(
     model: PreTrainedModel,
@@ -115,12 +86,6 @@ def moe_calibration_context(
             model(**batch)
     # Model is now restored (unless permanent)
     """
-    # trigger registration
-    from .deepseek_v3 import CalibrationDeepseekV3MoE  # noqa: F401
-    from .llama4 import SequentialLlama4TextMoe  # noqa: F401
-    from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock  # noqa: F401
-    from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock  # noqa: F401
-    # TODO: add granite4, Qwen3Next
 
     replaced = {}
 
@@ -129,7 +94,7 @@ def moe_calibration_context(
     modules_to_replace = []
     for name, module in model.named_modules():
         class_name = module.__class__.__name__
-        if class_name in MOE_CALIBRATION_MODULES:
+        if _is_registered(class_name, MoECalibrationModule):
             modules_to_replace.append((name, module, class_name))
 
     # Step 2: Replace modules with progress bar
@@ -138,8 +103,8 @@ def moe_calibration_context(
     for name, module, class_name in tqdm(
         modules_to_replace, desc="Replacing MoE modules for calibration"
    ):
-        calibration_cls = MOE_CALIBRATION_MODULES[class_name]
-        replacement = calibration_cls(
+        replacement = MoECalibrationModule.load_from_registry(
+            class_name,
             module,
             model.config,
             calibrate_all_experts=calibrate_all_experts,
@@ -172,3 +137,7 @@ def moe_calibration_context(
             if not replacement.is_permanent:
                 restored = replacement.restore(original)
                 model.set_submodule(name, restored)
+
+
+def _is_registered(name: str, subclass: RegistryMixin):
+    return standardize_lookup_name(name) in subclass.registered_names()

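With the hand-rolled dict gone, moe_calibration_context discovers and builds calibration modules entirely through the shared registry: _is_registered checks membership by standardized name, and load_from_registry resolves the class and constructs it in one call. A small self-contained sketch of that flow, using toy classes (CalibBase, ToyCalib, ToyMoE) that are not part of llm-compressor; only the registry calls mirror the diff:

import torch
from compressed_tensors.registry import RegistryMixin, standardize_lookup_name


class CalibBase(torch.nn.Module, RegistryMixin):
    """Toy stand-in for MoECalibrationModule."""


@CalibBase.register("ToyMoE")
class ToyCalib(CalibBase):
    def __init__(self, original, config=None, calibrate_all_experts=True):
        super().__init__()
        self.original = original
        self.config = config
        self.calibrate_all_experts = calibrate_all_experts


class ToyMoE(torch.nn.Module):
    """Pretend original MoE block, as found via model.named_modules()."""


module = ToyMoE()
class_name = module.__class__.__name__  # "ToyMoE"

# membership check, mirroring the new _is_registered() helper
if standardize_lookup_name(class_name) in CalibBase.registered_names():
    # resolve the calibration class by name and construct it in one call,
    # replacing the old MOE_CALIBRATION_MODULES[class_name] lookup
    replacement = CalibBase.load_from_registry(
        class_name, original=module, calibrate_all_experts=True
    )
    assert isinstance(replacement, ToyCalib)
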
src/llmcompressor/modeling/qwen3_moe.py
Lines changed: 2 additions & 5 deletions

@@ -20,13 +20,10 @@
     Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock,
 )
 
-from llmcompressor.modeling.moe_context import (
-    MoECalibrationModule,
-    register_moe_calibration,
-)
+from llmcompressor.modeling.moe_context import MoECalibrationModule
 
 
-@register_moe_calibration("Qwen3MoeSparseMoeBlock")
+@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock")
 class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule):
     """
     Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts.

src/llmcompressor/modeling/qwen3_vl_moe.py
Lines changed: 3 additions & 6 deletions

@@ -4,14 +4,11 @@
     Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock,
 )
 
-from llmcompressor.modeling.moe_context import (
-    MoECalibrationModule,
-    register_moe_calibration,
-)
+from llmcompressor.modeling.moe_context import MoECalibrationModule
 from llmcompressor.utils.dev import skip_weights_initialize
 
 
-@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock")
+@MoECalibrationModule.register("Qwen3VLMoeTextSparseMoeBlock")
 class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule):
     """
     Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all
@@ -118,7 +115,7 @@ def replace(
     calibrate_all_experts: bool,
 ):
     return CalibrateQwen3VLMoeTextSparseMoeBlock(
-        config=config.get_text_config(),
         original=original,
+        config=config,
         calibrate_all_experts=calibrate_all_experts,
     )

tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py
Lines changed: 5 additions & 5 deletions

@@ -1,5 +1,5 @@
 import torch
-from transformers import Qwen3VLMoeTextConfig
+from transformers import Qwen3VLMoeConfig
 from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
     Qwen3VLMoeTextSparseMoeBlock,
 )
@@ -10,18 +10,18 @@
 
 
 @requires_gpu
-def test_calib_qwen3_moe_module():
-    config = Qwen3VLMoeTextConfig()
+def test_calib_qwen3_vl_moe_module():
+    config = Qwen3VLMoeConfig()
     with torch.device("cuda"):
-        original = Qwen3VLMoeTextSparseMoeBlock(config).eval()
+        original = Qwen3VLMoeTextSparseMoeBlock(config.get_text_config()).eval()
         # these are initialized as empty / all 0s which results in outputs
         # from the experts being all 0
         # update to use a small random value
         original.experts.gate_up_proj.data.normal_(mean=0.0, std=0.02)
         original.experts.down_proj.data.normal_(mean=0.0, std=0.02)
 
     # Create dummy input tensor that simulates hidden_states
-    hidden_dim = config.hidden_size
+    hidden_dim = config.get_text_config().hidden_size
     batch, seq_len = 4, 32
     sample = torch.randn(batch, seq_len, hidden_dim, device="cuda")

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+import os
+
+import pytest
+import torch
+from safetensors.torch import load_file
+
+from llmcompressor import oneshot, ptq_weights
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from tests.testing_utils import requires_gpu
+
+
+@requires_gpu
+@pytest.mark.parametrize("scheme", ["FP8_dynamic", "NVFP4A16"])
+def test_weights_ptq_e2e(scheme, tmp_path):
+    model = "nm-testing/tinysmokellama-3.2"
+    ptq_ignore = ["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"]
+    oneshot_ignore = ["lm_head"]
+    device = "cuda:0"
+
+    ptq_outdir = tmp_path / "weights_out"
+    oneshot_outdir = tmp_path / "oneshot_out"
+
+    ptq_weights(
+        model,
+        ptq_outdir,
+        scheme=scheme,
+        max_workers=2,
+        device=device,
+        ignore=ptq_ignore,
+    )
+
+    oneshot(
+        model=model,
+        recipe=QuantizationModifier(
+            targets="Linear", scheme=scheme, ignore=oneshot_ignore
+        ),
+        output_dir=oneshot_outdir,
+    )
+
+    ptq_st_files = _get_safetensors_files(ptq_outdir)
+    oneshot_st_files = _get_safetensors_files(oneshot_outdir)
+    assert set(ptq_st_files) == set(oneshot_st_files)
+
+    for file_name in ptq_st_files:
+        _assert_safetensors_equal(ptq_outdir / file_name, oneshot_outdir / file_name)
+
+
+def _get_safetensors_files(dir_path: str) -> list[str]:
+    return [
+        file_name
+        for file_name in os.listdir(dir_path)
+        if file_name.endswith("safetensors")
+    ]
+
+
+def _assert_safetensors_equal(a_path: str, b_path: str) -> bool:
+    a = load_file(a_path)
+    b = load_file(b_path)
+
+    assert a.keys() == b.keys(), (a.keys() - b.keys(), b.keys() - a.keys())
+
+    for key in a.keys():
+        value_equal = torch.equal(a[key].to(torch.bfloat16), b[key].to(torch.bfloat16))
+        dtype_equal = a[key].dtype == b[key].dtype
+
+        assert value_equal and dtype_equal, (key, value_equal, dtype_equal)

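The new test above asserts that the standalone ptq_weights entrypoint and an equivalent oneshot run with a QuantizationModifier recipe emit the same set of safetensors files, with matching tensor keys, values, and dtypes. For reference, the ptq_weights call it exercises looks like the following (arguments copied from the test; not an exhaustive description of the API):

from llmcompressor import ptq_weights

ptq_weights(
    "nm-testing/tinysmokellama-3.2",  # model stub used by the test
    "weights_out",                    # output directory
    scheme="FP8_dynamic",             # or "NVFP4A16", per the parametrization
    max_workers=2,
    device="cuda:0",
    ignore=["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"],
)
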