From 3ba4f00f440e483f15e7ce9bb3f3322c828ad1d6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 27 Oct 2025 12:00:40 -0400 Subject: [PATCH 01/10] apply patch Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/llama4.py | 6 +-- src/llmcompressor/modeling/prepare.py | 39 ++++++----------- src/llmcompressor/modeling/qwen3_vl_moe.py | 43 +++++++++++++++---- .../modeling/test_calib_qwen3_vl_moe.py | 21 +++++---- 4 files changed, 59 insertions(+), 50 deletions(-) diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index 2b49a652af..bfaa8e21c7 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -38,10 +38,8 @@ def __init__( calibrate_all_experts: bool = True, ): super().__init__() - # Extract text config from multimodal config if needed - text_config = ( - config.get_text_config() if hasattr(config, "get_text_config") else config - ) + # Extract text config from multimodal config + text_config: Llama4TextConfig = config.get_text_config() self.top_k = text_config.num_experts_per_tok self.hidden_dim = text_config.hidden_size self.num_experts = text_config.num_local_experts diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index 42173bb8b5..10b92f2603 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -10,33 +10,20 @@ from compressed_tensors.utils import deprecated, replace_module from transformers import PreTrainedModel -# Import MoE calibration modules to trigger registration -from llmcompressor.modeling.deepseek_v3 import ( # noqa: F401 - CalibrationDeepseekV3MoE, -) -from llmcompressor.modeling.deepseek_v3 import ( - replace as replace_deepseekv3, -) -from llmcompressor.modeling.llama4 import ( # noqa: F401 - SequentialLlama4TextMoe, -) -from llmcompressor.modeling.llama4 import ( - replace as replace_llama4, -) -from llmcompressor.modeling.moe_context import ( # noqa: F401 - 
moe_calibration_context, -) -from llmcompressor.modeling.qwen3_moe import ( # noqa: F401 - CalibrationQwen3MoeSparseMoeBlock, -) -from llmcompressor.modeling.qwen3_next_moe import ( # noqa: F401 - CalibrationQwen3NextSparseMoeBlock, -) -from llmcompressor.modeling.qwen3_vl_moe import ( - replace as replace_Qwen3VLMoE, -) +# deprecated replacement functions +from llmcompressor.modeling.deepseek_v3 import replace as replace_deepseekv3 +from llmcompressor.modeling.llama4 import replace as replace_llama4 +from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE + +# trigger registration +from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 +from .llama4 import SequentialLlama4TextMoe # noqa: F401 +from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 +from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 + +# TODO: add granite4, Qwen3Next -__all__ = ["moe_calibration_context", "replace_modules_for_calibration"] +__all__ = ["replace_modules_for_calibration"] # ---------------------- module replacements; permanent ------------------------- replacements = { diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 5af6b7abf3..9eeba6dfc1 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -1,19 +1,40 @@ import torch - +from transformers import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, +) + +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -class LinearQwen3VLMoeTextSparseMoeBlock(torch.nn.Module): - def __init__(self, config, original, calibrate_all_experts): +@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +class 
CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): + """ + Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all + experts. + """ + + def __init__( + self, + original: OriginalQwen3VLMoeTextSparseMoeBlock, + config: Qwen3VLMoeConfig, + calibrate_all_experts: bool, + ): super().__init__() - self.hidden_size = config.hidden_size - self.num_experts = config.num_experts + text_config: Qwen3VLMoeTextConfig = config.get_text_config() + + self.hidden_size = text_config.hidden_size + self.num_experts = text_config.num_experts self.top_k = original.top_k # Note: gate was changed to be a Linear layer in transformers==4.57.0 # https://github.com/JJJYmmm/transformers/commit/f5dea1c694af8c994c769170813a8702332119ee self.gate = original.gate self.calibrate_all_experts = calibrate_all_experts - self.experts = SequentialQwen3VLMoeTextExperts(config, original.experts) + self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, sequence_length, hidden_dim = hidden_states.shape @@ -91,9 +112,13 @@ def __init__(self, config, original): self[i].down_proj.weight.data = down.t().clone().contiguous() -def replace(config, module, calibrate_all_experts): - return LinearQwen3VLMoeTextSparseMoeBlock( +def replace( + config: Qwen3VLMoeConfig, + original: OriginalQwen3VLMoeTextSparseMoeBlock, + calibrate_all_experts: bool, +): + return CalibrateQwen3VLMoeTextSparseMoeBlock( config=config.get_text_config(), - original=module, + original=original, calibrate_all_experts=calibrate_all_experts, ) diff --git a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py index 513fb8cd73..c34951755b 100644 --- a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py +++ b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py @@ -1,17 +1,16 @@ import torch +from transformers import Qwen3VLMoeTextConfig +from 
transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextSparseMoeBlock, +) -from llmcompressor.modeling.qwen3_vl_moe import LinearQwen3VLMoeTextSparseMoeBlock +from llmcompressor.modeling.qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock from llmcompressor.utils.helpers import calibration_forward_context from tests.testing_utils import requires_gpu @requires_gpu -def test_calib_qwen3_vl_moe_module(): - from transformers import Qwen3VLMoeTextConfig - from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - Qwen3VLMoeTextSparseMoeBlock, - ) - +def test_calib_qwen3_moe_module(): config = Qwen3VLMoeTextConfig() with torch.device("cuda"): original = Qwen3VLMoeTextSparseMoeBlock(config).eval() @@ -29,16 +28,16 @@ def test_calib_qwen3_vl_moe_module(): with calibration_forward_context(original): true_output = original(sample) - module = LinearQwen3VLMoeTextSparseMoeBlock( - config, original, calibrate_all_experts=True + module = CalibrateQwen3VLMoeTextSparseMoeBlock( + original, config, calibrate_all_experts=True ) with calibration_forward_context(module): output = module(sample) assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10 assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10 - module = LinearQwen3VLMoeTextSparseMoeBlock( - config, original, calibrate_all_experts=False + module = CalibrateQwen3VLMoeTextSparseMoeBlock( + original, config, calibrate_all_experts=False ) with calibration_forward_context(module): output = module(sample) From 2cf2aa9b12c13a234144cd3d631f9fb76a39cd5d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:01:19 -0400 Subject: [PATCH 02/10] change log level Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/moe_context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 232b271c01..a25b2fa0ec 100644 --- 
a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -118,7 +118,7 @@ def moe_calibration_context( replaced = {} # Step 1: Collect all MoE modules that need replacement - logger.info("Entering MoE calibration context") + logger.debug("Entering MoE calibration context") modules_to_replace = [] for name, module in model.named_modules(): class_name = module.__class__.__name__ @@ -161,7 +161,7 @@ def moe_calibration_context( yield finally: # Step 2: Restore non-permanent modules - for name, (original, replacement) in replaced.items(): + for name, (original, replacement) in replaced.items(): if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) From fc34db696930dfc99220d60505280b06706d2c40 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:08:11 -0400 Subject: [PATCH 03/10] move registration Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/moe_context.py | 7 +++++++ src/llmcompressor/modeling/prepare.py | 8 -------- tests/llmcompressor/modeling/test_calib_granite4.py | 0 3 files changed, 7 insertions(+), 8 deletions(-) create mode 100644 tests/llmcompressor/modeling/test_calib_granite4.py diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index a25b2fa0ec..0216b651ae 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -115,6 +115,13 @@ def moe_calibration_context( model(**batch) # Model is now restored (unless permanent) """ + # trigger registration + from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 + from .llama4 import SequentialLlama4TextMoe # noqa: F401 + from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 + from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 + # TODO: add granite4, Qwen3Next + replaced = {} # Step 1: Collect all MoE modules that need replacement diff --git 
a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index 10b92f2603..af9920d1b8 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -15,14 +15,6 @@ from llmcompressor.modeling.llama4 import replace as replace_llama4 from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE -# trigger registration -from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 -from .llama4 import SequentialLlama4TextMoe # noqa: F401 -from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 -from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 - -# TODO: add granite4, Qwen3Next - __all__ = ["replace_modules_for_calibration"] # ---------------------- module replacements; permanent ------------------------- diff --git a/tests/llmcompressor/modeling/test_calib_granite4.py b/tests/llmcompressor/modeling/test_calib_granite4.py new file mode 100644 index 0000000000..e69de29bb2 From 4e78ae46fce829304d787c147b41d1ff1ee4a627 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:14:27 -0400 Subject: [PATCH 04/10] use registry mixin Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/deepseek_v3.py | 7 ++-- src/llmcompressor/modeling/llama4.py | 7 ++-- src/llmcompressor/modeling/moe_context.py | 39 ++++------------------ src/llmcompressor/modeling/qwen3_moe.py | 7 ++-- src/llmcompressor/modeling/qwen3_vl_moe.py | 7 ++-- 5 files changed, 14 insertions(+), 53 deletions(-) diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index c2dd8f4b69..4618d15b68 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,13 +4,10 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule 
-@register_moe_calibration("DeepseekV3MoE") +@MoECalibrationModule.register("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index bfaa8e21c7..1f2ef9b77f 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,14 +11,11 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("Llama4TextMoe") +@MoECalibrationModule.register("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 0216b651ae..fb8bd86732 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -14,22 +14,20 @@ import contextlib from abc import ABC -from typing import Dict, Type import torch +from compressed_tensors.registry import RegistryMixin from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", - "MOE_CALIBRATION_MODULES", - "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module): +class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): """ Abstract base class for MoE calibration modules. 
@@ -62,32 +60,6 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) -# Registry: module class name -> calibration module class -MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} - - -def register_moe_calibration(module_class_name: str): - """ - Decorator to register a MoE calibration module. - - Usage: - @register_moe_calibration("DeepseekV3MoE") - class CalibrationDeepseekV3MoE(MoECalibrationModule): - ... - - Args: - module_class_name: The class name of the original module to replace - """ - - def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: - if not issubclass(cls, MoECalibrationModule): - raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") - MOE_CALIBRATION_MODULES[module_class_name] = cls - return cls - - return decorator - - @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -127,9 +99,10 @@ def moe_calibration_context( # Step 1: Collect all MoE modules that need replacement logger.debug("Entering MoE calibration context") modules_to_replace = [] + moe_class_names = MoECalibrationModule.registered_names() for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in MOE_CALIBRATION_MODULES: + if class_name in moe_class_names: modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -138,8 +111,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - calibration_cls = MOE_CALIBRATION_MODULES[class_name] - replacement = calibration_cls( + replacement = MoECalibrationModule.load_from_registry( + class_name, module, model.config, calibrate_all_experts=calibrate_all_experts, diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 5432b731bb..678e32f10c 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ 
b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,13 +20,10 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3MoeSparseMoeBlock") +@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 9eeba6dfc1..97cc221758 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,14 +4,11 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +@MoECalibrationModule.register("CalibrationQwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all From 78f08a308c1a8b433549161da13a11a70b8e805b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:17:23 -0400 Subject: [PATCH 05/10] Revert "use registry mixin" This reverts commit 6dd03202f2e262a726125b955098e6769dec1f85. 
--- src/llmcompressor/modeling/deepseek_v3.py | 7 ++-- src/llmcompressor/modeling/llama4.py | 7 ++-- src/llmcompressor/modeling/moe_context.py | 39 ++++++++++++++++++---- src/llmcompressor/modeling/qwen3_moe.py | 7 ++-- src/llmcompressor/modeling/qwen3_vl_moe.py | 7 ++-- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index 4618d15b68..c2dd8f4b69 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,10 +4,13 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) -@MoECalibrationModule.register("DeepseekV3MoE") +@register_moe_calibration("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index 1f2ef9b77f..bfaa8e21c7 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,11 +11,14 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -@MoECalibrationModule.register("Llama4TextMoe") +@register_moe_calibration("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. 
diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index fb8bd86732..0216b651ae 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -14,20 +14,22 @@ import contextlib from abc import ABC +from typing import Dict, Type import torch -from compressed_tensors.registry import RegistryMixin from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", + "MOE_CALIBRATION_MODULES", + "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): +class MoECalibrationModule(ABC, torch.nn.Module): """ Abstract base class for MoE calibration modules. @@ -60,6 +62,32 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) +# Registry: module class name -> calibration module class +MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} + + +def register_moe_calibration(module_class_name: str): + """ + Decorator to register a MoE calibration module. + + Usage: + @register_moe_calibration("DeepseekV3MoE") + class CalibrationDeepseekV3MoE(MoECalibrationModule): + ... 
+ + Args: + module_class_name: The class name of the original module to replace + """ + + def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: + if not issubclass(cls, MoECalibrationModule): + raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") + MOE_CALIBRATION_MODULES[module_class_name] = cls + return cls + + return decorator + + @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -99,10 +127,9 @@ def moe_calibration_context( # Step 1: Collect all MoE modules that need replacement logger.debug("Entering MoE calibration context") modules_to_replace = [] - moe_class_names = MoECalibrationModule.registered_names() for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in moe_class_names: + if class_name in MOE_CALIBRATION_MODULES: modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -111,8 +138,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - replacement = MoECalibrationModule.load_from_registry( - class_name, + calibration_cls = MOE_CALIBRATION_MODULES[class_name] + replacement = calibration_cls( module, model.config, calibrate_all_experts=calibrate_all_experts, diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 678e32f10c..5432b731bb 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,10 +20,13 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) -@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") +@register_moe_calibration("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ 
Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 97cc221758..9eeba6dfc1 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,11 +4,14 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import MoECalibrationModule +from llmcompressor.modeling.moe_context import ( + MoECalibrationModule, + register_moe_calibration, +) from llmcompressor.utils.dev import skip_weights_initialize -@MoECalibrationModule.register("CalibrationQwen3VLMoeTextSparseMoeBlock") +@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all From bb39dacad1c71d7947bb85d541b559a7e4e40230 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Oct 2025 18:18:15 -0400 Subject: [PATCH 06/10] remove file Signed-off-by: Kyle Sayers --- tests/llmcompressor/modeling/test_calib_granite4.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/llmcompressor/modeling/test_calib_granite4.py diff --git a/tests/llmcompressor/modeling/test_calib_granite4.py b/tests/llmcompressor/modeling/test_calib_granite4.py deleted file mode 100644 index e69de29bb2..0000000000 From 8091a43681f5de274798eb8d14c2cb932f2c3901 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 30 Oct 2025 14:47:45 +0000 Subject: [PATCH 07/10] fix issues, use registry Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/__init__.py | 7 ++ src/llmcompressor/modeling/deepseek_v3.py | 7 +- src/llmcompressor/modeling/llama4.py | 7 +- src/llmcompressor/modeling/moe_context.py | 49 +++----------- src/llmcompressor/modeling/qwen3_moe.py | 7 +- src/llmcompressor/modeling/qwen3_vl_moe.py | 9 +-- 
.../modeling/test_calib_qwen3_vl_moe.py | 10 +-- .../pipelines/test_ptq_weights.py | 66 +++++++++++++++++++ 8 files changed, 96 insertions(+), 66 deletions(-) create mode 100644 tests/llmcompressor/pipelines/test_ptq_weights.py diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py index 9d105d823a..b1aba40835 100644 --- a/src/llmcompressor/modeling/__init__.py +++ b/src/llmcompressor/modeling/__init__.py @@ -9,5 +9,12 @@ needed for efficient compression. """ +# trigger registration +from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 +from .llama4 import SequentialLlama4TextMoe # noqa: F401 +from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 +from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 +# TODO: add granite4, Qwen3Next + from .fuse import * from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index c2dd8f4b69..4618d15b68 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -4,13 +4,10 @@ DeepseekV3MoE as OriginalDeepseekV3MoE, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("DeepseekV3MoE") +@MoECalibrationModule.register("DeepseekV3MoE") class CalibrationDeepseekV3MoE(MoECalibrationModule): """ Calibration version of DeepseekV3MoE that sends all tokens to all experts. 
diff --git a/src/llmcompressor/modeling/llama4.py b/src/llmcompressor/modeling/llama4.py index bfaa8e21c7..1f2ef9b77f 100644 --- a/src/llmcompressor/modeling/llama4.py +++ b/src/llmcompressor/modeling/llama4.py @@ -11,14 +11,11 @@ Llama4TextMoe, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("Llama4TextMoe") +@MoECalibrationModule.register("Llama4TextMoe") class SequentialLlama4TextMoe(MoECalibrationModule): """ Calibration version of Llama4TextMoe that unpacks experts for sequential processing. diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index 0216b651ae..fea0e057c8 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ b/src/llmcompressor/modeling/moe_context.py @@ -8,28 +8,25 @@ Key components: - MoECalibrationModule: Abstract base class for calibration modules -- MOE_CALIBRATION_MODULES: Registry mapping module class names to calibration classes - moe_calibration_context: Context manager that applies calibration to a model """ import contextlib from abc import ABC -from typing import Dict, Type import torch +from compressed_tensors.registry import RegistryMixin, standardize_lookup_name from loguru import logger from tqdm import tqdm from transformers import PreTrainedModel __all__ = [ "MoECalibrationModule", - "MOE_CALIBRATION_MODULES", - "register_moe_calibration", "moe_calibration_context", ] -class MoECalibrationModule(ABC, torch.nn.Module): +class MoECalibrationModule(ABC, torch.nn.Module, RegistryMixin): """ Abstract base class for MoE calibration modules. 
@@ -62,32 +59,6 @@ def restore(self, original: torch.nn.Module) -> torch.nn.Module: ) -# Registry: module class name -> calibration module class -MOE_CALIBRATION_MODULES: Dict[str, Type[MoECalibrationModule]] = {} - - -def register_moe_calibration(module_class_name: str): - """ - Decorator to register a MoE calibration module. - - Usage: - @register_moe_calibration("DeepseekV3MoE") - class CalibrationDeepseekV3MoE(MoECalibrationModule): - ... - - Args: - module_class_name: The class name of the original module to replace - """ - - def decorator(cls: Type[MoECalibrationModule]) -> Type[MoECalibrationModule]: - if not issubclass(cls, MoECalibrationModule): - raise TypeError(f"{cls.__name__} must inherit from MoECalibrationModule") - MOE_CALIBRATION_MODULES[module_class_name] = cls - return cls - - return decorator - - @contextlib.contextmanager def moe_calibration_context( model: PreTrainedModel, @@ -115,12 +86,6 @@ def moe_calibration_context( model(**batch) # Model is now restored (unless permanent) """ - # trigger registration - from .deepseek_v3 import CalibrationDeepseekV3MoE # noqa: F401 - from .llama4 import SequentialLlama4TextMoe # noqa: F401 - from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 - from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 - # TODO: add granite4, Qwen3Next replaced = {} @@ -129,7 +94,7 @@ def moe_calibration_context( modules_to_replace = [] for name, module in model.named_modules(): class_name = module.__class__.__name__ - if class_name in MOE_CALIBRATION_MODULES: + if _is_registered(class_name, MoECalibrationModule): modules_to_replace.append((name, module, class_name)) # Step 2: Replace modules with progress bar @@ -138,8 +103,8 @@ def moe_calibration_context( for name, module, class_name in tqdm( modules_to_replace, desc="Replacing MoE modules for calibration" ): - calibration_cls = MOE_CALIBRATION_MODULES[class_name] - replacement = calibration_cls( + replacement = 
MoECalibrationModule.load_from_registry( + class_name, module, model.config, calibrate_all_experts=calibrate_all_experts, @@ -172,3 +137,7 @@ def moe_calibration_context( if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) + + +def _is_registered(name: str, subclass: RegistryMixin): + return standardize_lookup_name(name) in subclass.registered_names() diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py index 5432b731bb..678e32f10c 100644 --- a/src/llmcompressor/modeling/qwen3_moe.py +++ b/src/llmcompressor/modeling/qwen3_moe.py @@ -20,13 +20,10 @@ Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3MoeSparseMoeBlock") +@MoECalibrationModule.register("Qwen3MoeSparseMoeBlock") class CalibrationQwen3MoeSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3MoeSparseMoeBlock that sends all tokens to all experts. 
diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 9eeba6dfc1..3637525954 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -4,14 +4,11 @@ Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock, ) -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule from llmcompressor.utils.dev import skip_weights_initialize -@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock") +@MoECalibrationModule.register("Qwen3VLMoeTextSparseMoeBlock") class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): """ Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all @@ -118,7 +115,7 @@ def replace( calibrate_all_experts: bool, ): return CalibrateQwen3VLMoeTextSparseMoeBlock( - config=config.get_text_config(), original=original, + config=config, calibrate_all_experts=calibrate_all_experts, ) diff --git a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py index c34951755b..46694a38db 100644 --- a/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py +++ b/tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py @@ -1,5 +1,5 @@ import torch -from transformers import Qwen3VLMoeTextConfig +from transformers import Qwen3VLMoeConfig from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( Qwen3VLMoeTextSparseMoeBlock, ) @@ -10,10 +10,10 @@ @requires_gpu -def test_calib_qwen3_moe_module(): - config = Qwen3VLMoeTextConfig() +def test_calib_qwen3_vl_moe_module(): + config = Qwen3VLMoeConfig() with torch.device("cuda"): - original = Qwen3VLMoeTextSparseMoeBlock(config).eval() + original = Qwen3VLMoeTextSparseMoeBlock(config.get_text_config()).eval() # these are initialized as empty / all 0s which results in outputs # from the experts 
being all 0 # update to use a small random value @@ -21,7 +21,7 @@ def test_calib_qwen3_moe_module(): original.experts.down_proj.data.normal_(mean=0.0, std=0.02) # Create dummy input tensor that simulates hidden_states - hidden_dim = config.hidden_size + hidden_dim = config.get_text_config().hidden_size batch, seq_len = 4, 32 sample = torch.randn(batch, seq_len, hidden_dim, device="cuda") diff --git a/tests/llmcompressor/pipelines/test_ptq_weights.py b/tests/llmcompressor/pipelines/test_ptq_weights.py new file mode 100644 index 0000000000..1e13a75d6d --- /dev/null +++ b/tests/llmcompressor/pipelines/test_ptq_weights.py @@ -0,0 +1,66 @@ +import os + +import pytest +import torch +from safetensors.torch import load_file + +from llmcompressor import oneshot, ptq_weights +from llmcompressor.modifiers.quantization import QuantizationModifier +from tests.testing_utils import requires_gpu + + +@requires_gpu +@pytest.mark.parametrize("scheme", ["FP8_dynamic", "NVFP4A16"]) +def test_weights_ptq_e2e(scheme, tmp_path): + model = "nm-testing/tinysmokellama-3.2" + ptq_ignore = ["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"] + oneshot_ignore = ["lm_head"] + device = "cuda:0" + + ptq_outdir = tmp_path / "weights_out" + oneshot_outdir = tmp_path / "oneshot_out" + + ptq_weights( + model, + ptq_outdir, + scheme=scheme, + max_workers=2, + device=device, + ignore=ptq_ignore, + ) + + oneshot( + model=model, + recipe=QuantizationModifier( + targets="Linear", scheme=scheme, ignore=oneshot_ignore + ), + output_dir=oneshot_outdir, + ) + + ptq_st_files = _get_safetensors_files(ptq_outdir) + oneshot_st_files = _get_safetensors_files(oneshot_outdir) + assert set(ptq_st_files) == set(oneshot_st_files) + + for file_name in ptq_st_files: + _assert_safetensors_equal(ptq_outdir / file_name, oneshot_outdir / file_name) + + +def _get_safetensors_files(dir_path: str) -> list[str]: + return [ + file_name + for file_name in os.listdir(dir_path) + if file_name.endswith("safetensors") 
+ ] + + +def _assert_safetensors_equal(a_path: str, b_path: str) -> bool: + a = load_file(a_path) + b = load_file(b_path) + + assert a.keys() == b.keys(), (a.keys() - b.keys(), b.keys() - a.keys()) + + for key in a.keys(): + value_equal = torch.equal(a[key].to(torch.bfloat16), b[key].to(torch.bfloat16)) + dtype_equal = a[key].dtype == b[key].dtype + + assert value_equal and dtype_equal, (key, value_equal, dtype_equal) From d1a8ee2fbfd98dbbd2d716c7195482ca51107084 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 30 Oct 2025 18:28:35 +0000 Subject: [PATCH 08/10] remove accidentally added file Signed-off-by: Kyle Sayers --- .../pipelines/test_ptq_weights.py | 66 ------------------- 1 file changed, 66 deletions(-) delete mode 100644 tests/llmcompressor/pipelines/test_ptq_weights.py diff --git a/tests/llmcompressor/pipelines/test_ptq_weights.py b/tests/llmcompressor/pipelines/test_ptq_weights.py deleted file mode 100644 index 1e13a75d6d..0000000000 --- a/tests/llmcompressor/pipelines/test_ptq_weights.py +++ /dev/null @@ -1,66 +0,0 @@ -import os - -import pytest -import torch -from safetensors.torch import load_file - -from llmcompressor import oneshot, ptq_weights -from llmcompressor.modifiers.quantization import QuantizationModifier -from tests.testing_utils import requires_gpu - - -@requires_gpu -@pytest.mark.parametrize("scheme", ["FP8_dynamic", "NVFP4A16"]) -def test_weights_ptq_e2e(scheme, tmp_path): - model = "nm-testing/tinysmokellama-3.2" - ptq_ignore = ["model.embed_tokens.weight", "lm_head.weight", "re:.*norm.weight$"] - oneshot_ignore = ["lm_head"] - device = "cuda:0" - - ptq_outdir = tmp_path / "weights_out" - oneshot_outdir = tmp_path / "oneshot_out" - - ptq_weights( - model, - ptq_outdir, - scheme=scheme, - max_workers=2, - device=device, - ignore=ptq_ignore, - ) - - oneshot( - model=model, - recipe=QuantizationModifier( - targets="Linear", scheme=scheme, ignore=oneshot_ignore - ), - output_dir=oneshot_outdir, - ) - - ptq_st_files = 
_get_safetensors_files(ptq_outdir) - oneshot_st_files = _get_safetensors_files(oneshot_outdir) - assert set(ptq_st_files) == set(oneshot_st_files) - - for file_name in ptq_st_files: - _assert_safetensors_equal(ptq_outdir / file_name, oneshot_outdir / file_name) - - -def _get_safetensors_files(dir_path: str) -> list[str]: - return [ - file_name - for file_name in os.listdir(dir_path) - if file_name.endswith("safetensors") - ] - - -def _assert_safetensors_equal(a_path: str, b_path: str) -> bool: - a = load_file(a_path) - b = load_file(b_path) - - assert a.keys() == b.keys(), (a.keys() - b.keys(), b.keys() - a.keys()) - - for key in a.keys(): - value_equal = torch.equal(a[key].to(torch.bfloat16), b[key].to(torch.bfloat16)) - dtype_equal = a[key].dtype == b[key].dtype - - assert value_equal and dtype_equal, (key, value_equal, dtype_equal) From 215b95cd17ce187aaa6b5dd5cedcd2fdaa5dbc13 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 7 Nov 2025 19:35:21 +0000 Subject: [PATCH 09/10] rebase Signed-off-by: Kyle Sayers --- src/llmcompressor/modeling/__init__.py | 1 + src/llmcompressor/modeling/moe_context.py | 2 +- src/llmcompressor/modeling/qwen3_vl_moe.py | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py index b1aba40835..1aeec42b06 100644 --- a/src/llmcompressor/modeling/__init__.py +++ b/src/llmcompressor/modeling/__init__.py @@ -14,6 +14,7 @@ from .llama4 import SequentialLlama4TextMoe # noqa: F401 from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401 from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401 +from .qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock # noqa: F401 # TODO: add granite4, Qwen3Next from .fuse import * diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py index fea0e057c8..6e96c4e2df 100644 --- a/src/llmcompressor/modeling/moe_context.py +++ 
b/src/llmcompressor/modeling/moe_context.py @@ -133,7 +133,7 @@ def moe_calibration_context( yield finally: # Step 2: Restore non-permanent modules - for name, (_original, replacement) in replaced.items(): + for name, (original, replacement) in replaced.items(): if not replacement.is_permanent: restored = replacement.restore(original) model.set_submodule(name, restored) diff --git a/src/llmcompressor/modeling/qwen3_vl_moe.py b/src/llmcompressor/modeling/qwen3_vl_moe.py index 3637525954..c162c6d0c8 100644 --- a/src/llmcompressor/modeling/qwen3_vl_moe.py +++ b/src/llmcompressor/modeling/qwen3_vl_moe.py @@ -15,6 +15,8 @@ class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule): experts. """ + is_permanent = True + def __init__( self, original: OriginalQwen3VLMoeTextSparseMoeBlock, @@ -82,6 +84,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: next_states = next_states.reshape(batch_size, sequence_length, hidden_dim) return next_states, router_logits + def restore(self, original: torch.nn.Module) -> torch.nn.Module: + return original + class SequentialQwen3VLMoeTextExperts(torch.nn.ModuleList): def __init__(self, config, original): From 822c5a6caa1f3fb86c504e4dd8af098edeb6674e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 7 Nov 2025 20:11:34 +0000 Subject: [PATCH 10/10] remove register fn Signed-off-by: Kyle Sayers --- examples/multimodal_vision/llama4_example.py | 2 +- src/llmcompressor/modeling/qwen3_next_moe.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/multimodal_vision/llama4_example.py b/examples/multimodal_vision/llama4_example.py index 53b98621f3..c17ec01a1f 100644 --- a/examples/multimodal_vision/llama4_example.py +++ b/examples/multimodal_vision/llama4_example.py @@ -19,7 +19,7 @@ # NOTE: This restructuring is specifically required for vLLM compatibility. 
# To define custom calibration logic, create a new calibration module in # modeling/llama4.py that inherits from `MoECalibrationModule`, and register -# it using the `@register_moe_calibration` decorator with the appropriate +# it using the `@MoECalibrationModule.register` decorator with the appropriate # module class name (e.g., "Llama4TextMoe"). DATASET_ID = "neuralmagic/calibration" diff --git a/src/llmcompressor/modeling/qwen3_next_moe.py b/src/llmcompressor/modeling/qwen3_next_moe.py index cf11a84d08..823ca779b0 100644 --- a/src/llmcompressor/modeling/qwen3_next_moe.py +++ b/src/llmcompressor/modeling/qwen3_next_moe.py @@ -16,13 +16,10 @@ import torch -from llmcompressor.modeling.moe_context import ( - MoECalibrationModule, - register_moe_calibration, -) +from llmcompressor.modeling.moe_context import MoECalibrationModule -@register_moe_calibration("Qwen3NextSparseMoeBlock") +@MoECalibrationModule.register("Qwen3NextSparseMoeBlock") class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule): from transformers import Qwen3NextConfig from transformers.models.qwen3_next.modeling_qwen3_next import (