
Commit 3ba4f00

apply patch
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 1c85a66 commit 3ba4f00

File tree

4 files changed: +59 -50 lines changed


src/llmcompressor/modeling/llama4.py

Lines changed: 2 additions & 4 deletions
@@ -38,10 +38,8 @@ def __init__(
         calibrate_all_experts: bool = True,
     ):
         super().__init__()
-        # Extract text config from multimodal config if needed
-        text_config = (
-            config.get_text_config() if hasattr(config, "get_text_config") else config
-        )
+        # Extract text config from multimodal config
+        text_config: Llama4TextConfig = config.get_text_config()
         self.top_k = text_config.num_experts_per_tok
         self.hidden_dim = text_config.hidden_size
         self.num_experts = text_config.num_local_experts
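
The removed guard existed to handle text-only configs that lack get_text_config(). In recent transformers releases, PretrainedConfig.get_text_config() is defined for both composite and text-only configs, returning the nested text config or the config itself, which is what makes the simplification safe. A minimal sketch of that assumption (config classes and behavior come from transformers, not from this patch):

from transformers import Llama4Config, Llama4TextConfig

# Composite multimodal config: get_text_config() returns the nested text config.
config = Llama4Config()
assert isinstance(config.get_text_config(), Llama4TextConfig)

# Text-only config: get_text_config() returns the config itself,
# so the hasattr() fallback removed above is no longer needed.
text_config = Llama4TextConfig()
assert text_config.get_text_config() is text_config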

src/llmcompressor/modeling/prepare.py

Lines changed: 13 additions & 26 deletions
@@ -10,33 +10,20 @@
 from compressed_tensors.utils import deprecated, replace_module
 from transformers import PreTrainedModel

-# Import MoE calibration modules to trigger registration
-from llmcompressor.modeling.deepseek_v3 import (  # noqa: F401
-    CalibrationDeepseekV3MoE,
-)
-from llmcompressor.modeling.deepseek_v3 import (
-    replace as replace_deepseekv3,
-)
-from llmcompressor.modeling.llama4 import (  # noqa: F401
-    SequentialLlama4TextMoe,
-)
-from llmcompressor.modeling.llama4 import (
-    replace as replace_llama4,
-)
-from llmcompressor.modeling.moe_context import (  # noqa: F401
-    moe_calibration_context,
-)
-from llmcompressor.modeling.qwen3_moe import (  # noqa: F401
-    CalibrationQwen3MoeSparseMoeBlock,
-)
-from llmcompressor.modeling.qwen3_next_moe import (  # noqa: F401
-    CalibrationQwen3NextSparseMoeBlock,
-)
-from llmcompressor.modeling.qwen3_vl_moe import (
-    replace as replace_Qwen3VLMoE,
-)
+# deprecated replacement functions
+from llmcompressor.modeling.deepseek_v3 import replace as replace_deepseekv3
+from llmcompressor.modeling.llama4 import replace as replace_llama4
+from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE
+
+# trigger registration
+from .deepseek_v3 import CalibrationDeepseekV3MoE  # noqa: F401
+from .llama4 import SequentialLlama4TextMoe  # noqa: F401
+from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock  # noqa: F401
+from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock  # noqa: F401
+
+# TODO: add granite4, Qwen3Next

-__all__ = ["moe_calibration_context", "replace_modules_for_calibration"]
+__all__ = ["replace_modules_for_calibration"]

 # ---------------------- module replacements; permanent -------------------------
 replacements = {
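
The reorganized imports separate two concerns: the deprecated replace helpers still used by replace_modules_for_calibration, and imports kept purely for their side effect of registering calibration modules (hence the unused-import noqa markers). A hedged sketch of the name-keyed registry pattern those imports rely on; the real llmcompressor.modeling.moe_context implementation may differ:

# Illustrative registry only; names mirror the imports above.
MOE_CALIBRATION_MODULES: dict[str, type] = {}

def register_moe_calibration(name: str):
    def decorator(cls: type) -> type:
        MOE_CALIBRATION_MODULES[name] = cls  # populated at import time
        return cls
    return decorator

Importing a module that defines a decorated class is enough to populate such a registry, which is why the classes under "# trigger registration" are imported but never referenced directly.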

src/llmcompressor/modeling/qwen3_vl_moe.py

Lines changed: 34 additions & 9 deletions
@@ -1,19 +1,40 @@
 import torch
-
+from transformers import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig
+from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
+    Qwen3VLMoeTextSparseMoeBlock as OriginalQwen3VLMoeTextSparseMoeBlock,
+)
+
+from llmcompressor.modeling.moe_context import (
+    MoECalibrationModule,
+    register_moe_calibration,
+)
 from llmcompressor.utils.dev import skip_weights_initialize


-class LinearQwen3VLMoeTextSparseMoeBlock(torch.nn.Module):
-    def __init__(self, config, original, calibrate_all_experts):
+@register_moe_calibration("CalibrationQwen3VLMoeTextSparseMoeBlock")
+class CalibrateQwen3VLMoeTextSparseMoeBlock(MoECalibrationModule):
+    """
+    Calibration version of Qwen3VLMoeTextSparseMoeBlock that sends all tokens to all
+    experts.
+    """
+
+    def __init__(
+        self,
+        original: OriginalQwen3VLMoeTextSparseMoeBlock,
+        config: Qwen3VLMoeConfig,
+        calibrate_all_experts: bool,
+    ):
         super().__init__()
-        self.hidden_size = config.hidden_size
-        self.num_experts = config.num_experts
+        text_config: Qwen3VLMoeTextConfig = config.get_text_config()
+
+        self.hidden_size = text_config.hidden_size
+        self.num_experts = text_config.num_experts
         self.top_k = original.top_k
         # Note: gate was changed to be a Linear layer in transformers==4.57.0
         # https://github.com/JJJYmmm/transformers/commit/f5dea1c694af8c994c769170813a8702332119ee
         self.gate = original.gate
         self.calibrate_all_experts = calibrate_all_experts
-        self.experts = SequentialQwen3VLMoeTextExperts(config, original.experts)
+        self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts)

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
@@ -91,9 +112,13 @@ def __init__(self, config, original):
             self[i].down_proj.weight.data = down.t().clone().contiguous()


-def replace(config, module, calibrate_all_experts):
-    return LinearQwen3VLMoeTextSparseMoeBlock(
+def replace(
+    config: Qwen3VLMoeConfig,
+    original: OriginalQwen3VLMoeTextSparseMoeBlock,
+    calibrate_all_experts: bool,
+):
+    return CalibrateQwen3VLMoeTextSparseMoeBlock(
         config=config.get_text_config(),
-        original=module,
+        original=original,
         calibrate_all_experts=calibrate_all_experts,
     )
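
Per the new docstring, the calibration block runs every expert on every token during calibration while still combining only the router-selected outputs, so observers and hooks see activations for all experts without changing the block's result. A rough sketch of that idea, not the file's actual forward():

import torch

def sketch_moe_forward(hidden, gate, experts, top_k, calibrate_all_experts=True):
    # hidden: (num_tokens, hidden_dim); gate: Linear(hidden_dim, num_experts)
    scores = torch.nn.functional.softmax(gate(hidden), dim=-1)
    topk_scores, topk_idx = scores.topk(top_k, dim=-1)

    out = torch.zeros_like(hidden)
    for i, expert in enumerate(experts):
        routed = (topk_idx == i).any(dim=-1)         # tokens routed to expert i
        if calibrate_all_experts:
            expert_out = expert(hidden)[routed]      # all tokens pass through expert i
        else:
            expert_out = expert(hidden[routed])      # original sparse behavior
        weight = (topk_scores * (topk_idx == i)).sum(-1)[routed]
        out[routed] += expert_out * weight.unsqueeze(-1)
    return out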

tests/llmcompressor/modeling/test_calib_qwen3_vl_moe.py

Lines changed: 10 additions & 11 deletions
@@ -1,17 +1,16 @@
 import torch
+from transformers import Qwen3VLMoeTextConfig
+from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
+    Qwen3VLMoeTextSparseMoeBlock,
+)

-from llmcompressor.modeling.qwen3_vl_moe import LinearQwen3VLMoeTextSparseMoeBlock
+from llmcompressor.modeling.qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock
 from llmcompressor.utils.helpers import calibration_forward_context
 from tests.testing_utils import requires_gpu


 @requires_gpu
-def test_calib_qwen3_vl_moe_module():
-    from transformers import Qwen3VLMoeTextConfig
-    from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
-        Qwen3VLMoeTextSparseMoeBlock,
-    )
-
+def test_calib_qwen3_moe_module():
     config = Qwen3VLMoeTextConfig()
     with torch.device("cuda"):
         original = Qwen3VLMoeTextSparseMoeBlock(config).eval()
@@ -29,16 +28,16 @@ def test_calib_qwen3_vl_moe_module():
     with calibration_forward_context(original):
         true_output = original(sample)

-    module = LinearQwen3VLMoeTextSparseMoeBlock(
-        config, original, calibrate_all_experts=True
+    module = CalibrateQwen3VLMoeTextSparseMoeBlock(
+        original, config, calibrate_all_experts=True
     )
     with calibration_forward_context(module):
         output = module(sample)
     assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
     assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10

-    module = LinearQwen3VLMoeTextSparseMoeBlock(
-        config, original, calibrate_all_experts=False
+    module = CalibrateQwen3VLMoeTextSparseMoeBlock(
+        original, config, calibrate_all_experts=False
    )
     with calibration_forward_context(module):
         output = module(sample)
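
Both assertions pass because only the routed experts' outputs are combined into the result; calibrate_all_experts changes which tokens each expert processes, not what the block returns. A hedged sketch of exercising the same module through the updated replace() helper on CPU; the small config values are illustrative assumptions about the Qwen3-VL-MoE text config, not values from the test:

import torch
from transformers import Qwen3VLMoeTextConfig
from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
    Qwen3VLMoeTextSparseMoeBlock,
)

from llmcompressor.modeling.qwen3_vl_moe import replace

# hidden_size/num_experts appear in the diff; the other field names are assumptions
config = Qwen3VLMoeTextConfig(
    hidden_size=64, moe_intermediate_size=32, num_experts=4, num_experts_per_tok=2
)
original = Qwen3VLMoeTextSparseMoeBlock(config).eval()
calib = replace(config, original, calibrate_all_experts=True)

with torch.no_grad():
    sample = torch.randn(1, 8, config.hidden_size)
    assert torch.nn.functional.mse_loss(original(sample)[0], calib(sample)[0]) < 1e-10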
