File tree (3 files changed: +8 −9 lines changed)
transformer_engine/pytorch/ops (3 files changed: +8 −9 lines changed)
@@ -901,15 +901,15 @@ def _test_basic_linear(
901901 dtype = dtype ,
902902 accumulate_into_main_grad = accumulate_into_main_grad ,
903903 )
904+ forward = te_ops .Sequential (
905+ te_ops .Quantize (forward = quantized_input , backward = quantized_grad_input ),
906+ op ,
907+ te_ops .Quantize (forward = quantized_output , backward = quantized_grad_output ),
908+ )
904909 with torch .no_grad ():
905910 op .weight .copy_ (w_test )
906911 del w_test
907912 op .weight .main_grad = torch .full_like (op .weight , 0.5 , dtype = torch .float32 )
908- forward = te_ops .Sequential (
909- te_ops .Quantize (forward = quantized_input , backward = quantized_grad_input ),
910- op ,
911- te_ops .Quantize (forward = quantized_output , backward = quantized_grad_output ),
912- )
913913 with te .autocast (enabled = quantized_compute , recipe = recipe ):
914914 y_test = forward (x_test )
915915 y_test .backward (dy_test )
@@ -137,8 +137,10 @@ def __init__(
137137 out_features = out_features ,
138138 )
139139
140- # Whether weight tensor is natively quantized
140+ # Initialize recipe state if needed for natively quantized weight
141141 self ._with_quantized_weight : bool = FP8GlobalStateManager .with_fp8_parameters ()
142+ if self ._with_quantized_weight :
143+ self .reset_recipe_state (recipe = FP8GlobalStateManager .get_fp8_recipe ())
142144
143145 # Initialize parameters if needed
144146 weight = torch .empty (
@@ -188,9 +188,6 @@ def __init__(self) -> None:
188188 # Objects for quantization
189189 self ._fp8_metas : Optional [dict [str , dict [str , Any ]]] = None
190190 self ._quantizers : Optional [dict [str , list [Quantizer ]]] = None
191- with_fp8_parameters = FP8GlobalStateManager .with_fp8_parameters ()
192- recipe = FP8GlobalStateManager .get_fp8_recipe () if with_fp8_parameters else None
193- self .reset_recipe_state (recipe = recipe )
194191
195192 @property
196193 def is_fused_op (self ) -> bool :
You can’t perform that action at this time.
0 commit comments