
Commit 5c4cd17

Properly thread through use_triton_kernel (#3155)
1 parent cdf48f0

2 files changed: +29 -1 lines changed

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 27 additions & 1 deletion
@@ -6,10 +6,12 @@
 
 import copy
 import tempfile
+from contextlib import contextmanager
 
 import pytest
 import torch
 import torch.nn as nn
+from torch.profiler import ProfilerActivity, profile
 
 from torchao.prototype.mx_formats.config import (
     MXGemmKernelChoice,
@@ -44,6 +46,23 @@ def run_around_tests():
     torch._dynamo.reset()
 
 
+@contextmanager
+def cuda_kernel_profiler(kernel_pattern):
+    """Context manager for profiling CUDA kernels."""
+    result = {"found": False, "kernel_names": []}
+
+    with profile(activities=[ProfilerActivity.CUDA]) as prof:
+        yield result
+
+    kernel_names = [
+        evt.name
+        for evt in prof.events()
+        if evt.device_type == torch.autograd.DeviceType.CUDA and evt.name
+    ]
+    result["kernel_names"] = kernel_names
+    result["found"] = any(kernel_pattern in name for name in kernel_names)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
@@ -178,7 +197,14 @@ def test_inference_workflow_nvfp4(
 
     x = torch.randn(batch_size, in_features, device="cuda", dtype=inpt_dtype)
     y_ref = m(x)
-    y_mx = m_mx(x)
+
+    if use_triton_kernel and mm_config != NVFP4MMConfig.WEIGHT_ONLY:
+        with cuda_kernel_profiler("quantize_nvfp4_triton_kernel") as result:
+            y_mx = m_mx(x)
+        assert result["found"], "Expected quantize_nvfp4 kernel to be found"
+    else:
+        y_mx = m_mx(x)
+
     sqnr = compute_error(y_ref, y_mx)
 
     if mm_config == NVFP4MMConfig.WEIGHT_ONLY:
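
The new cuda_kernel_profiler helper is general-purpose: it captures every CUDA kernel launched inside the with-block and reports whether any name matches a substring. A minimal standalone sketch, assuming the helper above is importable; the linear layer and the "gemm" pattern are illustrative, not part of the commit:

import torch

# Assumed: cuda_kernel_profiler is the helper defined in the diff above.
model = torch.nn.Linear(64, 64, device="cuda", dtype=torch.bfloat16)
x = torch.randn(32, 64, device="cuda", dtype=torch.bfloat16)

with cuda_kernel_profiler("gemm") as result:
    _ = model(x)

print(result["found"])         # True if any captured kernel name contained "gemm"
print(result["kernel_names"])  # every CUDA kernel name seen by the profiler

Because result is populated only after the profile context exits, the dict must be inspected after the with-block, which is exactly how the test uses it to assert that the Triton quantize kernel actually ran.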

torchao/prototype/mx_formats/inference_workflow.py

Lines changed: 2 additions & 0 deletions
@@ -188,6 +188,8 @@ def _nvfp4_inference_linear_transform(
     if config.mm_config == NVFP4MMConfig.DYNAMIC:
         act_quant_kwargs = QuantizeTensorToNVFP4Kwargs(
             use_dynamic_per_tensor_scale=config.use_dynamic_per_tensor_scale,
+            use_triton_kernel=config.use_triton_kernel,
+            is_swizzled_scales=True,
         )
 
     quantized_weight = NVFP4Tensor.to_nvfp4(
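
With this fix, the use_triton_kernel flag set on the inference config is forwarded into the activation-quantization kwargs for the DYNAMIC path instead of being silently dropped. A hedged end-to-end sketch, assuming NVFP4InferenceConfig is the config consumed by _nvfp4_inference_linear_transform (with the mm_config and use_triton_kernel fields read above) and that it is importable from the mx_formats prototype alongside NVFP4MMConfig; the layer shapes are illustrative:

import torch
import torch.nn as nn
from torchao.quantization import quantize_
# Assumed import path for the config class; field names mirror the
# attributes the transform above reads off `config`.
from torchao.prototype.mx_formats import NVFP4InferenceConfig, NVFP4MMConfig

m = nn.Linear(128, 256, bias=False, device="cuda", dtype=torch.bfloat16)
quantize_(
    m,
    NVFP4InferenceConfig(
        mm_config=NVFP4MMConfig.DYNAMIC,  # DYNAMIC builds act_quant_kwargs above
        use_triton_kernel=True,           # now threaded through to activation quant
    ),
)
y = m(torch.randn(4, 128, device="cuda", dtype=torch.bfloat16))

Note that is_swizzled_scales=True is hardcoded rather than threaded from the config, so dynamically quantized activations always use the swizzled scale layout on this path.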
