Support AMD-specific autotune parameters: waves_per_eu and matrix_instr_nonkdim (#1162)

choijon5 · web-flow · commit b9925d9dd5c1 · 2025-11-23T16:48:54.000-08:00
diff --git a/helion/_compat.py b/helion/_compat.py
@@ -2,6 +2,7 @@
 
 import contextlib
 import functools
+import re
 from typing import Any
 from typing import Callable
 from typing import cast
@@ -286,3 +287,22 @@ def warps_to_threads(num_warps: int) -> int:
         )
         return num_warps * (props.warp_size or 32)
     return num_warps * 32
+
+
+@functools.cache
+def supports_amd_cdna_tunables() -> bool:
+    """Report whether the current GPU falls in the AMD CDNA architecture range.
+
+    The answer is memoized by ``functools.cache``, so the device is probed at
+    most once per process and reflects whichever device was current on the
+    first call.
+
+    Returns:
+        True when PyTorch is a HIP build, a GPU is available, and the base of
+        the device's ``gcnArchName`` is exactly ``gfx`` followed by three hex
+        digits whose value is at least 0x908. False in every other case,
+        including when querying the device raises.
+    """
+    if torch.version.hip is None or not torch.cuda.is_available():
+        return False
+    try:
+        props = torch.cuda.get_device_properties(torch.cuda.current_device())
+        arch = getattr(props, "gcnArchName", None)
+        if arch is None:
+            return False
+        # Extract base architecture (e.g., "gfx942" from "gfx942:sramecc+:xnack-")
+        # CDNA architectures are gfx908 and above but less than gfx1000
+        # Reference: https://llvm.org/docs/AMDGPUUsage.html
+        base_arch = arch.split(":")[0]
+        # fullmatch enforces the "less than gfx1000" bound explicitly: a
+        # four-character target such as "gfx1100" is rejected outright instead
+        # of being judged on its first three hex digits.
+        match = re.fullmatch(r"gfx([0-9a-f]{3})", base_arch)
+        return match is not None and int(match.group(1), 16) >= 0x908
+    except Exception:
+        # Best-effort probe: any failure while inspecting the device is
+        # treated as "not supported" rather than propagated.
+        return False
diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
@@ -670,6 +670,9 @@ def codegen_function_call(self) -> ast.AST:
                 if x.startswith("_triton_config_")
             ]
         )
+        for key in ("waves_per_eu", "matrix_instr_nonkdim"):
+            if key in self.config:
+                args.append(f"{key}={self.config[key]}")
         pid = self.pid
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -24,6 +24,7 @@
 import triton
 
 from ._compat import get_tensor_descriptor_fn_name
+from ._compat import supports_amd_cdna_tunables
 from ._utils import counters
 from .autotuner.benchmarking import compute_repeat
 from .autotuner.benchmarking import interleaved_bench
@@ -37,6 +38,13 @@
     from .runtime.kernel import Kernel
 
 
+def _strip_amd_launcher_args(value: str) -> str:
+    """Delete AMD launcher keyword arguments from a block of text.
+
+    When ``supports_amd_cdna_tunables()`` is false the text is returned
+    unchanged. Otherwise every ``, waves_per_eu=<digits>`` occurrence is
+    removed, followed by every ``, matrix_instr_nonkdim=<digits>`` occurrence.
+    """
+    if supports_amd_cdna_tunables():
+        # Substitutions run one after the other, in this order.
+        for pattern in (r", waves_per_eu=\d+", r", matrix_instr_nonkdim=\d+"):
+            value = re.sub(pattern, "", value)
+    return value
+
+
 def _get_triton_backend() -> str | None:
     try:
         # pyrefly: ignore [missing-attribute]
@@ -130,6 +138,13 @@ def skipIfRocm(reason: str) -> Callable[[Callable], Callable]:
     return unittest.skipIf(torch.version.hip is not None, reason)
 
 
+def skipUnlessAMDCDNA(reason: str) -> Callable[[Callable], Callable]:
+    """Skip test unless running on AMD CDNA architecture.
+
+    Args:
+        reason: Message passed to ``unittest.skipUnless`` for the skip.
+
+    Returns:
+        The decorator produced by ``unittest.skipUnless``.
+    """
+    # Relies on the module-level import of supports_amd_cdna_tunables.
+    return unittest.skipUnless(supports_amd_cdna_tunables(), reason)
+
+
 def skipIfXPU(reason: str) -> Callable[[Callable], Callable]:
     """Skip test if running with Intel XPU"""
     return unittest.skipIf(torch.xpu.is_available(), reason)
@@ -1029,7 +1044,9 @@ def assertExpectedJournal(self, value: str) -> None:
         Note:
             Use EXPECTTEST_ACCEPT=1 environment variable to update expected outputs.
         """
+        value = _strip_amd_launcher_args(value)
         value, expected = self._expected_journal.lookup(self.id(), value)
+        expected = _strip_amd_launcher_args(expected)
         self.assertMultiLineEqual(
             value,
             expected,
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
@@ -8,6 +8,7 @@
 
 from torch._inductor.runtime.runtime_utils import next_power_of_2
 
+from .._compat import supports_amd_cdna_tunables
 from .._compat import supports_tensor_descriptor
 from ..exc import InvalidConfig
 from .block_id_sequence import BlockIdSequence
@@ -34,6 +35,7 @@
 
 DEFAULT_NUM_WARPS = 4
 DEFAULT_NUM_STAGES = 1
+AMD_CDNA_TUNABLES = ("waves_per_eu", "matrix_instr_nonkdim")
 VALID_KEYS: frozenset[str] = frozenset(
     [
         "block_sizes",
@@ -52,10 +54,13 @@
         "pid_type",
         "indexing",
         "load_eviction_policies",
+        *AMD_CDNA_TUNABLES,
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
 VALID_EVICTION_POLICIES = ("", "first", "last")
+VALID_WAVES_PER_EU = (1, 2, 3, 4)
+VALID_MATRIX_INSTR_NONKDIM = (0, 16, 32)
 
 
 @dataclasses.dataclass
@@ -112,6 +117,20 @@ class ConfigSpec:
             length=0,
         )
     )
+    waves_per_eu: ConfigSpecFragment | None = dataclasses.field(
+        default_factory=lambda: (
+            EnumFragment(choices=VALID_WAVES_PER_EU)
+            if supports_amd_cdna_tunables()
+            else None
+        )
+    )
+    matrix_instr_nonkdim: ConfigSpecFragment | None = dataclasses.field(
+        default_factory=lambda: (
+            EnumFragment(choices=VALID_MATRIX_INSTR_NONKDIM)
+            if supports_amd_cdna_tunables()
+            else None
+        )
+    )
 
     @staticmethod
     def _valid_indexing_types() -> tuple[IndexingLiteral, ...]:
@@ -226,6 +245,12 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             "load_eviction_policies", self.load_eviction_policies.default()
         )
         config.setdefault("indexing", self.indexing.default())
+        for key in AMD_CDNA_TUNABLES:
+            if (fragment := getattr(self, key)) is not None:
+                config.setdefault(key, fragment.default())
+            elif key in config:
+                raise InvalidConfig(f"{key} is not supported on this target hardware")
+
         # TODO(jansel): include num_ctas and max_nreg
 
         for name, values in (("pid_type", VALID_PID_TYPES),):
diff --git a/test/test_amd_cdna.py b/test/test_amd_cdna.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import torch
+
+import helion
+from helion._compiler.compile_environment import CompileEnvironment
+from helion._testing import DEVICE
+from helion._testing import TestCase
+from helion._testing import code_and_output
+from helion._testing import skipUnlessAMDCDNA
+import helion.language as hl
+
+
+class TestAMDCDNA(TestCase):
+    """Tests for the ``waves_per_eu`` and ``matrix_instr_nonkdim`` config keys."""
+
+    @skipUnlessAMDCDNA("Test requires AMD CDNA GPU (MI200/MI300 series)")
+    def test_amd_cdna_tunables_in_kernel(self) -> None:
+        """A kernel configured with both tunables runs and emits them in its code."""
+        tuned_config = helion.Config(
+            block_sizes=[32, 32],
+            waves_per_eu=2,
+            matrix_instr_nonkdim=16,
+        )
+
+        @helion.kernel(autotune_effort="none", config=tuned_config)
+        def add_kernel(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            result = torch.empty_like(x)
+            for tile in hl.tile(x.shape):
+                result[tile] = x[tile] + y[tile]
+            return result
+
+        lhs = torch.randn(128, 128, device=DEVICE, dtype=torch.float32)
+        rhs = torch.randn(128, 128, device=DEVICE, dtype=torch.float32)
+
+        code, result = code_and_output(add_kernel, (lhs, rhs))
+        torch.testing.assert_close(result, lhs + rhs)
+
+        # Verify that the tunables are passed to Triton
+        for launcher_arg in ("waves_per_eu=2", "matrix_instr_nonkdim=16"):
+            self.assertIn(launcher_arg, code)
+
+    def test_amd_tunables_error_when_not_supported(self) -> None:
+        """Normalizing a config that sets either tunable raises when support is patched off."""
+        device = torch.device("cuda")
+        settings = helion.Settings()
+
+        # (config kwargs, expected error regex), checked in this order.
+        rejected_cases = (
+            (
+                {"waves_per_eu": 2},
+                "waves_per_eu is not supported on this target hardware",
+            ),
+            (
+                {"matrix_instr_nonkdim": 16},
+                "matrix_instr_nonkdim is not supported on this target hardware",
+            ),
+        )
+
+        with patch(
+            "helion.autotuner.config_spec.supports_amd_cdna_tunables",
+            return_value=False,
+        ):
+            # The environment is built while the patch is active.
+            env = CompileEnvironment(device, settings)
+
+            for config_kwargs, message in rejected_cases:
+                config = helion.Config(**config_kwargs)
+                with self.assertRaisesRegex(helion.exc.InvalidConfig, message):
+                    env.config_spec.normalize(config)
diff --git a/test/test_examples_dist.py b/test/test_examples_dist.py
@@ -12,6 +12,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import import_path
+from helion._testing import skipIfRocm
 
 
 @instantiate_parametrized_tests
@@ -43,6 +44,7 @@ def _init_process(self):
         )
         torch.manual_seed(42 + self.rank)
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skip_if_lt_x_gpu(4)
     def test_all_gather_matmul(self):
         self._init_process()
@@ -100,6 +102,7 @@ def test_all_gather_matmul(self):
         torch.cuda.current_stream().wait_stream(backend_stream)
         dist.destroy_process_group()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skip_if_lt_x_gpu(4)
     def test_all_reduce(self):
         self._init_process()

Original file line number	Diff line number	Diff line change
`@@ -670,6 +670,9 @@ def codegen_function_call(self) -> ast.AST:`
`670`	`670`	`if x.startswith("_triton_config_")`
`671`	`671`	`]`
`672`	`672`	`)`
	`673`	`+ for key in ("waves_per_eu", "matrix_instr_nonkdim"):`
	`674`	`+ if key in self.config:`
	`675`	`+ args.append(f"{key}={self.config[key]}")`
`673`	`676`	`pid = self.pid`
`674`	`677`	`assert pid is not None`
`675`	`678`	`# TODO(jansel): we should run CSE this statement`