Extend eviction policy tests to all indexing types (#833)

oulgen · web-flow · commit b504f18d1928 · 2025-10-08T15:30:51.000-07:00
diff --git a/test/test_eviction_policy.expected b/test/test_eviction_policy.expected
@@ -9,6 +9,79 @@ import triton
 import triton.language as tl
 from helion.runtime import default_launcher as _default_launcher
 
+@triton.jit
+def _helion_kernel_with_eviction(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    v_0 = val_x + val_y
+    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+
+def kernel_with_eviction(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_eviction, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_eviction_policy_in_generated_code_indexing_block_ptr)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_with_eviction(x, y, out, out_size_0, x_size_0, y_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    val_x = tl.load(tl.make_block_ptr(x, [x_size_0], [x_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero')
+    val_y = tl.load(tl.make_block_ptr(y, [y_size_0], [y_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero', eviction_policy='evict_last')
+    v_0 = val_x + val_y
+    tl.store(tl.make_block_ptr(out, [out_size_0], [out_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), v_0, boundary_check=[0])
+
+def kernel_with_eviction(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_eviction, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, out.size(0), x.size(0), y.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_eviction_policy_in_generated_code_indexing_pointer)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_with_eviction(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    v_0 = val_x + val_y
+    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+
+def kernel_with_eviction(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_eviction, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_eviction_policy_in_generated_code_indexing_tensor_descriptor)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
 @triton.jit
 def _helion_kernel_with_eviction(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
     pid_0 = tl.program_id(0)
@@ -34,6 +107,79 @@ import triton
 import triton.language as tl
 from helion.runtime import default_launcher as _default_launcher
 
+@triton.jit
+def _helion_kernel_with_override(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_first')
+    v_0 = val_x + val_y
+    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+
+def kernel_with_override(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_override, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_explicit_eviction_policy_overrides_tunable_indexing_block_ptr)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_with_override(x, y, out, out_size_0, x_size_0, y_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    val_x = tl.load(tl.make_block_ptr(x, [x_size_0], [x_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero', eviction_policy='evict_last')
+    val_y = tl.load(tl.make_block_ptr(y, [y_size_0], [y_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero', eviction_policy='evict_first')
+    v_0 = val_x + val_y
+    tl.store(tl.make_block_ptr(out, [out_size_0], [out_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), v_0, boundary_check=[0])
+
+def kernel_with_override(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_override, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, out.size(0), x.size(0), y.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_explicit_eviction_policy_overrides_tunable_indexing_pointer)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_with_override(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_first')
+    v_0 = val_x + val_y
+    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+
+def kernel_with_override(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_with_override, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_explicit_eviction_policy_overrides_tunable_indexing_tensor_descriptor)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
 @triton.jit
 def _helion_kernel_with_override(x, y, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, _BLOCK_SIZE_0: tl.constexpr):
     pid_0 = tl.program_id(0)
@@ -103,6 +249,29 @@ import triton
 import triton.language as tl
 from helion.runtime import default_launcher as _default_launcher
 
+@triton.jit
+def _helion_copy_with_eviction(x, out, x_size_0, out_stride_0, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val = tl.load(x + indices_0 * x_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    tl.store(out + indices_0 * out_stride_0, val, mask_0)
+
+def copy_with_eviction(x: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_copy_with_eviction, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, out, x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_hl_load_eviction_policy_emitted_indexing_tensor_descriptor)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
 @triton.jit
 def _helion_copy_with_eviction(x, out, x_size_0, out_stride_0, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
     pid_0 = tl.program_id(0)
@@ -144,3 +313,82 @@ def kernel_multiple_loads(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, *,
     _BLOCK_SIZE_0 = 16
     _launcher(_helion_kernel_multiple_loads, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, z, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), z.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
     return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_multiple_loads_different_policies_indexing_block_ptr)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_multiple_loads(x, y, z, out, out_size_0, x_size_0, y_size_0, z_size_0, out_stride_0, x_stride_0, y_stride_0, z_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    val_x = tl.load(tl.make_block_ptr(x, [x_size_0], [x_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero', eviction_policy='evict_first')
+    val_y = tl.load(tl.make_block_ptr(y, [y_size_0], [y_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero', eviction_policy='evict_last')
+    val_z = tl.load(tl.make_block_ptr(z, [z_size_0], [z_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero')
+    v_0 = val_x + val_y
+    v_1 = v_0 + val_z
+    tl.store(tl.make_block_ptr(out, [out_size_0], [out_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), v_1, boundary_check=[0])
+
+def kernel_multiple_loads(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_multiple_loads, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, z, out, out.size(0), x.size(0), y.size(0), z.size(0), out.stride(0), x.stride(0), y.stride(0), z.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_multiple_loads_different_policies_indexing_pointer)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_multiple_loads(x, y, z, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, z_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0, eviction_policy='evict_first')
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    val_z = tl.load(z + indices_0 * z_stride_0, mask_0, other=0)
+    v_0 = val_x + val_y
+    v_1 = v_0 + val_z
+    tl.store(out + indices_0 * out_stride_0, v_1, mask_0)
+
+def kernel_multiple_loads(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_multiple_loads, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, z, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), z.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestEvictionPolicy.test_multiple_loads_different_policies_indexing_tensor_descriptor)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel_multiple_loads(x, y, z, out, x_size_0, out_stride_0, x_stride_0, y_stride_0, z_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    val_x = tl.load(x + indices_0 * x_stride_0, mask_0, other=0, eviction_policy='evict_first')
+    val_y = tl.load(y + indices_0 * y_stride_0, mask_0, other=0, eviction_policy='evict_last')
+    val_z = tl.load(z + indices_0 * z_stride_0, mask_0, other=0)
+    v_0 = val_x + val_y
+    v_1 = v_0 + val_z
+    tl.store(out + indices_0 * out_stride_0, v_1, mask_0)
+
+def kernel_multiple_loads(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_kernel_multiple_loads, (triton.cdiv(x.size(0), _BLOCK_SIZE_0),), x, y, z, out, x.size(0), out.stride(0), x.stride(0), y.stride(0), z.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
diff --git a/test/test_eviction_policy.py b/test/test_eviction_policy.py
@@ -33,9 +33,7 @@ def copy_with_eviction(x: torch.Tensor) -> torch.Tensor:
         x = torch.randn([128], device=DEVICE, dtype=torch.float32)
         code, result = code_and_output(copy_with_eviction, (x,))
         torch.testing.assert_close(result, x)
-        if indexing != "tensor_descriptor":
-            # TODO(oulgen): Update this on a machine that supports tensor_descriptor
-            self.assertExpectedJournal(code)
+        self.assertExpectedJournal(code)
         self.assertIn("eviction_policy", code)
         self.assertIn("evict_last", code)
 
@@ -69,13 +67,18 @@ def kernel_with_loads(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         self.assertIn("first", fragment.inner.choices)
         self.assertIn("last", fragment.inner.choices)
 
-    def test_eviction_policy_in_generated_code(self):
+    @parametrize("indexing", ("pointer", "block_ptr", "tensor_descriptor"))
+    def test_eviction_policy_in_generated_code(self, indexing: str):
         """Test that eviction policies appear in generated code when configured."""
 
+        if indexing == "tensor_descriptor" and not supports_tensor_descriptor():
+            self.skipTest("Tensor descriptor support is required")
+
         @helion.kernel(
             config={
                 "block_size": 16,
                 "load_eviction_policies": ["", "last"],
+                "indexing": indexing,
             }
         )
         def kernel_with_eviction(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
@@ -96,11 +99,16 @@ def kernel_with_eviction(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         self.assertIn("evict_last", code)
         self.assertExpectedJournal(code)
 
-    def test_explicit_eviction_policy_overrides_tunable(self):
+    @parametrize("indexing", ("pointer", "block_ptr", "tensor_descriptor"))
+    def test_explicit_eviction_policy_overrides_tunable(self, indexing: str):
+        if indexing == "tensor_descriptor" and not supports_tensor_descriptor():
+            self.skipTest("Tensor descriptor support is required")
+
         @helion.kernel(
             config={
                 "block_size": 16,
                 "load_eviction_policies": ["first", "first"],
+                "indexing": indexing,
             }
         )
         def kernel_with_override(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
@@ -121,11 +129,16 @@ def kernel_with_override(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         self.assertIn("evict_last", code)
         self.assertExpectedJournal(code)
 
-    def test_multiple_loads_different_policies(self):
+    @parametrize("indexing", ("pointer", "block_ptr", "tensor_descriptor"))
+    def test_multiple_loads_different_policies(self, indexing: str):
+        if indexing == "tensor_descriptor" and not supports_tensor_descriptor():
+            self.skipTest("Tensor descriptor support is required")
+
         @helion.kernel(
             config={
                 "block_size": 16,
                 "load_eviction_policies": ["first", "last", ""],
+                "indexing": indexing,
             }
         )
         def kernel_multiple_loads(