Normalize device name and decorate cuda-only test cases (#819)

EikanWang · web-flow · commit 5354356ae32d · 2025-10-06T18:18:56.000-07:00
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -703,9 +703,16 @@ def normalize_tensor_descriptors(code: str) -> str:
     @staticmethod
     def normalize_device_name(code: str) -> str:
         """
-        convert device='cuda:0' etc to device=DEVICE
+        convert device='cuda:0' or device(type='cuda', index=0) etc to device=DEVICE
         """
-        return re.sub(r"device\s*=\s*['\"][^'\"]+['\"]", "device=DEVICE", code)
+        # Pass 1: keyword form with a quoted device string, e.g. device='cuda:0'
+        reg_pattern_for_device_str = r"device\s*=\s*['\"][^'\"]+['\"]"
+        normalized_code = re.sub(reg_pattern_for_device_str, "device=DEVICE", code)
+        # Pass 2: call/repr form, e.g. device(type='cuda', index=0) or device(type='cpu'); the whole call becomes device=DEVICE
+        reg_pattern_for_torch_device = (
+            r"device\s*\(type\s*=\s*['\"][^'\"]+['\"][^'\"\)]*\)"
+        )
+        return re.sub(reg_pattern_for_torch_device, "device=DEVICE", normalized_code)
 
     def lookup(self, test_id: str, value: str) -> tuple[str, str]:
         test_id = self.normalize_id(test_id)
diff --git a/test/test_constexpr.py b/test/test_constexpr.py
@@ -10,6 +10,7 @@
 from helion._testing import TestCase
 from helion._testing import code_and_output
 from helion._testing import skipIfRefEager
+from helion._testing import skipIfXPU
 import helion.language as hl
 
 
@@ -94,6 +95,7 @@ def fn(x: torch.Tensor, mode: str) -> torch.Tensor:
         self.assertExpectedJournal(code)
 
     @skipIfRefEager("Triton codegen does not work in ref eager mode")
+    @skipIfXPU("Failed on XPU due to a different configuration for min dot size")
     def test_block_size_constexpr_assignment_in_host_code(self) -> None:
         @helion.kernel(
             config=helion.Config(
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -14,6 +14,7 @@
 from helion._testing import import_path
 from helion._testing import skipIfRefEager
 from helion._testing import skipIfRocm
+from helion._testing import skipIfXPU
 
 torch.backends.cuda.matmul.fp32_precision = "tf32"
 torch.backends.cudnn.conv.fp32_precision = "tf32"
@@ -163,6 +164,7 @@ def test_template_via_closure0(self):
             )
         )
 
+    @skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/795")
     def test_template_via_closure1(self):
         bias = torch.randn([1, 1024], device=DEVICE, dtype=torch.float16)
         args = (
diff --git a/test/test_reductions.expected b/test/test_reductions.expected
@@ -392,7 +392,7 @@ def reduce_kernel(x: torch.Tensor, fn: Callable[[torch.Tensor], torch.Tensor], o
     # List: SequenceType([SymIntType(s77)]) SourceOrigin(location=<SourceLocation test_reductions.py:52>)
     # Name: SymIntType(s77) GetItemOrigin(value=SourceOrigin(location=<SourceLocation test_reductions.py:50>), key=0)
     # Name: LiteralType(torch.float32) ArgumentOrigin(name='out_dtype')
-    # Attribute: LiteralType(device(type='cuda', index=0)) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
+    # Attribute: LiteralType(device=DEVICE) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
     # Name: TensorType([x_size0, x_size1], torch.float32) ArgumentOrigin(name='x')
     # For: loop_type=GRID
     out = torch.empty([n], dtype=out_dtype, device=x.device)
diff --git a/test/test_signal_wait.py b/test/test_signal_wait.py
@@ -9,6 +9,7 @@
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
 from helion._testing import code_and_output
+from helion._testing import skipIfNotCUDA
 from helion._testing import skipIfRocm
 import helion.language as hl
 
@@ -82,7 +83,7 @@ def gmem_wait_multi_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         self.maxDiff = None
         self.assertExpectedJournal(code)
 
-    @skipIfRocm("only works on cuda")
+    @skipIfNotCUDA()
     def test_wait_multi_bar_cas(self):
         @helion.kernel
         def gmem_wait_multi_bar_kernel_cas(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -156,7 +157,7 @@ def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)
 
-    @skipIfRocm("only works on cuda")
+    @skipIfNotCUDA()
     def test_signal_multiple_cas(self):
         @helion.kernel
         def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
@@ -218,7 +219,7 @@ def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
         )
         self.assertExpectedJournal(code)
 
-    @skipIfRocm("only works on cuda")
+    @skipIfNotCUDA()
     def test_global_sync_cas(self):
         @helion.kernel
         def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
diff --git a/test/test_type_propagation.expected b/test/test_type_propagation.expected
@@ -499,14 +499,14 @@ def root_graph_0():
 
 --- assertExpectedJournal(TestTypePropagation.test_cuda_device_properties)
 def use_device_properties(x: torch.Tensor):
-    # Attribute: LiteralType(device(type='cuda', index=0)) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
+    # Attribute: LiteralType(device=DEVICE) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
     # Name: TensorType([x_size0], torch.float32) ArgumentOrigin(name='x')
     device = x.device
     # Call: ClassType({'multi_processor_count': SymIntType(u0)}) SourceOrigin(location=<SourceLocation test_type_propagation.py:104>)
     # Attribute: CallableType(get_device_properties) AttributeOrigin(value=AttributeOrigin(value=GlobalOrigin(name='torch'), key='cuda'), key='get_device_properties')
     # Attribute: PythonModuleType(torch.cuda) AttributeOrigin(value=GlobalOrigin(name='torch'), key='cuda')
     # Name: PythonModuleType(torch) GlobalOrigin(name='torch')
-    # Name: LiteralType(device(type='cuda', index=0)) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
+    # Name: LiteralType(device=DEVICE) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
     props = torch.cuda.get_device_properties(device)
     # Attribute: SymIntType(u0) AttributeOrigin(value=SourceOrigin(location=<SourceLocation test_type_propagation.py:104>), key='multi_processor_count')
     # Name: ClassType({'multi_processor_count': SymIntType(u0)}) SourceOrigin(location=<SourceLocation test_type_propagation.py:104>)
@@ -737,7 +737,7 @@ def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, tuple[Tensor, ...]]
     # Name: TensorType([512, 512], torch.float32) ArgumentOrigin(name='x')
     # Attribute: LiteralType(torch.float32) AttributeOrigin(value=ArgumentOrigin(name='y'), key='dtype')
     # Name: TensorType([512, 512], torch.float32) ArgumentOrigin(name='y')
-    # Attribute: LiteralType(device(type='cpu')) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
+    # Attribute: LiteralType(device=DEVICE) AttributeOrigin(value=ArgumentOrigin(name='x'), key='device')
     # Name: TensorType([512, 512], torch.float32) ArgumentOrigin(name='x')
     # For: loop_type=GRID
     out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@`
`14`	`14`	`from helion._testing import import_path`
`15`	`15`	`from helion._testing import skipIfRefEager`
`16`	`16`	`from helion._testing import skipIfRocm`
	`17`	`+from helion._testing import skipIfXPU`
`17`	`18`
`18`	`19`	`torch.backends.cuda.matmul.fp32_precision = "tf32"`
`19`	`20`	`torch.backends.cudnn.conv.fp32_precision = "tf32"`
`@@ -163,6 +164,7 @@ def test_template_via_closure0(self):`
`163`	`164`	`)`
`164`	`165`	`)`
`165`	`166`
	`167`	`+ @skipIfXPU("Failed on XPU - https://github.com/pytorch/helion/issues/795")`
`166`	`168`	`def test_template_via_closure1(self):`
`167`	`169`	`bias = torch.randn([1, 1024], device=DEVICE, dtype=torch.float16)`
`168`	`170`	`args = (`