From df7ebb6ba9bd35ce890ede0ff7e0b9d2c555e124 Mon Sep 17 00:00:00 2001
From: "Deng, Daisy"
Date: Thu, 16 Oct 2025 08:41:55 +0000
Subject: [PATCH 1/2] port CUDA-specific test case with hook

---
 test/xpu/test_nn_xpu.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/test/xpu/test_nn_xpu.py b/test/xpu/test_nn_xpu.py
index 4ff4bcef2..abdecc20e 100644
--- a/test/xpu/test_nn_xpu.py
+++ b/test/xpu/test_nn_xpu.py
@@ -16,6 +16,7 @@
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
+    largeTensorTest,
 )
 from torch.testing._internal.common_dtype import get_all_math_dtypes, integral_types
 from torch.testing._internal.common_utils import (
@@ -3786,6 +3787,39 @@ def test_cross_entropy_loss_2d_out_of_bounds_class_index(self):
     )
 
 
+@dtypes(torch.float, torch.half)
+@largeTensorTest("20GB")
+@largeTensorTest("64GB", "cpu")
+def _test_warp_softmax_64bit_indexing(self, device, dtype):
+    def run_test(*shape):
+        x = torch.randn(shape, device="xpu", dtype=torch.float16, requires_grad=True)
+        y = F.log_softmax(x, dim=-1, dtype=dtype)
+        y.backward(y)
+        with torch.no_grad():
+            xx = x.cpu().requires_grad_()
+        yy = F.log_softmax(xx.float(), dim=-1).to(dtype)
+        yy.backward(yy)
+        # workaround to reduce memory usage vs. self.assertEqual, see #84944
+        rtol, atol = torch.testing._comparison.get_tolerances(
+            dtype, rtol=None, atol=None
+        )
+        self.assertTrue(torch.allclose(y.cpu(), yy, rtol=rtol, atol=atol))
+        # x is half
+        rtol, _ = torch.testing._comparison.get_tolerances(
+            torch.half, rtol=None, atol=None
+        )
+        self.assertTrue(torch.allclose(x.grad.cpu(), xx.grad, rtol=rtol, atol=1e-3))
+
+    run_test(
+        1100000000, 2
+    )  # Illegal memory access https://github.com/pytorch/pytorch/issues/52715
+    run_test(
+        2200000000, 1
+    )  # invalid configuration argument https://github.com/pytorch/pytorch/issues/52716
+
+
+TestNNDeviceType.test_warp_softmax_64bit_indexing = _test_warp_softmax_64bit_indexing
+
 TestNNDeviceType.test_cross_entropy_loss_2d_out_of_bounds_class_index = (
     _test_cross_entropy_loss_2d_out_of_bounds_class_index
 )

From 6ab64469aeae61de1fbe84ef8e901065af6fcbb3 Mon Sep 17 00:00:00 2001
From: "Deng, Daisy"
Date: Fri, 17 Oct 2025 03:20:02 +0000
Subject: [PATCH 2/2] replace hardcoded "xpu" with device variable

---
 test/xpu/test_nn_xpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/xpu/test_nn_xpu.py b/test/xpu/test_nn_xpu.py
index abdecc20e..72d2647af 100644
--- a/test/xpu/test_nn_xpu.py
+++ b/test/xpu/test_nn_xpu.py
@@ -3792,7 +3792,7 @@ def test_cross_entropy_loss_2d_out_of_bounds_class_index(self):
 @largeTensorTest("64GB", "cpu")
 def _test_warp_softmax_64bit_indexing(self, device, dtype):
     def run_test(*shape):
-        x = torch.randn(shape, device="xpu", dtype=torch.float16, requires_grad=True)
+        x = torch.randn(shape, device=device, dtype=torch.float16, requires_grad=True)
         y = F.log_softmax(x, dim=-1, dtype=dtype)
         y.backward(y)
         with torch.no_grad():
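
Note on the hook mechanism used above: rather than copying the whole CUDA test class, the port defines the test body as a free function and assigns it onto TestNNDeviceType before instantiate_device_type_tests() generates the per-device classes, so the generated test receives the concrete `device` string, which is why PATCH 2/2 drops the hardcoded "xpu". The following is a minimal self-contained sketch of that pattern, not the actual test_nn_xpu.py wiring; the class and test names are illustrative, and the only_for/allow_xpu arguments assume a recent PyTorch built with XPU support.

    import torch
    from torch.testing._internal.common_device_type import (
        dtypes,
        instantiate_device_type_tests,
    )
    from torch.testing._internal.common_utils import TestCase, run_tests

    class ExampleDeviceType(TestCase):
        # generic device-type test class; per-device subclasses are generated below
        pass

    @dtypes(torch.float, torch.half)
    def _test_ones_sum(self, device, dtype):
        # `device` arrives as the concrete device string ("xpu", "cpu", ...)
        x = torch.ones(8, device=device, dtype=dtype)
        self.assertEqual(x.sum().item(), 8.0)

    # the hook: attach the body before instantiation so the generator picks it up
    ExampleDeviceType.test_ones_sum = _test_ones_sum

    # generates e.g. ExampleDeviceTypeXPU with device/dtype variants
    instantiate_device_type_tests(
        ExampleDeviceType, globals(), only_for="xpu", allow_xpu=True
    )

    if __name__ == "__main__":
        run_tests()

The same parametrization motivates the follow-up fix: once instantiate_device_type_tests supplies `device`, hardcoding "xpu" inside the body defeats the hook and would break if the function were reused for another device type, so the variable is the correct spelling.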