From 351e4705dc7897b314105648ee8ff7e86f19ecf3 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:17:29 +0800 Subject: [PATCH 1/7] Update ReduceSumProdKernels.cpp --- .../native/xpu/sycl/ReduceSumProdKernels.cpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 7bdc3a188a..8e18d6803d 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -48,23 +48,24 @@ struct SumFunctor { } }; -template <> -struct SumFunctor> { - using scalar_t = c10::complex; - using acc_t = at::opmath_type; - inline acc_t operator()(acc_t a, acc_t b) const { - return a + b; - } -}; - template < typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t> struct sum_functor { void operator()(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper(SumFunctor())); + gpu_reduce_kernel( + iter, func_wrapper(SumFunctor())); + } +}; + +template <> +struct sum_functor> { + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, func_wrapper(SumFunctor())); } }; From e01e0b382bff3866a7a0cdefc1a49477f01e7931 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:20:54 +0800 Subject: [PATCH 2/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 0c713ee51d..ff49718c51 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1151,11 +1151,21 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; - static constexpr 
bool can_accumulate_in_output = - std::is_convertible::value; - bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); - std::unique_ptr owned_buf_ptr; +static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); // The acc_buf_ptr is a shared pointer. It is create at the first entrance // and resued by all recursive function calls. From c8a1dddfc092f26a9be207fbb68e667532fd9e96 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:22:27 +0800 Subject: [PATCH 3/7] format --- src/ATen/native/xpu/sycl/Reduce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index ff49718c51..8913322591 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1152,7 +1152,7 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; -static constexpr bool is_inp_out_type_half_or_chalf = + static constexpr bool is_inp_out_type_half_or_chalf = (std::is_same_v && std::is_same_v) || (std::is_same_v, scalar_t> && From b2ccf33da41b873ddbaeb6bba8db4b9af0cd4953 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 15:12:57 +0800 Subject: [PATCH 4/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ATen/native/xpu/sycl/Reduce.h 
b/src/ATen/native/xpu/sycl/Reduce.h index 8913322591..7011aac8a1 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1167,6 +1167,9 @@ inline void gpu_reduce_kernel( std::is_convertible_v && !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance // and resued by all recursive function calls. if (acc_buf_ptr == nullptr) { From 5059aac6960b9a5c7aec4ee0d637ea6e3f4bce37 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Mon, 10 Nov 2025 15:42:18 +0800 Subject: [PATCH 5/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 7011aac8a1..4639ea9e89 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1152,6 +1152,9 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as its range is very small. // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we // set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_half_or_chalf = (std::is_same_v && std::is_same_v) || From f09cb9e6226c6157067408ddc1443841a8395b27 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Mon, 10 Nov 2025 15:58:14 +0800 Subject: [PATCH 6/7] Update ReduceSumProdKernels.cpp --- src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 8e18d6803d..dc76d53430 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -54,7 +54,7 @@ template < typename out_t = scalar_t> struct sum_functor { void operator()(TensorIterator& iter) { - gpu_reduce_kernel( + gpu_reduce_kernel( iter, func_wrapper(SumFunctor())); } }; From 0be939af2c587b27b086f3ebab9265c21a1ea8b1 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:57:03 +0800 Subject: [PATCH 7/7] Update ReduceSumProdKernels.cpp --- src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index dc76d53430..7b62371c44 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -55,7 +55,7 @@ template < struct sum_functor { void operator()(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper(SumFunctor())); + iter, func_wrapper(SumFunctor())); } };