From 351e4705dc7897b314105648ee8ff7e86f19ecf3 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:17:29 +0800 Subject: [PATCH 1/7] Update ReduceSumProdKernels.cpp --- .../native/xpu/sycl/ReduceSumProdKernels.cpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 7bdc3a188a..8e18d6803d 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -48,23 +48,24 @@ struct SumFunctor { } }; -template <> -struct SumFunctor> { - using scalar_t = c10::complex; - using acc_t = at::opmath_type; - inline acc_t operator()(acc_t a, acc_t b) const { - return a + b; - } -}; - template < typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t> struct sum_functor { void operator()(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper(SumFunctor())); + gpu_reduce_kernel( + iter, func_wrapper(SumFunctor())); + } +}; + +template <> +struct sum_functor> { + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, func_wrapper(SumFunctor())); } }; From e01e0b382bff3866a7a0cdefc1a49477f01e7931 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:20:54 +0800 Subject: [PATCH 2/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 0c713ee51d..ff49718c51 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1151,11 +1151,21 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; - static constexpr 
bool can_accumulate_in_output = - std::is_convertible::value; - bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); - std::unique_ptr owned_buf_ptr; +static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); // The acc_buf_ptr is a shared pointer. It is create at the first entrance // and resued by all recursive function calls. From c8a1dddfc092f26a9be207fbb68e667532fd9e96 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:22:27 +0800 Subject: [PATCH 3/7] format --- src/ATen/native/xpu/sycl/Reduce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index ff49718c51..8913322591 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1152,7 +1152,7 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; -static constexpr bool is_inp_out_type_half_or_chalf = + static constexpr bool is_inp_out_type_half_or_chalf = (std::is_same_v && std::is_same_v) || (std::is_same_v, scalar_t> && From b2ccf33da41b873ddbaeb6bba8db4b9af0cd4953 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 7 Nov 2025 15:12:57 +0800 Subject: [PATCH 4/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ATen/native/xpu/sycl/Reduce.h 
b/src/ATen/native/xpu/sycl/Reduce.h index 8913322591..7011aac8a1 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1167,6 +1167,9 @@ inline void gpu_reduce_kernel( std::is_convertible_v && !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance // and resued by all recursive function calls. if (acc_buf_ptr == nullptr) { From 5059aac6960b9a5c7aec4ee0d637ea6e3f4bce37 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Mon, 10 Nov 2025 15:42:18 +0800 Subject: [PATCH 5/7] Update Reduce.h --- src/ATen/native/xpu/sycl/Reduce.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 7011aac8a1..4639ea9e89 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -1152,6 +1152,9 @@ inline void gpu_reduce_kernel( using traits = function_traits; using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as its range is very small. // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we // set can_accumulate_in_output to False.
static constexpr bool is_inp_out_type_half_or_chalf = (std::is_same_v && std::is_same_v) || From f09cb9e6226c6157067408ddc1443841a8395b27 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Mon, 10 Nov 2025 15:58:14 +0800 Subject: [PATCH 6/7] Update ReduceSumProdKernels.cpp --- src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 8e18d6803d..dc76d53430 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -54,7 +54,7 @@ template < typename out_t = scalar_t> struct sum_functor { void operator()(TensorIterator& iter) { - gpu_reduce_kernel( + gpu_reduce_kernel( iter, func_wrapper(SumFunctor())); } }; From 0be939af2c587b27b086f3ebab9265c21a1ea8b1 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:57:03 +0800 Subject: [PATCH 7/7] Update ReduceSumProdKernels.cpp --- src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index dc76d53430..7b62371c44 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -55,7 +55,7 @@ template < struct sum_functor { void operator()(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper(SumFunctor())); + iter, func_wrapper(SumFunctor())); } };