From f4d7a3464b3f38fb26b6f8e3cacba6b349f79a23 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 31 Oct 2025 10:54:56 +0100 Subject: [PATCH] Move from split_avx / split_avx512 to lower_half/ upper_half Related to #1184 and #1162 --- include/xsimd/arch/xsimd_avx.hpp | 56 +++++++++++++-------------- include/xsimd/arch/xsimd_avx512f.hpp | 58 ++++++++++++++-------------- 2 files changed, 56 insertions(+), 58 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 9d93be071..b2584bd71 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -36,20 +36,29 @@ namespace xsimd namespace detail { - XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept + XSIMD_INLINE __m128i lower_half(__m256i self) noexcept { - low = _mm256_castsi256_si128(val); - high = _mm256_extractf128_si256(val, 1); + return _mm256_castsi256_si128(self); } - XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept + XSIMD_INLINE __m128 lower_half(__m256 self) noexcept { - low = _mm256_castps256_ps128(val); - high = _mm256_extractf128_ps(val, 1); + return _mm256_castps256_ps128(self); } - XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept + XSIMD_INLINE __m128d lower_half(__m256d self) noexcept { - low = _mm256_castpd256_pd128(val); - high = _mm256_extractf128_pd(val, 1); + return _mm256_castpd256_pd128(self); + } + XSIMD_INLINE __m128i upper_half(__m256i self) noexcept + { + return _mm256_extractf128_si256(self, 1); + } + XSIMD_INLINE __m128 upper_half(__m256 self) noexcept + { + return _mm256_extractf128_ps(self, 1); + } + XSIMD_INLINE __m128d upper_half(__m256d self) noexcept + { + return _mm256_extractf128_pd(self, 1); } XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept { @@ -66,8 +75,7 @@ namespace xsimd template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept { - __m128i self_low, self_high; - split_avx(self, self_low, self_high); + __m128i self_low = lower_half(self), self_high = upper_half(self); __m128i res_low = f(self_low); __m128i res_high = f(self_high); return merge_sse(res_low, res_high); @@ -75,9 +83,8 @@ namespace xsimd template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept { - __m128i self_low, self_high, other_low, other_high; - split_avx(self, self_low, self_high); - split_avx(other, other_low, other_high); + __m128i self_low = lower_half(self), self_high = upper_half(self), + other_low = lower_half(other), other_high = upper_half(other); __m128i res_low = f(self_low, other_low); __m128i res_high = f(self_high, other_high); return merge_sse(res_low, res_high); @@ -85,8 +92,7 @@ namespace xsimd template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept { - __m128i self_low, self_high; - split_avx(self, self_low, self_high); + __m128i self_low = lower_half(self), self_high = upper_half(self); __m128i res_low = f(self_low, other); __m128i res_high = f(self_high, other); return merge_sse(res_low, res_high); @@ -891,8 +897,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { - __m128i self_low, self_high; - detail::split_avx(self, self_low, self_high); + __m128i self_low = detail::lower_half(self), self_high = detail::upper_half(self); return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) @@ -1049,8 +1054,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { - typename batch::register_type low, high; - detail::split_avx(self, low, high); + typename batch::register_type low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_add(blow + bhigh); } @@ -1081,8 +1085,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { - typename batch::register_type low, high; - detail::split_avx(self, low, high); + typename batch::register_type low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_mul(blow * bhigh); } @@ -1132,14 +1135,11 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - __m128i cond_low, cond_hi; - detail::split_avx(cond, cond_low, cond_hi); + __m128i cond_low = detail::lower_half(cond), cond_hi = detail::upper_half(cond); - __m128i true_low, true_hi; - detail::split_avx(true_br, true_low, true_hi); + __m128i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); - __m128i false_low, false_hi; - detail::split_avx(false_br, false_low, false_hi); + __m128i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 4daf0a02f..1173df094 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -41,20 +41,29 @@ namespace xsimd namespace detail { - XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept + XSIMD_INLINE __m256 lower_half(__m512 self) noexcept { - low = _mm512_castps512_ps256(val); - high = _mm512_extractf32x8_ps(val, 1); + return _mm512_castps512_ps256(self); } - XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept + XSIMD_INLINE __m256d lower_half(__m512d self) noexcept { - low = _mm512_castpd512_pd256(val); - high = _mm512_extractf64x4_pd(val, 1); + return _mm512_castpd512_pd256(self); } - XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept + XSIMD_INLINE __m256i lower_half(__m512i self) noexcept { - low = _mm512_castsi512_si256(val); - high = _mm512_extracti64x4_epi64(val, 1); + return _mm512_castsi512_si256(self); + } + XSIMD_INLINE __m256 upper_half(__m512 self) noexcept + { + return _mm512_extractf32x8_ps(self, 1); + } + XSIMD_INLINE __m256d upper_half(__m512d self) noexcept + { + return _mm512_extractf64x4_pd(self, 1); + } + XSIMD_INLINE __m256i upper_half(__m512i self) noexcept + { + return _mm512_extracti64x4_epi64(self, 1); } XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept { @@ -71,8 +80,7 @@ namespace xsimd template __m512i fwd_to_avx(F f, __m512i self) { - __m256i self_low, self_high; - split_avx512(self, self_low, self_high); + __m256i self_low = lower_half(self), self_high = upper_half(self); __m256i res_low = f(self_low); __m256i res_high = f(self_high); return merge_avx(res_low, res_high); @@ -80,9 +88,8 @@ namespace xsimd template __m512i fwd_to_avx(F f, __m512i self, __m512i other) { - __m256i self_low, self_high, other_low, other_high; - split_avx512(self, self_low, self_high); - split_avx512(other, other_low, other_high); + __m256i self_low = lower_half(self), self_high = upper_half(self), + other_low = lower_half(other), other_high = upper_half(other); __m256i res_low = f(self_low, other_low); __m256i res_high = f(self_high, other_high); return merge_avx(res_low, res_high); @@ -90,8 +97,7 @@ namespace xsimd template __m512i fwd_to_avx(F f, __m512i self, int32_t other) { - __m256i self_low, self_high; - split_avx512(self, self_low, self_high); + __m256i self_low = lower_half(self), self_high = upper_half(self); __m256i res_low = f(self_low, other); __m256i res_high = f(self_high, other); return merge_avx(res_low, res_high); @@ -1624,8 +1630,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { - __m256i low, high; - detail::split_avx512(self, low, high); + __m256i low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); } @@ -1676,8 +1681,7 @@ namespace xsimd } else { - __m256i low, high; - detail::split_avx512(self, low, high); + __m256i low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {}); } @@ -1773,11 +1777,8 @@ namespace xsimd __m256i cond_low = batch::load_aligned(&buffer[0]); __m256i cond_hi = batch::load_aligned(&buffer[32]); - __m256i true_low, true_hi; - detail::split_avx512(true_br, true_low, true_hi); - - __m256i false_low, false_hi; - detail::split_avx512(false_br, false_low, false_hi); + __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); + __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); @@ -1788,11 +1789,8 @@ namespace xsimd __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); - __m256i true_low, true_hi; - detail::split_avx512(true_br, true_low, true_hi); - - __m256i false_low, false_hi; - detail::split_avx512(false_br, false_low, false_hi); + __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); + __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {});