From f4d7a3464b3f38fb26b6f8e3cacba6b349f79a23 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sergesanspaille@free.fr>
Date: Fri, 31 Oct 2025 10:54:56 +0100
Subject: [PATCH] Move from split_avx / split_avx512 to lower_half / upper_half

Related to #1184 and #1162
---
 include/xsimd/arch/xsimd_avx.hpp     | 56 +++++++++++++--------------
 include/xsimd/arch/xsimd_avx512f.hpp | 58 ++++++++++++++--------------
 2 files changed, 56 insertions(+), 58 deletions(-)
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 9d93be071..b2584bd71 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -36,20 +36,29 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+            XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
             {
-                low = _mm256_castsi256_si128(val);
-                high = _mm256_extractf128_si256(val, 1);
+                return _mm256_castsi256_si128(self); // low 128 bits of the integer register
             }
-            XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+            XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
             {
-                low = _mm256_castps256_ps128(val);
-                high = _mm256_extractf128_ps(val, 1);
+                return _mm256_castps256_ps128(self); // low 128 bits of the float register
             }
-            XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+            XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
             {
-                low = _mm256_castpd256_pd128(val);
-                high = _mm256_extractf128_pd(val, 1);
+                return _mm256_castpd256_pd128(self); // low 128 bits of the double register
+            }
+            XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
+            {
+                return _mm256_extractf128_si256(self, 1); // index 1 selects the high 128-bit lane
+            }
+            XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
+            {
+                return _mm256_extractf128_ps(self, 1); // index 1 selects the high 128-bit lane
+            }
+            XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
+            {
+                return _mm256_extractf128_pd(self, 1); // index 1 selects the high 128-bit lane
             }
             XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
             {
@@ -66,8 +75,7 @@ namespace xsimd
             template <class F>
             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
             {
-                __m128i self_low, self_high;
-                split_avx(self, self_low, self_high);
+                __m128i self_low = lower_half(self), self_high = upper_half(self);
                 __m128i res_low = f(self_low);
                 __m128i res_high = f(self_high);
                 return merge_sse(res_low, res_high);
@@ -75,9 +83,8 @@ namespace xsimd
             template <class F>
             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
             {
-                __m128i self_low, self_high, other_low, other_high;
-                split_avx(self, self_low, self_high);
-                split_avx(other, other_low, other_high);
+                __m128i self_low = lower_half(self), self_high = upper_half(self),
+                        other_low = lower_half(other), other_high = upper_half(other);
                 __m128i res_low = f(self_low, other_low);
                 __m128i res_high = f(self_high, other_high);
                 return merge_sse(res_low, res_high);
@@ -85,8 +92,7 @@ namespace xsimd
             template <class F>
             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
             {
-                __m128i self_low, self_high;
-                split_avx(self, self_low, self_high);
+                __m128i self_low = lower_half(self), self_high = upper_half(self);
                 __m128i res_low = f(self_low, other);
                 __m128i res_high = f(self_high, other);
                 return merge_sse(res_low, res_high);
@@ -891,8 +897,7 @@ namespace xsimd
         {
             XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
             {
-                __m128i self_low, self_high;
-                detail::split_avx(self, self_low, self_high);
+                __m128i self_low = detail::lower_half(self), self_high = detail::upper_half(self);
                 return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
             }
             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
@@ -1049,8 +1054,7 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
-            typename batch<T, sse4_2>::register_type low, high;
-            detail::split_avx(self, low, high);
+            typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
             batch<T, sse4_2> blow(low), bhigh(high);
             return reduce_add(blow + bhigh);
         }
@@ -1081,8 +1085,7 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
-            typename batch<T, sse4_2>::register_type low, high;
-            detail::split_avx(self, low, high);
+            typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
             batch<T, sse4_2> blow(low), bhigh(high);
             return reduce_mul(blow * bhigh);
         }
@@ -1132,14 +1135,11 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
         {
-            __m128i cond_low, cond_hi;
-            detail::split_avx(cond, cond_low, cond_hi);
+            __m128i cond_low = detail::lower_half(cond), cond_hi = detail::upper_half(cond);
 
-            __m128i true_low, true_hi;
-            detail::split_avx(true_br, true_low, true_hi);
+            __m128i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
 
-            __m128i false_low, false_hi;
-            detail::split_avx(false_br, false_low, false_hi);
+            __m128i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
 
             __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
             __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 4daf0a02f..1173df094 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -41,20 +41,29 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
+            XSIMD_INLINE __m256 lower_half(__m512 self) noexcept
             {
-                low = _mm512_castps512_ps256(val);
-                high = _mm512_extractf32x8_ps(val, 1);
+                return _mm512_castps512_ps256(self); // low 256 bits of the float register
             }
-            XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
+            XSIMD_INLINE __m256d lower_half(__m512d self) noexcept
             {
-                low = _mm512_castpd512_pd256(val);
-                high = _mm512_extractf64x4_pd(val, 1);
+                return _mm512_castpd512_pd256(self); // low 256 bits of the double register
             }
-            XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
+            XSIMD_INLINE __m256i lower_half(__m512i self) noexcept
             {
-                low = _mm512_castsi512_si256(val);
-                high = _mm512_extracti64x4_epi64(val, 1);
+                return _mm512_castsi512_si256(self); // low 256 bits of the integer register
+            }
+            XSIMD_INLINE __m256 upper_half(__m512 self) noexcept // returns the high 256 bits of self
+            {
+                return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(self), 1)); // AVX512F-only: _mm512_extractf32x8_ps would require AVX512DQ; the casts are bit reinterpretations
+            }
+            XSIMD_INLINE __m256d upper_half(__m512d self) noexcept
+            {
+                return _mm512_extractf64x4_pd(self, 1); // index 1 selects the high 256-bit half
+            }
+            XSIMD_INLINE __m256i upper_half(__m512i self) noexcept
+            {
+                return _mm512_extracti64x4_epi64(self, 1); // index 1 selects the high 256-bit half
             }
             XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
             {
@@ -71,8 +80,7 @@ namespace xsimd
             template <class F>
             __m512i fwd_to_avx(F f, __m512i self)
             {
-                __m256i self_low, self_high;
-                split_avx512(self, self_low, self_high);
+                __m256i self_low = lower_half(self), self_high = upper_half(self);
                 __m256i res_low = f(self_low);
                 __m256i res_high = f(self_high);
                 return merge_avx(res_low, res_high);
@@ -80,9 +88,8 @@ namespace xsimd
             template <class F>
             __m512i fwd_to_avx(F f, __m512i self, __m512i other)
             {
-                __m256i self_low, self_high, other_low, other_high;
-                split_avx512(self, self_low, self_high);
-                split_avx512(other, other_low, other_high);
+                __m256i self_low = lower_half(self), self_high = upper_half(self),
+                        other_low = lower_half(other), other_high = upper_half(other);
                 __m256i res_low = f(self_low, other_low);
                 __m256i res_high = f(self_high, other_high);
                 return merge_avx(res_low, res_high);
@@ -90,8 +97,7 @@ namespace xsimd
             template <class F>
             __m512i fwd_to_avx(F f, __m512i self, int32_t other)
             {
-                __m256i self_low, self_high;
-                split_avx512(self, self_low, self_high);
+                __m256i self_low = lower_half(self), self_high = upper_half(self);
                 __m256i res_low = f(self_low, other);
                 __m256i res_high = f(self_high, other);
                 return merge_avx(res_low, res_high);
@@ -1624,8 +1630,7 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
         {
-            __m256i low, high;
-            detail::split_avx512(self, low, high);
+            __m256i low = detail::lower_half(self), high = detail::upper_half(self);
             batch<T, avx2> blow(low), bhigh(high);
             return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
         }
@@ -1676,8 +1681,7 @@ namespace xsimd
             }
             else
             {
-                __m256i low, high;
-                detail::split_avx512(self, low, high);
+                __m256i low = detail::lower_half(self), high = detail::upper_half(self);
                 batch<T, avx2> blow(low), bhigh(high);
                 return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
             }
@@ -1773,11 +1777,8 @@ namespace xsimd
                 __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
                 __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
 
-                __m256i true_low, true_hi;
-                detail::split_avx512(true_br, true_low, true_hi);
-
-                __m256i false_low, false_hi;
-                detail::split_avx512(false_br, false_low, false_hi);
+                __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
+                __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
 
                 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
                 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
@@ -1788,11 +1789,8 @@ namespace xsimd
                 __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
                 __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
 
-                __m256i true_low, true_hi;
-                detail::split_avx512(true_br, true_low, true_hi);
-
-                __m256i false_low, false_hi;
-                detail::split_avx512(false_br, false_low, false_hi);
+                __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
+                __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
 
                 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
                 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});