@@ -36,20 +36,29 @@ namespace xsimd
3636
3737 namespace detail
3838 {
39- XSIMD_INLINE void split_avx (__m256i val, __m128i& low, __m128i& high ) noexcept
39+ XSIMD_INLINE __m128i lower_half (__m256i self ) noexcept
4040 {
41- low = _mm256_castsi256_si128 (val);
42- high = _mm256_extractf128_si256 (val, 1 );
41+ return _mm256_castsi256_si128 (self);
4342 }
44- XSIMD_INLINE void split_avx (__m256 val, __m128& low, __m128& high ) noexcept
43+ XSIMD_INLINE __m128 lower_half (__m256 self ) noexcept
4544 {
46- low = _mm256_castps256_ps128 (val);
47- high = _mm256_extractf128_ps (val, 1 );
45+ return _mm256_castps256_ps128 (self);
4846 }
49- XSIMD_INLINE void split_avx (__m256d val, __m128d& low, __m128d& high ) noexcept
47+ XSIMD_INLINE __m128d lower_half (__m256d self ) noexcept
5048 {
51- low = _mm256_castpd256_pd128 (val);
52- high = _mm256_extractf128_pd (val, 1 );
49+ return _mm256_castpd256_pd128 (self);
50+ }
51+ XSIMD_INLINE __m128i upper_half (__m256i self) noexcept
52+ {
53+ return _mm256_extractf128_si256 (self, 1 );
54+ }
55+ XSIMD_INLINE __m128 upper_half (__m256 self) noexcept
56+ {
57+ return _mm256_extractf128_ps (self, 1 );
58+ }
59+ XSIMD_INLINE __m128d upper_half (__m256d self) noexcept
60+ {
61+ return _mm256_extractf128_pd (self, 1 );
5362 }
5463 XSIMD_INLINE __m256i merge_sse (__m128i low, __m128i high) noexcept
5564 {
@@ -66,27 +75,24 @@ namespace xsimd
6675 template <class F >
6776 XSIMD_INLINE __m256i fwd_to_sse (F f, __m256i self) noexcept
6877 {
69- __m128i self_low, self_high;
70- split_avx (self, self_low, self_high);
78+ __m128i self_low = lower_half (self), self_high = upper_half (self);
7179 __m128i res_low = f (self_low);
7280 __m128i res_high = f (self_high);
7381 return merge_sse (res_low, res_high);
7482 }
7583 template <class F >
7684 XSIMD_INLINE __m256i fwd_to_sse (F f, __m256i self, __m256i other) noexcept
7785 {
78- __m128i self_low, self_high, other_low, other_high;
79- split_avx (self, self_low, self_high);
80- split_avx (other, other_low, other_high);
86+ __m128i self_low = lower_half (self), self_high = upper_half (self),
87+ other_low = lower_half (other), other_high = upper_half (other);
8188 __m128i res_low = f (self_low, other_low);
8289 __m128i res_high = f (self_high, other_high);
8390 return merge_sse (res_low, res_high);
8491 }
8592 template <class F >
8693 XSIMD_INLINE __m256i fwd_to_sse (F f, __m256i self, int32_t other) noexcept
8794 {
88- __m128i self_low, self_high;
89- split_avx (self, self_low, self_high);
95+ __m128i self_low = lower_half (self), self_high = upper_half (self);
9096 __m128i res_low = f (self_low, other);
9197 __m128i res_high = f (self_high, other);
9298 return merge_sse (res_low, res_high);
@@ -891,8 +897,7 @@ namespace xsimd
891897 {
892898 XSIMD_IF_CONSTEXPR (sizeof (T) == 1 || sizeof (T) == 2 )
893899 {
894- __m128i self_low, self_high;
895- detail::split_avx (self, self_low, self_high);
900+ __m128i self_low = detail::lower_half (self), self_high = detail::upper_half (self);
896901 return mask (batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask (batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof (T))));
897902 }
898903 else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
@@ -1049,8 +1054,7 @@ namespace xsimd
10491054 template <class A , class T , class = typename std::enable_if<std::is_scalar<T>::value>::type>
10501055 XSIMD_INLINE T reduce_add (batch<T, A> const & self, requires_arch<avx>) noexcept
10511056 {
1052- typename batch<T, sse4_2>::register_type low, high;
1053- detail::split_avx (self, low, high);
1057+ typename batch<T, sse4_2>::register_type low = detail::lower_half (self), high = detail::upper_half (self);
10541058 batch<T, sse4_2> blow (low), bhigh (high);
10551059 return reduce_add (blow + bhigh);
10561060 }
@@ -1081,8 +1085,7 @@ namespace xsimd
10811085 template <class A , class T , class = typename std::enable_if<std::is_scalar<T>::value>::type>
10821086 XSIMD_INLINE T reduce_mul (batch<T, A> const & self, requires_arch<avx>) noexcept
10831087 {
1084- typename batch<T, sse4_2>::register_type low, high;
1085- detail::split_avx (self, low, high);
1088+ typename batch<T, sse4_2>::register_type low = detail::lower_half (self), high = detail::upper_half (self);
10861089 batch<T, sse4_2> blow (low), bhigh (high);
10871090 return reduce_mul (blow * bhigh);
10881091 }
@@ -1132,14 +1135,11 @@ namespace xsimd
11321135 template <class A , class T , class = typename std::enable_if<std::is_integral<T>::value>::type>
11331136 XSIMD_INLINE batch<T, A> select (batch_bool<T, A> const & cond, batch<T, A> const & true_br, batch<T, A> const & false_br, requires_arch<avx>) noexcept
11341137 {
1135- __m128i cond_low, cond_hi;
1136- detail::split_avx (cond, cond_low, cond_hi);
1138+ __m128i cond_low = detail::lower_half (cond), cond_hi = detail::upper_half (cond);
11371139
1138- __m128i true_low, true_hi;
1139- detail::split_avx (true_br, true_low, true_hi);
1140+ __m128i true_low = detail::lower_half (true_br), true_hi = detail::upper_half (true_br);
11401141
1141- __m128i false_low, false_hi;
1142- detail::split_avx (false_br, false_low, false_hi);
1142+ __m128i false_low = detail::lower_half (false_br), false_hi = detail::upper_half (false_br);
11431143
11441144 __m128i res_low = select (batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
11451145 __m128i res_hi = select (batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
0 commit comments