Skip to content

Commit fce8da2

Browse files
Move from split_avx / split_avx512 to lower_half / upper_half
Related to #1184 and #1162
1 parent fc0ebb6 commit fce8da2

File tree

2 files changed

+56
-58
lines changed

2 files changed

+56
-58
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,29 @@ namespace xsimd
3636

3737
namespace detail
3838
{
39-
XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
39+
XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
4040
{
41-
low = _mm256_castsi256_si128(val);
42-
high = _mm256_extractf128_si256(val, 1);
41+
return _mm256_castsi256_si128(self);
4342
}
44-
XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
43+
XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
4544
{
46-
low = _mm256_castps256_ps128(val);
47-
high = _mm256_extractf128_ps(val, 1);
45+
return _mm256_castps256_ps128(self);
4846
}
49-
XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
47+
XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
5048
{
51-
low = _mm256_castpd256_pd128(val);
52-
high = _mm256_extractf128_pd(val, 1);
49+
return _mm256_castpd256_pd128(self);
50+
}
51+
// Returns the 128-bit integer lane at index 1 (the high half) of a 256-bit
// integer register.
XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
{
    const __m128i high_lane = _mm256_extractf128_si256(self, 1);
    return high_lane;
}
55+
// Returns the 128-bit single-precision lane at index 1 (the high half) of a
// 256-bit float register.
XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
{
    const __m128 high_lane = _mm256_extractf128_ps(self, 1);
    return high_lane;
}
59+
// Returns the 128-bit double-precision lane at index 1 (the high half) of a
// 256-bit double register.
XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
{
    const __m128d high_lane = _mm256_extractf128_pd(self, 1);
    return high_lane;
}
5463
XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
5564
{
@@ -66,27 +75,24 @@ namespace xsimd
6675
template <class F>
6776
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
6877
{
69-
__m128i self_low, self_high;
70-
split_avx(self, self_low, self_high);
78+
__m128i self_low = lower_half(self), self_high = upper_half(self);
7179
__m128i res_low = f(self_low);
7280
__m128i res_high = f(self_high);
7381
return merge_sse(res_low, res_high);
7482
}
7583
template <class F>
7684
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
7785
{
78-
__m128i self_low, self_high, other_low, other_high;
79-
split_avx(self, self_low, self_high);
80-
split_avx(other, other_low, other_high);
86+
__m128i self_low = lower_half(self), self_high = upper_half(self),
87+
other_low = lower_half(other), other_high = upper_half(other);
8188
__m128i res_low = f(self_low, other_low);
8289
__m128i res_high = f(self_high, other_high);
8390
return merge_sse(res_low, res_high);
8491
}
8592
template <class F>
8693
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
8794
{
88-
__m128i self_low, self_high;
89-
split_avx(self, self_low, self_high);
95+
__m128i self_low = lower_half(self), self_high = upper_half(self);
9096
__m128i res_low = f(self_low, other);
9197
__m128i res_high = f(self_high, other);
9298
return merge_sse(res_low, res_high);
@@ -891,8 +897,7 @@ namespace xsimd
891897
{
892898
XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
893899
{
894-
__m128i self_low, self_high;
895-
detail::split_avx(self, self_low, self_high);
900+
__m128i self_low = detail::lower_half(self), self_high = detail::upper_half(self);
896901
return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
897902
}
898903
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
@@ -1049,8 +1054,7 @@ namespace xsimd
10491054
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
10501055
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
10511056
{
1052-
typename batch<T, sse4_2>::register_type low, high;
1053-
detail::split_avx(self, low, high);
1057+
typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
10541058
batch<T, sse4_2> blow(low), bhigh(high);
10551059
return reduce_add(blow + bhigh);
10561060
}
@@ -1081,8 +1085,7 @@ namespace xsimd
10811085
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
10821086
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
10831087
{
1084-
typename batch<T, sse4_2>::register_type low, high;
1085-
detail::split_avx(self, low, high);
1088+
typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
10861089
batch<T, sse4_2> blow(low), bhigh(high);
10871090
return reduce_mul(blow * bhigh);
10881091
}
@@ -1132,14 +1135,11 @@ namespace xsimd
11321135
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
11331136
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
11341137
{
1135-
__m128i cond_low, cond_hi;
1136-
detail::split_avx(cond, cond_low, cond_hi);
1138+
__m128i cond_low = detail::lower_half(cond), cond_hi = detail::upper_half(cond);
11371139

1138-
__m128i true_low, true_hi;
1139-
detail::split_avx(true_br, true_low, true_hi);
1140+
__m128i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
11401141

1141-
__m128i false_low, false_hi;
1142-
detail::split_avx(false_br, false_low, false_hi);
1142+
__m128i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
11431143

11441144
__m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
11451145
__m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,29 @@ namespace xsimd
4141

4242
namespace detail
4343
{
44-
XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
44+
XSIMD_INLINE __m256 lower_half(__m512 self) noexcept
4545
{
46-
low = _mm512_castps512_ps256(val);
47-
high = _mm512_extractf32x8_ps(val, 1);
46+
return _mm512_castps512_ps256(self);
4847
}
49-
XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
48+
XSIMD_INLINE __m256d lower_half(__m512d self) noexcept
5049
{
51-
low = _mm512_castpd512_pd256(val);
52-
high = _mm512_extractf64x4_pd(val, 1);
50+
return _mm512_castpd512_pd256(self);
5351
}
54-
XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
52+
XSIMD_INLINE __m256i lower_half(__m512i self) noexcept
5553
{
56-
low = _mm512_castsi512_si256(val);
57-
high = _mm512_extracti64x4_epi64(val, 1);
54+
return _mm512_castsi512_si256(self);
55+
}
56+
// Returns the upper 256 bits (lane index 1) of a 512-bit single-precision
// register.
//
// _mm512_extractf32x8_ps is an AVX512DQ intrinsic and therefore must not be
// used in the AVX512F-only kernel. The register is instead reinterpreted as
// packed doubles (the casts are pure bit reinterpretations), the upper half is
// extracted with the AVX512F intrinsic _mm512_extractf64x4_pd, and the result
// is reinterpreted back to packed floats. The returned bits are identical.
XSIMD_INLINE __m256 upper_half(__m512 self) noexcept
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(self), 1));
}
60+
// Returns the 256-bit double-precision lane at index 1 (the high half) of a
// 512-bit double register.
XSIMD_INLINE __m256d upper_half(__m512d self) noexcept
{
    const __m256d high_lane = _mm512_extractf64x4_pd(self, 1);
    return high_lane;
}
64+
// Returns the 256-bit integer lane at index 1 (the high half) of a 512-bit
// integer register.
XSIMD_INLINE __m256i upper_half(__m512i self) noexcept
{
    const __m256i high_lane = _mm512_extracti64x4_epi64(self, 1);
    return high_lane;
}
5968
XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
6069
{
@@ -71,27 +80,24 @@ namespace xsimd
7180
template <class F>
7281
__m512i fwd_to_avx(F f, __m512i self)
7382
{
74-
__m256i self_low, self_high;
75-
split_avx512(self, self_low, self_high);
83+
__m256i self_low = lower_half(self), self_high = upper_half(self);
7684
__m256i res_low = f(self_low);
7785
__m256i res_high = f(self_high);
7886
return merge_avx(res_low, res_high);
7987
}
8088
template <class F>
8189
__m512i fwd_to_avx(F f, __m512i self, __m512i other)
8290
{
83-
__m256i self_low, self_high, other_low, other_high;
84-
split_avx512(self, self_low, self_high);
85-
split_avx512(other, other_low, other_high);
91+
__m256i self_low = lower_half(self), self_high = upper_half(self),
92+
other_low = lower_half(other), other_high = upper_half(other);
8693
__m256i res_low = f(self_low, other_low);
8794
__m256i res_high = f(self_high, other_high);
8895
return merge_avx(res_low, res_high);
8996
}
9097
template <class F>
9198
__m512i fwd_to_avx(F f, __m512i self, int32_t other)
9299
{
93-
__m256i self_low, self_high;
94-
split_avx512(self, self_low, self_high);
100+
__m256i self_low = lower_half(self), self_high = upper_half(self);
95101
__m256i res_low = f(self_low, other);
96102
__m256i res_high = f(self_high, other);
97103
return merge_avx(res_low, res_high);
@@ -1624,8 +1630,7 @@ namespace xsimd
16241630
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
16251631
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
16261632
{
1627-
__m256i low, high;
1628-
detail::split_avx512(self, low, high);
1633+
__m256i low = detail::lower_half(self), high = detail::upper_half(self);
16291634
batch<T, avx2> blow(low), bhigh(high);
16301635
return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
16311636
}
@@ -1676,8 +1681,7 @@ namespace xsimd
16761681
}
16771682
else
16781683
{
1679-
__m256i low, high;
1680-
detail::split_avx512(self, low, high);
1684+
__m256i low = detail::lower_half(self), high = detail::upper_half(self);
16811685
batch<T, avx2> blow(low), bhigh(high);
16821686
return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
16831687
}
@@ -1773,11 +1777,8 @@ namespace xsimd
17731777
__m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
17741778
__m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
17751779

1776-
__m256i true_low, true_hi;
1777-
detail::split_avx512(true_br, true_low, true_hi);
1778-
1779-
__m256i false_low, false_hi;
1780-
detail::split_avx512(false_br, false_low, false_hi);
1780+
__m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
1781+
__m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
17811782

17821783
__m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
17831784
__m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
@@ -1788,11 +1789,8 @@ namespace xsimd
17881789
__m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
17891790
__m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
17901791

1791-
__m256i true_low, true_hi;
1792-
detail::split_avx512(true_br, true_low, true_hi);
1793-
1794-
__m256i false_low, false_hi;
1795-
detail::split_avx512(false_br, false_low, false_hi);
1792+
__m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
1793+
__m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);
17961794

17971795
__m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
17981796
__m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});

0 commit comments

Comments
 (0)