Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 28 additions & 28 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,29 @@ namespace xsimd

namespace detail
{
XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
{
low = _mm256_castsi256_si128(val);
high = _mm256_extractf128_si256(val, 1);
return _mm256_castsi256_si128(self);
}
XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
{
low = _mm256_castps256_ps128(val);
high = _mm256_extractf128_ps(val, 1);
return _mm256_castps256_ps128(self);
}
XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
{
low = _mm256_castpd256_pd128(val);
high = _mm256_extractf128_pd(val, 1);
return _mm256_castpd256_pd128(self);
}
// Upper 128 bits of a 256-bit integer register (128-bit lane index 1).
XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
{
    __m128i const high = _mm256_extractf128_si256(self, 1);
    return high;
}
// Upper 128 bits of a 256-bit single-precision register (128-bit lane index 1).
XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
{
    __m128 const high = _mm256_extractf128_ps(self, 1);
    return high;
}
// Upper 128 bits of a 256-bit double-precision register (128-bit lane index 1).
XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
{
    __m128d const high = _mm256_extractf128_pd(self, 1);
    return high;
}
XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
{
Expand All @@ -66,27 +75,24 @@ namespace xsimd
// Applies the unary functor `f` to each 128-bit half of `self` separately and
// recombines the two results into one 256-bit register with merge_sse.
// `f` must be callable with a single __m128i and return a value convertible to __m128i.
template <class F>
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
{
__m128i self_low, self_high;
split_avx(self, self_low, self_high);
__m128i self_low = lower_half(self), self_high = upper_half(self);
__m128i res_low = f(self_low);
__m128i res_high = f(self_high);
return merge_sse(res_low, res_high);
}
// Applies the binary functor `f` half by half: the low halves of `self` and `other`
// are combined together, likewise the high halves, and the two 128-bit results are
// recombined into one 256-bit register with merge_sse.
// `f` must be callable with two __m128i arguments and return a value convertible to __m128i.
template <class F>
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
{
__m128i self_low, self_high, other_low, other_high;
split_avx(self, self_low, self_high);
split_avx(other, other_low, other_high);
__m128i self_low = lower_half(self), self_high = upper_half(self),
other_low = lower_half(other), other_high = upper_half(other);
__m128i res_low = f(self_low, other_low);
__m128i res_high = f(self_high, other_high);
return merge_sse(res_low, res_high);
}
template <class F>
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
{
__m128i self_low, self_high;
split_avx(self, self_low, self_high);
__m128i self_low = lower_half(self), self_high = upper_half(self);
__m128i res_low = f(self_low, other);
__m128i res_high = f(self_high, other);
return merge_sse(res_low, res_high);
Expand Down Expand Up @@ -891,8 +897,7 @@ namespace xsimd
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
{
__m128i self_low, self_high;
detail::split_avx(self, self_low, self_high);
__m128i self_low = detail::lower_half(self), self_high = detail::upper_half(self);
return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
Expand Down Expand Up @@ -1049,8 +1054,7 @@ namespace xsimd
// Sum reduction for scalar element types on the avx architecture.
// The two halves of `self` are wrapped as sse4_2 batches and added lane-wise;
// the resulting half-width batch is then handed to reduce_add to produce the scalar.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
{
typename batch<T, sse4_2>::register_type low, high;
detail::split_avx(self, low, high);
typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
batch<T, sse4_2> blow(low), bhigh(high);
return reduce_add(blow + bhigh);
}
Expand Down Expand Up @@ -1081,8 +1085,7 @@ namespace xsimd
// Product reduction for scalar element types on the avx architecture.
// The two halves of `self` are wrapped as sse4_2 batches and multiplied lane-wise;
// the resulting half-width batch is then handed to reduce_mul to produce the scalar.
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
{
typename batch<T, sse4_2>::register_type low, high;
detail::split_avx(self, low, high);
typename batch<T, sse4_2>::register_type low = detail::lower_half(self), high = detail::upper_half(self);
batch<T, sse4_2> blow(low), bhigh(high);
return reduce_mul(blow * bhigh);
}
Expand Down Expand Up @@ -1132,14 +1135,11 @@ namespace xsimd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
__m128i cond_low, cond_hi;
detail::split_avx(cond, cond_low, cond_hi);
__m128i cond_low = detail::lower_half(cond), cond_hi = detail::upper_half(cond);

__m128i true_low, true_hi;
detail::split_avx(true_br, true_low, true_hi);
__m128i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);

__m128i false_low, false_hi;
detail::split_avx(false_br, false_low, false_hi);
__m128i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);

__m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
__m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
Expand Down
58 changes: 28 additions & 30 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,29 @@ namespace xsimd

namespace detail
{
XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
XSIMD_INLINE __m256 lower_half(__m512 self) noexcept
{
low = _mm512_castps512_ps256(val);
high = _mm512_extractf32x8_ps(val, 1);
return _mm512_castps512_ps256(self);
}
XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
XSIMD_INLINE __m256d lower_half(__m512d self) noexcept
{
low = _mm512_castpd512_pd256(val);
high = _mm512_extractf64x4_pd(val, 1);
return _mm512_castpd512_pd256(self);
}
XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
XSIMD_INLINE __m256i lower_half(__m512i self) noexcept
{
low = _mm512_castsi512_si256(val);
high = _mm512_extracti64x4_epi64(val, 1);
return _mm512_castsi512_si256(self);
}
// Upper 256 bits of a 512-bit single-precision register.
// _mm512_extractf32x8_ps is an AVX512DQ intrinsic, which a plain AVX512F target does
// not guarantee. The register is therefore reinterpreted as packed doubles and the
// upper half is taken with _mm512_extractf64x4_pd (AVX512F). Both casts only
// reinterpret bits, so the returned value is bit-identical.
XSIMD_INLINE __m256 upper_half(__m512 self) noexcept
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(self), 1));
}
// Upper 256 bits of a 512-bit double-precision register (256-bit lane index 1).
XSIMD_INLINE __m256d upper_half(__m512d self) noexcept
{
    __m256d const high = _mm512_extractf64x4_pd(self, 1);
    return high;
}
// Upper 256 bits of a 512-bit integer register (256-bit lane index 1).
XSIMD_INLINE __m256i upper_half(__m512i self) noexcept
{
    __m256i const high = _mm512_extracti64x4_epi64(self, 1);
    return high;
}
XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
{
Expand All @@ -71,27 +80,24 @@ namespace xsimd
// Applies the unary functor `f` to each 256-bit half of `self` separately and
// recombines the two results into one 512-bit register with merge_avx.
// `f` must be callable with a single __m256i and return a value convertible to __m256i.
template <class F>
__m512i fwd_to_avx(F f, __m512i self)
{
__m256i self_low, self_high;
split_avx512(self, self_low, self_high);
__m256i self_low = lower_half(self), self_high = upper_half(self);
__m256i res_low = f(self_low);
__m256i res_high = f(self_high);
return merge_avx(res_low, res_high);
}
// Applies the binary functor `f` half by half: the low halves of `self` and `other`
// are combined together, likewise the high halves, and the two 256-bit results are
// recombined into one 512-bit register with merge_avx.
// `f` must be callable with two __m256i arguments and return a value convertible to __m256i.
template <class F>
__m512i fwd_to_avx(F f, __m512i self, __m512i other)
{
__m256i self_low, self_high, other_low, other_high;
split_avx512(self, self_low, self_high);
split_avx512(other, other_low, other_high);
__m256i self_low = lower_half(self), self_high = upper_half(self),
other_low = lower_half(other), other_high = upper_half(other);
__m256i res_low = f(self_low, other_low);
__m256i res_high = f(self_high, other_high);
return merge_avx(res_low, res_high);
}
template <class F>
__m512i fwd_to_avx(F f, __m512i self, int32_t other)
{
__m256i self_low, self_high;
split_avx512(self, self_low, self_high);
__m256i self_low = lower_half(self), self_high = upper_half(self);
__m256i res_low = f(self_low, other);
__m256i res_high = f(self_high, other);
return merge_avx(res_low, res_high);
Expand Down Expand Up @@ -1624,8 +1630,7 @@ namespace xsimd
// Sum reduction for integral element types on the avx512f architecture.
// Each 256-bit half of `self` is wrapped as an avx2 batch and reduced on its own with
// the avx2 reduce_add; the two scalar partial sums are then added together.
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
__m256i low, high;
detail::split_avx512(self, low, high);
__m256i low = detail::lower_half(self), high = detail::upper_half(self);
batch<T, avx2> blow(low), bhigh(high);
return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
}
Expand Down Expand Up @@ -1676,8 +1681,7 @@ namespace xsimd
}
else
{
__m256i low, high;
detail::split_avx512(self, low, high);
__m256i low = detail::lower_half(self), high = detail::upper_half(self);
batch<T, avx2> blow(low), bhigh(high);
return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
}
Expand Down Expand Up @@ -1773,11 +1777,8 @@ namespace xsimd
__m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
__m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);

__m256i true_low, true_hi;
detail::split_avx512(true_br, true_low, true_hi);

__m256i false_low, false_hi;
detail::split_avx512(false_br, false_low, false_hi);
__m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
__m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);

__m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
__m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
Expand All @@ -1788,11 +1789,8 @@ namespace xsimd
__m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
__m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));

__m256i true_low, true_hi;
detail::split_avx512(true_br, true_low, true_hi);

__m256i false_low, false_hi;
detail::split_avx512(false_br, false_low, false_hi);
__m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br);
__m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br);

__m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
__m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
Expand Down
Loading