Skip to content

Commit ef4a389

Browse files
Use dedicated unpack intrinsics instead of a shuffle in the sse2 swizzle for 16-bit (short) batches
1 parent 825d298 commit ef4a389

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1690,16 +1690,17 @@ namespace xsimd
16901690
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
16911691
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
16921692
{
1693-
// permute within each lane
1693+
// permute within each sub lane
16941694
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
16951695
constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
16961696
__m128i lol = _mm_shufflelo_epi16(self, mask_lo);
16971697
__m128i loh = _mm_shufflelo_epi16(self, mask_hi);
16981698
__m128i hil = _mm_shufflehi_epi16(self, mask_lo);
16991699
__m128i hih = _mm_shufflehi_epi16(self, mask_hi);
17001700

1701-
__m128i lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lol), _mm_castsi128_pd(loh), _MM_SHUFFLE2(0, 0)));
1702-
__m128i hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hil), _mm_castsi128_pd(hih), _MM_SHUFFLE2(1, 1)));
1701+
// generate temporary lanes
1702+
__m128i lo = _mm_unpacklo_epi64(lol, loh);
1703+
__m128i hi = _mm_unpackhi_epi64(hil, hih);
17031704

17041705
// mask to choose the right lane
17051706
batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;

0 commit comments

Comments
 (0)