@@ -1640,6 +1640,31 @@ namespace xsimd
16401640 return bitwise_cast<int32_t >(swizzle (bitwise_cast<uint32_t >(self), mask, sse2 {}));
16411641 }
16421642
1643+ template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1644+ XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1645+ {
1646+ // permute within each lane
1647+ constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1648+ constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
1649+ __m128i lo = _mm_shufflelo_epi16 (self, mask_lo);
1650+ __m128i hi = _mm_shufflehi_epi16 (self, mask_hi);
1651+
1652+ __m128i lo_lo = _mm_castpd_si128 (_mm_shuffle_pd (_mm_castsi128_pd (lo), _mm_castsi128_pd (lo), _MM_SHUFFLE2 (0 , 0 )));
1653+ __m128i hi_hi = _mm_castpd_si128 (_mm_shuffle_pd (_mm_castsi128_pd (hi), _mm_castsi128_pd (hi), _MM_SHUFFLE2 (1 , 1 )));
1654+
1655+ // mask to choose the right lane
1656+ batch_bool_constant<uint16_t , A, (V0 < 4 ), (V1 < 4 ), (V2 < 4 ), (V3 < 4 ), (V4 < 4 ), (V5 < 4 ), (V6 < 4 ), (V7 < 4 )> blend_mask;
1657+
1658+ // blend the two permutes
1659+ return select (blend_mask, batch<uint16_t , A>(lo_lo), batch<uint16_t , A>(hi_hi));
1660+ }
1661+
1662+ template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1663+ XSIMD_INLINE batch<int16_t , A> swizzle (batch<int16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
1664+ {
1665+ return bitwise_cast<int16_t >(swizzle (bitwise_cast<uint16_t >(self), mask, sse2 {}));
1666+ }
1667+
16431668 // transpose
16441669 template <class A >
16451670 XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<sse2>) noexcept
0 commit comments