@@ -1688,8 +1688,61 @@ namespace xsimd
16881688 }
16891689
16901690 template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1691- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1691+ XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask , requires_arch<sse2>) noexcept
16921692 {
1693+ constexpr bool is_identity = detail::is_identity (mask);
1694+ constexpr bool is_dup_lo = detail::is_dup_lo (mask);
1695+ constexpr bool is_dup_hi = detail::is_dup_hi (mask);
1696+
1697+ XSIMD_IF_CONSTEXPR (is_identity)
1698+ {
1699+ return self;
1700+ }
1701+ XSIMD_IF_CONSTEXPR (is_dup_lo)
1702+ {
1703+ // permute the low half
1704+ constexpr int imm = detail::mod_shuffle (V0, V1, V2, V3);
1705+ const auto lo = _mm_shufflelo_epi16 (self, imm);
1706+ // broadcast that 64-bit low half into both halves
1707+ const auto lo_all = _mm_unpacklo_epi64 (lo, lo);
1708+ return lo_all;
1709+ }
1710+ XSIMD_IF_CONSTEXPR (is_dup_hi)
1711+ {
1712+ // permute the high half
1713+ constexpr int imm = detail::mod_shuffle (V4, V5, V6, V7);
1714+ const auto hi = _mm_shufflehi_epi16 (self, imm);
1715+ // broadcast that 64-bit high half into both halves
1716+ const auto hi_all = _mm_unpackhi_epi64 (hi, hi);
1717+ return hi_all;
1718+ }
1719+ // Only pick elements from the low lane
1720+ XSIMD_IF_CONSTEXPR ((V0 < 4 ) && (V1 < 4 ) && (V2 < 4 ) && (V3 < 4 ) && (V4 < 4 ) && (V5 < 4 ) && (V6 < 4 ) && (V7 < 4 ))
1721+ {
1722+ // permute within each sub lane
1723+ constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1724+ constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
1725+ __m128i lol = _mm_shufflelo_epi16 (self, mask_lo);
1726+ __m128i loh = _mm_shufflelo_epi16 (self, mask_hi);
1727+
1728+ // generate temporary lanes
1729+ return _mm_unpacklo_epi64 (lol, loh);
1730+ }
1731+ // Only pick elements from the high lane
1732+ XSIMD_IF_CONSTEXPR ((V0 >= 4 ) && (V1 >= 4 ) && (V2 >= 4 ) && (V3 >= 4 ) && (V4 >= 4 ) && (V5 >= 4 ) && (V6 >= 4 ) && (V7 >= 4 ))
1733+ {
1734+ // permute within each sub lane
1735+ constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
1736+ constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
1737+ __m128i hil = _mm_shufflehi_epi16 (self, mask_lo);
1738+ __m128i hih = _mm_shufflehi_epi16 (self, mask_hi);
1739+
1740+ // generate temporary lanes
1741+ return _mm_unpackhi_epi64 (hil, hih);
1742+ }
1743+
1744+ // Generic case
1745+
16931746 // permute within each sub lane
16941747 constexpr auto mask_lo = detail::mod_shuffle (V0, V1, V2, V3);
16951748 constexpr auto mask_hi = detail::mod_shuffle (V4, V5, V6, V7);
0 commit comments