@@ -1446,21 +1446,20 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+            // swap the 128-bit lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
-            // normalize mask
-            batch<uint64_t, A> half_mask = -(mask & 1);
+            // The half-mask value is the index modulo 2, but the intrinsic expects it
+            // in the second least significant bit. Negation is a cheap alternative to
+            // a left shift.
+            batch<uint64_t, A> half_mask = -(mask & 0b1u);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(self, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
 
-            // mask to choose the right lane
-            batch_bool<uint64_t, A> blend_mask = mask >= 2;
-
-            // blend the two permutes
+            // select the cross-lane permute wherever bit 1 of the requested index
+            // differs from the lane bit of the slot itself (0, 0, 2, 2)
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 2, 2> {};
+            batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
             return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
         }
 
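For reference, below is a minimal standalone sketch of the same three-step technique (swap the lanes, permute within each lane, blend across lanes), written directly against the intrinsics with a scalar check of the result. It is an illustration, not xsimd code: the integer helpers it uses (_mm256_and_si256, _mm256_sub_epi64, _mm256_srli_epi64, _mm256_xor_si256) are AVX2 intrinsics, whereas the kernel above gets the same effect from xsimd's batch<uint64_t, A> operators, which stay within plain AVX.

// sketch only: compile with e.g. g++ -mavx2 swizzle_sketch.cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    // input [10, 11, 12, 13] and an arbitrary per-element index pattern
    const long long idx[4] = { 3, 1, 0, 2 };
    __m256d self = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);
    __m256i mask = _mm256_setr_epi64x(idx[0], idx[1], idx[2], idx[3]);

    // step 1: swap the two 128-bit lanes: [12, 13, 10, 11]
    __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01);

    // step 2: _mm256_permutevar_pd reads its selector from bit 1 of each
    // 64-bit element, so negating (mask & 1) sets that bit for odd indices
    __m256i half_mask = _mm256_sub_epi64(
        _mm256_setzero_si256(), _mm256_and_si256(mask, _mm256_set1_epi64x(1)));
    __m256d r0 = _mm256_permutevar_pd(self, half_mask);    // same-lane picks
    __m256d r1 = _mm256_permutevar_pd(swapped, half_mask); // other-lane picks

    // step 3: take r1 wherever bit 1 of the index differs from the lane bit
    // of the slot itself (0, 0, 2, 2); blendv keys on the sign bit, so the
    // single-bit difference is widened to all 64 bits by negation
    __m256i lane = _mm256_setr_epi64x(0, 0, 2, 2);
    __m256i diff = _mm256_xor_si256(_mm256_and_si256(mask, _mm256_set1_epi64x(2)), lane);
    __m256i blend = _mm256_sub_epi64(_mm256_setzero_si256(), _mm256_srli_epi64(diff, 1));
    __m256d out = _mm256_blendv_pd(r0, r1, _mm256_castsi256_pd(blend));

    double got[4];
    _mm256_storeu_pd(got, out);
    for (int i = 0; i < 4; ++i)
        std::printf("out[%d] = %g, expected %g\n", i, got[i], 10.0 + idx[i]);
    return 0;
}

Every result element lives either in its own 128-bit lane (covered by r0) or in the opposite one (covered by r1), which is why two in-lane permutes plus a single blend can realize any dynamic index pattern on AVX, an ISA that has no cross-lane variable permute.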