@@ -1446,21 +1446,20 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+            // swap the 128-bit lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
-            // normalize mask
-            batch<uint64_t, A> half_mask = -(mask & 1);
+            // The half-mask value is the index modulo 2, but the intrinsic expects it
+            // in the second least significant bit. Negation is a cheap alternative to
+            // a left shift.
+            batch<uint64_t, A> half_mask = -(mask & 0b1u);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(self, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
 
-            // mask to choose the right lane
-            batch_bool<uint64_t, A> blend_mask = mask >= 2;
-
-            // blend the two permutes
+            // select the cross-lane permute wherever bit 1 of the requested index
+            // differs from the lane bit of the slot itself (0, 0, 2, 2)
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 2, 2> {};
+            batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
             return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
         }
 
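For reference, below is a minimal standalone sketch of the same three-step technique (swap the lanes, permute within each lane, blend across lanes), written directly against the intrinsics with a scalar check of the result. It is an illustration, not xsimd code: the integer helpers it uses (_mm256_and_si256, _mm256_sub_epi64, _mm256_srli_epi64, _mm256_xor_si256) are AVX2 intrinsics, whereas the kernel above gets the same effect from xsimd's batch<uint64_t, A> operators, which stay within plain AVX.

// sketch only: compile with e.g. g++ -mavx2 swizzle_sketch.cpp
#include <immintrin.h>
#include <cstdio>

int main()
{
    // input [10, 11, 12, 13] and an arbitrary per-element index pattern
    const long long idx[4] = { 3, 1, 0, 2 };
    __m256d self = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);
    __m256i mask = _mm256_setr_epi64x(idx[0], idx[1], idx[2], idx[3]);

    // step 1: swap the two 128-bit lanes: [12, 13, 10, 11]
    __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01);

    // step 2: _mm256_permutevar_pd reads its selector from bit 1 of each
    // 64-bit element, so negating (mask & 1) sets that bit for odd indices
    __m256i half_mask = _mm256_sub_epi64(
        _mm256_setzero_si256(), _mm256_and_si256(mask, _mm256_set1_epi64x(1)));
    __m256d r0 = _mm256_permutevar_pd(self, half_mask);    // same-lane picks
    __m256d r1 = _mm256_permutevar_pd(swapped, half_mask); // other-lane picks

    // step 3: take r1 wherever bit 1 of the index differs from the lane bit
    // of the slot itself (0, 0, 2, 2); blendv keys on the sign bit, so the
    // single-bit difference is widened to all 64 bits by negation
    __m256i lane = _mm256_setr_epi64x(0, 0, 2, 2);
    __m256i diff = _mm256_xor_si256(_mm256_and_si256(mask, _mm256_set1_epi64x(2)), lane);
    __m256i blend = _mm256_sub_epi64(_mm256_setzero_si256(), _mm256_srli_epi64(diff, 1));
    __m256d out = _mm256_blendv_pd(r0, r1, _mm256_castsi256_pd(blend));

    double got[4];
    _mm256_storeu_pd(got, out);
    for (int i = 0; i < 4; ++i)
        std::printf("out[%d] = %g, expected %g\n", i, got[i], 10.0 + idx[i]);
    return 0;
}

Every result element lives either in its own 128-bit lane (covered by r0) or in the opposite one (covered by r1), which is why two in-lane permutes plus a single blend can realize any dynamic index pattern on AVX, an ISA that has no cross-lane variable permute.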