Skip to content

Commit 9d41ad9

Browse files
AntoinePrv authored and serge-sans-paille committed
Remove one permute from swizzle double
1 parent cfc6717 commit 9d41ad9

File tree

2 files changed

+11
-11
lines changed

2 files changed

+11
-11
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,21 +1446,20 @@ namespace xsimd
14461446
template <class A>
14471447
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
14481448
{
1449-
// duplicate low and high part of input
1450-
__m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
1451-
__m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
1449+
// swap lanes
1450+
__m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
14521451

1453-
// normalize mask
1454-
batch<uint64_t, A> half_mask = -(mask & 1);
1452+
// The half mask value is found in mask modulo 2, but the intrinsic expects it in the
1453+
// second least significant bit. We use negation as a cheap alternative to a left shift.
1454+
batch<uint64_t, A> half_mask = -(mask & 0b1u);
14551455

14561456
// permute within each lane
1457-
__m256d r0 = _mm256_permutevar_pd(lo, half_mask);
1458-
__m256d r1 = _mm256_permutevar_pd(hi, half_mask);
1457+
__m256d r0 = _mm256_permutevar_pd(self, half_mask);
1458+
__m256d r1 = _mm256_permutevar_pd(swapped, half_mask);
14591459

1460-
// mask to choose the right lane
1461-
batch_bool<uint64_t, A> blend_mask = mask >= 2;
1462-
1463-
// blend the two permutes
1460+
// select lane by the mask index divided by 2
1461+
constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 2, 2> {};
1462+
batch_bool<uint64_t, A> blend_mask = (mask & 0b10u) != lane;
14641463
return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
14651464
}
14661465

test/test_batch_manip.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ struct swizzle_test
247247
using idx_t = typename xsimd::as_index<value_type>::type;
248248
auto idx_batch = xsimd::make_batch_constant<idx_t, Pattern<idx_t>, arch_type>();
249249

250+
CAPTURE(idx_batch.as_batch());
250251
CHECK_BATCH_EQ(xsimd::swizzle(b_lhs, idx_batch), b_expect);
251252
CHECK_BATCH_EQ(xsimd::swizzle(b_lhs,
252253
static_cast<xsimd::batch<idx_t, arch_type>>(idx_batch)),

0 commit comments

Comments
 (0)