Skip to content

Commit 49be336

Browse files
Add support for xsimd::transpose
Support 64-bit element types for AVX, NEON, SSE and WASM
1 parent 9906e71 commit 49be336

File tree

5 files changed

+109
-2
lines changed

5 files changed

+109
-2
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,6 +1646,36 @@ namespace xsimd
16461646
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
16471647
}
16481648

1649+
template <class A>
1650+
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
1651+
{
1652+
assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
1653+
(void)matrix_end;
1654+
auto r0 = matrix_begin[0], r1 = matrix_begin[1],
1655+
r2 = matrix_begin[2], r3 = matrix_begin[3];
1656+
1657+
auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11
1658+
auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13
1659+
auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31
1660+
auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33
1661+
1662+
matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20);
1663+
matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20);
1664+
matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31);
1665+
matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31);
1666+
}
1667+
1668+
template <class A>
1669+
XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
1670+
{
1671+
return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1672+
}
1673+
template <class A>
1674+
XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
1675+
{
1676+
return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1677+
}
1678+
16491679
// trunc
16501680
template <class A>
16511681
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,6 +1791,26 @@ namespace xsimd
17911791
matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
17921792
}
17931793

1794+
// 2x2 transpose of 64-bit unsigned rows on 128-bit NEON registers:
// row 0 of the result gathers the low halves, row 1 the high halves.
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto const row0 = matrix_begin[0];
    auto const row1 = matrix_begin[1];
    auto const lo = vcombine_u64(vget_low_u64(row0), vget_low_u64(row1));
    auto const hi = vcombine_u64(vget_high_u64(row0), vget_high_u64(row1));
    matrix_begin[0] = lo;
    matrix_begin[1] = hi;
}
1803+
1804+
// 2x2 transpose of 64-bit signed rows on 128-bit NEON registers:
// same scheme as the unsigned overload, with the signed intrinsics.
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto const row0 = matrix_begin[0];
    auto const row1 = matrix_begin[1];
    auto const lo = vcombine_s64(vget_low_s64(row0), vget_low_s64(row1));
    auto const hi = vcombine_s64(vget_high_s64(row0), vget_high_s64(row1));
    matrix_begin[0] = lo;
    matrix_begin[1] = hi;
}
1813+
17941814
/**********
17951815
* zip_lo *
17961816
**********/

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,36 @@ namespace xsimd
950950
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
951951
}
952952

953+
// 2x2 transpose of double rows on AArch64: vzip1q/vzip2q directly produce
// the interleaved (low-elements, high-elements) result rows.
template <class A>
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto const row0 = matrix_begin[0];
    auto const row1 = matrix_begin[1];
    matrix_begin[0] = vzip1q_f64(row0, row1); // row0[0] row1[0]
    matrix_begin[1] = vzip2q_f64(row0, row1); // row0[1] row1[1]
}
962+
963+
// 2x2 transpose of 64-bit unsigned rows on AArch64 via zip intrinsics.
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto const row0 = matrix_begin[0];
    auto const row1 = matrix_begin[1];
    matrix_begin[0] = vzip1q_u64(row0, row1); // row0[0] row1[0]
    matrix_begin[1] = vzip2q_u64(row0, row1); // row0[1] row1[1]
}
972+
973+
// 2x2 transpose of 64-bit signed rows on AArch64 via zip intrinsics.
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto const row0 = matrix_begin[0];
    auto const row1 = matrix_begin[1];
    matrix_begin[0] = vzip1q_s64(row0, row1); // row0[0] row1[0]
    matrix_begin[1] = vzip2q_s64(row0, row1); // row0[1] row1[1]
}
982+
953983
/**********
954984
* zip_lo *
955985
**********/

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,12 +1656,32 @@ namespace xsimd
16561656
template <class A>
16571657
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
16581658
{
1659-
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
1659+
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
16601660
}
16611661
template <class A>
16621662
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
16631663
{
1664-
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
1664+
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1665+
}
1666+
1667+
template <class A>
1668+
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
1669+
{
1670+
assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
1671+
(void)matrix_end;
1672+
auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1673+
matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
1674+
matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
1675+
}
1676+
template <class A>
1677+
XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
1678+
{
1679+
transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1680+
}
1681+
template <class A>
1682+
XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
1683+
{
1684+
transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
16651685
}
16661686

16671687
// zip_hi

include/xsimd/arch/xsimd_wasm.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,6 +1599,13 @@ namespace xsimd
15991599
matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
16001600
matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
16011601
}
1602+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1603+
{
1604+
auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1605+
1606+
matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2);
1607+
matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3);
1608+
}
16021609
else
16031610
{
16041611
transpose(matrix_begin, matrix_end, generic {});

0 commit comments

Comments
 (0)