Skip to content

Commit 9906e71

Browse files
Add support for xsimd::transpose
Currently only specialized for 32-bit types on SSE, AVX, NEON and WASM. The other implementations fall back to the sequential generic implementation. Fixes #107 (6 years old!)
1 parent 3747986 commit 9906e71

File tree

9 files changed

+217
-0
lines changed

9 files changed

+217
-0
lines changed

docs/source/api/data_transfer.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ In place:
6161

6262
Between batches:
6363

64+
+---------------------------------------+----------------------------------------------------+
65+
| :cpp:func:`transpose` | transpose a matrix as an array of batches |
6466
+---------------------------------------+----------------------------------------------------+
6567
| :cpp:func:`zip_lo` | interleave low halves of two batches |
6668
+---------------------------------------+----------------------------------------------------+

include/xsimd/arch/generic/xsimd_generic_memory.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,32 @@ namespace xsimd
639639
hi.store_unaligned(buffer + real_batch::size);
640640
}
641641

642+
// transpose
643+
template <class A, class T>
644+
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
645+
{
646+
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
647+
(void)matrix_end;
648+
alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
649+
for (size_t i = 0; i < batch<T, A>::size; ++i)
650+
{
651+
matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
652+
}
653+
// FIXME: this is super naive we can probably do better.
654+
for (size_t i = 0; i < batch<T, A>::size; ++i)
655+
{
656+
for (size_t j = 0; j < i; ++j)
657+
{
658+
std::swap(scratch_buffer[i * batch<T, A>::size + j],
659+
scratch_buffer[j * batch<T, A>::size + i]);
660+
}
661+
}
662+
for (size_t i = 0; i < batch<T, A>::size; ++i)
663+
{
664+
matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
665+
}
666+
}
667+
642668
}
643669

644670
}

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,6 +1594,57 @@ namespace xsimd
15941594
return bitwise_cast<T>(
15951595
swizzle(bitwise_cast<double>(self), mask));
15961596
}
1597+
// transpose
1598+
template <class A>
1599+
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
1600+
{
1601+
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1602+
(void)matrix_end;
1603+
// See
1604+
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
1605+
auto r0 = matrix_begin[0], r1 = matrix_begin[1],
1606+
r2 = matrix_begin[2], r3 = matrix_begin[3],
1607+
r4 = matrix_begin[4], r5 = matrix_begin[5],
1608+
r6 = matrix_begin[6], r7 = matrix_begin[7];
1609+
1610+
auto t0 = _mm256_unpacklo_ps(r0, r1);
1611+
auto t1 = _mm256_unpackhi_ps(r0, r1);
1612+
auto t2 = _mm256_unpacklo_ps(r2, r3);
1613+
auto t3 = _mm256_unpackhi_ps(r2, r3);
1614+
auto t4 = _mm256_unpacklo_ps(r4, r5);
1615+
auto t5 = _mm256_unpackhi_ps(r4, r5);
1616+
auto t6 = _mm256_unpacklo_ps(r6, r7);
1617+
auto t7 = _mm256_unpackhi_ps(r6, r7);
1618+
1619+
r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
1620+
r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
1621+
r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
1622+
r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
1623+
r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
1624+
r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
1625+
r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
1626+
r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));
1627+
1628+
matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
1629+
matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
1630+
matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
1631+
matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
1632+
matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
1633+
matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
1634+
matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
1635+
matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
1636+
}
1637+
1638+
template <class A>
1639+
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
1640+
{
1641+
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1642+
}
1643+
template <class A>
1644+
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
1645+
{
1646+
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1647+
}
15971648

15981649
// trunc
15991650
template <class A>

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1748,6 +1748,49 @@ namespace xsimd
17481748
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
17491749
}
17501750

1751+
/*************
1752+
* transpose *
1753+
*************/
1754+
template <class A>
1755+
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
1756+
{
1757+
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1758+
(void)matrix_end;
1759+
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1760+
auto t01 = vtrnq_f32(r0, r1);
1761+
auto t23 = vtrnq_f32(r2, r3);
1762+
matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
1763+
matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
1764+
matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
1765+
matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
1766+
}
1767+
template <class A>
1768+
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
1769+
{
1770+
assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
1771+
(void)matrix_end;
1772+
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1773+
auto t01 = vtrnq_u32(r0, r1);
1774+
auto t23 = vtrnq_u32(r2, r3);
1775+
matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
1776+
matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
1777+
matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
1778+
matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
1779+
}
1780+
template <class A>
1781+
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
1782+
{
1783+
assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
1784+
(void)matrix_end;
1785+
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1786+
auto t01 = vtrnq_s32(r0, r1);
1787+
auto t23 = vtrnq_s32(r2, r3);
1788+
matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
1789+
matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
1790+
matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
1791+
matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
1792+
}
1793+
17511794
/**********
17521795
* zip_lo *
17531796
**********/
@@ -2737,6 +2780,7 @@ namespace xsimd
27372780
return set(batch<T, A>(), A(), data[idx]...);
27382781
}
27392782
}
2783+
27402784
}
27412785

27422786
#undef WRAP_BINARY_INT_EXCLUDING_64

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,7 @@ namespace xsimd
949949
{
950950
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
951951
}
952+
952953
/**********
953954
* zip_lo *
954955
**********/

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1640,6 +1640,30 @@ namespace xsimd
16401640
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
16411641
}
16421642

1643+
// transpose
1644+
template <class A>
1645+
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
1646+
{
1647+
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1648+
(void)matrix_end;
1649+
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1650+
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
1651+
matrix_begin[0] = r0;
1652+
matrix_begin[1] = r1;
1653+
matrix_begin[2] = r2;
1654+
matrix_begin[3] = r3;
1655+
}
1656+
template <class A>
1657+
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
1658+
{
1659+
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
1660+
}
1661+
template <class A>
1662+
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
1663+
{
1664+
transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
1665+
}
1666+
16431667
// zip_hi
16441668
template <class A>
16451669
XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept

include/xsimd/arch/xsimd_wasm.hpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ namespace xsimd
3939
XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
4040
template <class A, class T>
4141
XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
42+
template <class A, class T>
43+
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept;
4244

4345
// abs
4446
template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
@@ -1576,6 +1578,33 @@ namespace xsimd
15761578
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
15771579
}
15781580

1581+
// transpose
1582+
template <class A, class T>
1583+
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept
1584+
{
1585+
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1586+
(void)matrix_end;
1587+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1588+
{
1589+
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1590+
1591+
auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
1592+
auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]
1593+
1594+
auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
1595+
auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]
1596+
1597+
matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
1598+
matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
1599+
matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
1600+
matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
1601+
}
1602+
else
1603+
{
1604+
transpose(matrix_begin, matrix_end, generic {});
1605+
}
1606+
}
1607+
15791608
// trunc
15801609
template <class A>
15811610
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept

include/xsimd/types/xsimd_api.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2516,6 +2516,23 @@ namespace xsimd
25162516
return batch_cast<as_integer_t<T>>(x);
25172517
}
25182518

2519+
/**
2520+
* @ingroup batch_data_transfer
2521+
*
2522+
* Transposes in place the matrix whose line are each of the batch passed as
2523+
* argument.
2524+
* @param matrix_begin pointer to the first line of the matrix to transpose
2525+
* @param matrix_end pointer to one element after the last line of the matrix to transpose
2526+
*
2527+
*/
2528+
template <class T, class A>
2529+
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
2530+
{
2531+
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
2532+
detail::static_check_supported_config<T, A>();
2533+
return kernel::transpose(matrix_begin, matrix_end, A {});
2534+
}
2535+
25192536
/**
25202537
* @ingroup batch_rounding
25212538
*

test/test_shuffle.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,25 @@ struct shuffle_test
605605
}
606606
}
607607

608+
void transpose()
609+
{
610+
B b_lhs = B::load_unaligned(lhs.data());
611+
std::array<B, size> b_matrix;
612+
for (size_t i = 0; i < size; ++i)
613+
b_matrix[i] = b_lhs;
614+
std::array<value_type, size * size> ref_matrix;
615+
for (size_t i = 0; i < size; ++i)
616+
for (size_t j = 0; j < size; ++j)
617+
ref_matrix[i * size + j] = lhs[i];
618+
619+
INFO("transpose");
620+
xsimd::transpose(b_matrix.data(), b_matrix.data() + b_matrix.size());
621+
for (size_t i = 0; i < size; ++i)
622+
{
623+
CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
624+
}
625+
}
626+
608627
void select()
609628
{
610629
B b_lhs = B::load_unaligned(lhs.data());
@@ -694,6 +713,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch<uint32_t>, xs
694713
{
695714
Test.swizzle();
696715
}
716+
SUBCASE("transpose")
717+
{
718+
Test.transpose();
719+
}
697720
SUBCASE("zip")
698721
{
699722
Test.zip();

0 commit comments

Comments
 (0)