Skip to content

Commit 401e149

Browse files
avx implementation of transpose for [u]int[8|16]
1 parent d12b767 commit 401e149

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ namespace xsimd
2929
template <class A, class T, size_t I>
3030
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
3131

32+
// Forward declarations so the AVX transpose specializations defined later in
// this file can dispatch to the generic 16-bit / 8-bit kernels before their
// definitions have been seen.
template <class A>
XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<generic>) noexcept;
template <class A>
XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<generic>) noexcept;
3237
namespace detail
3338
{
3439
XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
@@ -1676,6 +1681,78 @@ namespace xsimd
16761681
return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
16771682
}
16781683

1684+
template <class A>
1685+
XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<avx>) noexcept
1686+
{
1687+
assert((matrix_end - matrix_begin == batch<uint16_t, A>::size) && "correctly sized matrix");
1688+
(void)matrix_end;
1689+
batch<uint16_t, sse4_2> tmp_lo0[8];
1690+
for (int i = 0; i < 8; ++i)
1691+
tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
1692+
transpose(tmp_lo0 + 0, tmp_lo0 + 8, sse4_2 {});
1693+
1694+
batch<uint16_t, sse4_2> tmp_hi0[8];
1695+
for (int i = 0; i < 8; ++i)
1696+
tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[8 + i]);
1697+
transpose(tmp_hi0 + 0, tmp_hi0 + 8, sse4_2 {});
1698+
1699+
batch<uint16_t, sse4_2> tmp_lo1[8];
1700+
for (int i = 0; i < 8; ++i)
1701+
tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
1702+
transpose(tmp_lo1 + 0, tmp_lo1 + 8, sse4_2 {});
1703+
1704+
batch<uint16_t, sse4_2> tmp_hi1[8];
1705+
for (int i = 0; i < 8; ++i)
1706+
tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[8 + i], 1);
1707+
transpose(tmp_hi1 + 0, tmp_hi1 + 8, sse4_2 {});
1708+
1709+
for (int i = 0; i < 8; ++i)
1710+
matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
1711+
for (int i = 0; i < 8; ++i)
1712+
matrix_begin[i + 8] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
1713+
}
1714+
template <class A>
1715+
XSIMD_INLINE void transpose(batch<int16_t, A>* matrix_begin, batch<int16_t, A>* matrix_end, requires_arch<avx>) noexcept
1716+
{
1717+
return transpose(reinterpret_cast<batch<uint16_t, A>*>(matrix_begin), reinterpret_cast<batch<uint16_t, A>*>(matrix_end), A {});
1718+
}
1719+
1720+
template <class A>
1721+
XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<avx>) noexcept
1722+
{
1723+
assert((matrix_end - matrix_begin == batch<uint8_t, A>::size) && "correctly sized matrix");
1724+
(void)matrix_end;
1725+
batch<uint8_t, sse4_2> tmp_lo0[16];
1726+
for (int i = 0; i < 16; ++i)
1727+
tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
1728+
transpose(tmp_lo0 + 0, tmp_lo0 + 16, sse4_2 {});
1729+
1730+
batch<uint8_t, sse4_2> tmp_hi0[16];
1731+
for (int i = 0; i < 16; ++i)
1732+
tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[16 + i]);
1733+
transpose(tmp_hi0 + 0, tmp_hi0 + 16, sse4_2 {});
1734+
1735+
batch<uint8_t, sse4_2> tmp_lo1[16];
1736+
for (int i = 0; i < 16; ++i)
1737+
tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
1738+
transpose(tmp_lo1 + 0, tmp_lo1 + 16, sse4_2 {});
1739+
1740+
batch<uint8_t, sse4_2> tmp_hi1[16];
1741+
for (int i = 0; i < 16; ++i)
1742+
tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[16 + i], 1);
1743+
transpose(tmp_hi1 + 0, tmp_hi1 + 16, sse4_2 {});
1744+
1745+
for (int i = 0; i < 16; ++i)
1746+
matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
1747+
for (int i = 0; i < 16; ++i)
1748+
matrix_begin[i + 16] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
1749+
}
1750+
template <class A>
1751+
XSIMD_INLINE void transpose(batch<int8_t, A>* matrix_begin, batch<int8_t, A>* matrix_end, requires_arch<avx>) noexcept
1752+
{
1753+
return transpose(reinterpret_cast<batch<uint8_t, A>*>(matrix_begin), reinterpret_cast<batch<uint8_t, A>*>(matrix_end), A {});
1754+
}
1755+
16791756
// trunc
16801757
template <class A>
16811758
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept

0 commit comments

Comments
 (0)