@@ -29,6 +29,11 @@ namespace xsimd
        // Forward declaration of the generic (architecture-agnostic) insert
        // fallback so it is visible to code later in this header.
        template <class A, class T, size_t I>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;

        // Forward declarations of the generic 16-bit and 8-bit transpose
        // fallbacks, declared here so they are visible to overloads defined
        // later in this header.
        template <class A>
        XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<generic>) noexcept;
        template <class A>
        XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<generic>) noexcept;

3237 namespace detail
3338 {
3439 XSIMD_INLINE void split_avx (__m256i val, __m128i& low, __m128i& high) noexcept
@@ -1676,6 +1681,78 @@ namespace xsimd
16761681 return transpose (reinterpret_cast <batch<double , A>*>(matrix_begin), reinterpret_cast <batch<double , A>*>(matrix_end), A {});
16771682 }
16781683
1684+ template <class A >
1685+ XSIMD_INLINE void transpose (batch<uint16_t , A>* matrix_begin, batch<uint16_t , A>* matrix_end, requires_arch<avx>) noexcept
1686+ {
1687+ assert ((matrix_end - matrix_begin == batch<uint16_t , A>::size) && " correctly sized matrix" );
1688+ (void )matrix_end;
1689+ batch<uint16_t , sse4_2> tmp_lo0[8 ];
1690+ for (int i = 0 ; i < 8 ; ++i)
1691+ tmp_lo0[i] = _mm256_castsi256_si128 (matrix_begin[i]);
1692+ transpose (tmp_lo0 + 0 , tmp_lo0 + 8 , sse4_2 {});
1693+
1694+ batch<uint16_t , sse4_2> tmp_hi0[8 ];
1695+ for (int i = 0 ; i < 8 ; ++i)
1696+ tmp_hi0[i] = _mm256_castsi256_si128 (matrix_begin[8 + i]);
1697+ transpose (tmp_hi0 + 0 , tmp_hi0 + 8 , sse4_2 {});
1698+
1699+ batch<uint16_t , sse4_2> tmp_lo1[8 ];
1700+ for (int i = 0 ; i < 8 ; ++i)
1701+ tmp_lo1[i] = _mm256_extractf128_si256 (matrix_begin[i], 1 );
1702+ transpose (tmp_lo1 + 0 , tmp_lo1 + 8 , sse4_2 {});
1703+
1704+ batch<uint16_t , sse4_2> tmp_hi1[8 ];
1705+ for (int i = 0 ; i < 8 ; ++i)
1706+ tmp_hi1[i] = _mm256_extractf128_si256 (matrix_begin[8 + i], 1 );
1707+ transpose (tmp_hi1 + 0 , tmp_hi1 + 8 , sse4_2 {});
1708+
1709+ for (int i = 0 ; i < 8 ; ++i)
1710+ matrix_begin[i] = detail::merge_sse (tmp_lo0[i], tmp_hi0[i]);
1711+ for (int i = 0 ; i < 8 ; ++i)
1712+ matrix_begin[i + 8 ] = detail::merge_sse (tmp_lo1[i], tmp_hi1[i]);
1713+ }
1714+ template <class A >
1715+ XSIMD_INLINE void transpose (batch<int16_t , A>* matrix_begin, batch<int16_t , A>* matrix_end, requires_arch<avx>) noexcept
1716+ {
1717+ return transpose (reinterpret_cast <batch<uint16_t , A>*>(matrix_begin), reinterpret_cast <batch<uint16_t , A>*>(matrix_end), A {});
1718+ }
1719+
1720+ template <class A >
1721+ XSIMD_INLINE void transpose (batch<uint8_t , A>* matrix_begin, batch<uint8_t , A>* matrix_end, requires_arch<avx>) noexcept
1722+ {
1723+ assert ((matrix_end - matrix_begin == batch<uint8_t , A>::size) && " correctly sized matrix" );
1724+ (void )matrix_end;
1725+ batch<uint8_t , sse4_2> tmp_lo0[16 ];
1726+ for (int i = 0 ; i < 16 ; ++i)
1727+ tmp_lo0[i] = _mm256_castsi256_si128 (matrix_begin[i]);
1728+ transpose (tmp_lo0 + 0 , tmp_lo0 + 16 , sse4_2 {});
1729+
1730+ batch<uint8_t , sse4_2> tmp_hi0[16 ];
1731+ for (int i = 0 ; i < 16 ; ++i)
1732+ tmp_hi0[i] = _mm256_castsi256_si128 (matrix_begin[16 + i]);
1733+ transpose (tmp_hi0 + 0 , tmp_hi0 + 16 , sse4_2 {});
1734+
1735+ batch<uint8_t , sse4_2> tmp_lo1[16 ];
1736+ for (int i = 0 ; i < 16 ; ++i)
1737+ tmp_lo1[i] = _mm256_extractf128_si256 (matrix_begin[i], 1 );
1738+ transpose (tmp_lo1 + 0 , tmp_lo1 + 16 , sse4_2 {});
1739+
1740+ batch<uint8_t , sse4_2> tmp_hi1[16 ];
1741+ for (int i = 0 ; i < 16 ; ++i)
1742+ tmp_hi1[i] = _mm256_extractf128_si256 (matrix_begin[16 + i], 1 );
1743+ transpose (tmp_hi1 + 0 , tmp_hi1 + 16 , sse4_2 {});
1744+
1745+ for (int i = 0 ; i < 16 ; ++i)
1746+ matrix_begin[i] = detail::merge_sse (tmp_lo0[i], tmp_hi0[i]);
1747+ for (int i = 0 ; i < 16 ; ++i)
1748+ matrix_begin[i + 16 ] = detail::merge_sse (tmp_lo1[i], tmp_hi1[i]);
1749+ }
1750+ template <class A >
1751+ XSIMD_INLINE void transpose (batch<int8_t , A>* matrix_begin, batch<int8_t , A>* matrix_end, requires_arch<avx>) noexcept
1752+ {
1753+ return transpose (reinterpret_cast <batch<uint8_t , A>*>(matrix_begin), reinterpret_cast <batch<uint8_t , A>*>(matrix_end), A {});
1754+ }
1755+
16791756 // trunc
16801757 template <class A >
16811758 XSIMD_INLINE batch<float , A> trunc (batch<float , A> const & self, requires_arch<avx>) noexcept
0 commit comments