@@ -1646,6 +1646,36 @@ namespace xsimd
16461646 return transpose (reinterpret_cast <batch<float , A>*>(matrix_begin), reinterpret_cast <batch<float , A>*>(matrix_end), A {});
16471647 }
16481648
1649+ template <class A >
1650+ XSIMD_INLINE void transpose (batch<double , A>* matrix_begin, batch<double , A>* matrix_end, requires_arch<avx>) noexcept
1651+ {
1652+ assert ((matrix_end - matrix_begin == batch<double , A>::size) && " correctly sized matrix" );
1653+ (void )matrix_end;
1654+ auto r0 = matrix_begin[0 ], r1 = matrix_begin[1 ],
1655+ r2 = matrix_begin[2 ], r3 = matrix_begin[3 ];
1656+
1657+ auto t0 = _mm256_unpacklo_pd (r0, r1); // r00 r10 r01 r11
1658+ auto t1 = _mm256_unpackhi_pd (r0, r1); // r02 r12 r03 r13
1659+ auto t2 = _mm256_unpacklo_pd (r2, r3); // r20 r30 r21 r31
1660+ auto t3 = _mm256_unpackhi_pd (r2, r3); // r22 r32 r23 r33
1661+
1662+ matrix_begin[0 ] = _mm256_permute2f128_pd (t0, t2, 0x20 );
1663+ matrix_begin[1 ] = _mm256_permute2f128_pd (t1, t3, 0x20 );
1664+ matrix_begin[2 ] = _mm256_permute2f128_pd (t0, t2, 0x31 );
1665+ matrix_begin[3 ] = _mm256_permute2f128_pd (t1, t3, 0x31 );
1666+ }
1667+
1668+ template <class A >
1669+ XSIMD_INLINE void transpose (batch<uint64_t , A>* matrix_begin, batch<uint64_t , A>* matrix_end, requires_arch<avx>) noexcept
1670+ {
1671+ return transpose (reinterpret_cast <batch<double , A>*>(matrix_begin), reinterpret_cast <batch<double , A>*>(matrix_end), A {});
1672+ }
1673+ template <class A >
1674+ XSIMD_INLINE void transpose (batch<int64_t , A>* matrix_begin, batch<int64_t , A>* matrix_end, requires_arch<avx>) noexcept
1675+ {
1676+ return transpose (reinterpret_cast <batch<double , A>*>(matrix_begin), reinterpret_cast <batch<double , A>*>(matrix_end), A {});
1677+ }
1678+
16491679 // trunc
16501680 template <class A >
16511681 XSIMD_INLINE batch<float , A> trunc (batch<float , A> const & self, requires_arch<avx>) noexcept
0 commit comments