@@ -1594,6 +1594,57 @@ namespace xsimd
15941594 return bitwise_cast<T>(
15951595 swizzle (bitwise_cast<double >(self), mask));
15961596 }
1597+ // transpose
1598+ template <class A >
1599+ XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<avx>) noexcept
1600+ {
1601+ assert ((matrix_end - matrix_begin == batch<float , A>::size) && " correctly sized matrix" );
1602+ (void )matrix_end;
1603+ // See
1604+ // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
1605+ auto r0 = matrix_begin[0 ], r1 = matrix_begin[1 ],
1606+ r2 = matrix_begin[2 ], r3 = matrix_begin[3 ],
1607+ r4 = matrix_begin[4 ], r5 = matrix_begin[5 ],
1608+ r6 = matrix_begin[6 ], r7 = matrix_begin[7 ];
1609+
1610+ auto t0 = _mm256_unpacklo_ps (r0, r1);
1611+ auto t1 = _mm256_unpackhi_ps (r0, r1);
1612+ auto t2 = _mm256_unpacklo_ps (r2, r3);
1613+ auto t3 = _mm256_unpackhi_ps (r2, r3);
1614+ auto t4 = _mm256_unpacklo_ps (r4, r5);
1615+ auto t5 = _mm256_unpackhi_ps (r4, r5);
1616+ auto t6 = _mm256_unpacklo_ps (r6, r7);
1617+ auto t7 = _mm256_unpackhi_ps (r6, r7);
1618+
1619+ r0 = _mm256_shuffle_ps (t0, t2, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
1620+ r1 = _mm256_shuffle_ps (t0, t2, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
1621+ r2 = _mm256_shuffle_ps (t1, t3, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
1622+ r3 = _mm256_shuffle_ps (t1, t3, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
1623+ r4 = _mm256_shuffle_ps (t4, t6, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
1624+ r5 = _mm256_shuffle_ps (t4, t6, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
1625+ r6 = _mm256_shuffle_ps (t5, t7, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
1626+ r7 = _mm256_shuffle_ps (t5, t7, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
1627+
1628+ matrix_begin[0 ] = _mm256_permute2f128_ps (r0, r4, 0x20 );
1629+ matrix_begin[1 ] = _mm256_permute2f128_ps (r1, r5, 0x20 );
1630+ matrix_begin[2 ] = _mm256_permute2f128_ps (r2, r6, 0x20 );
1631+ matrix_begin[3 ] = _mm256_permute2f128_ps (r3, r7, 0x20 );
1632+ matrix_begin[4 ] = _mm256_permute2f128_ps (r0, r4, 0x31 );
1633+ matrix_begin[5 ] = _mm256_permute2f128_ps (r1, r5, 0x31 );
1634+ matrix_begin[6 ] = _mm256_permute2f128_ps (r2, r6, 0x31 );
1635+ matrix_begin[7 ] = _mm256_permute2f128_ps (r3, r7, 0x31 );
1636+ }
1637+
1638+ template <class A >
1639+ XSIMD_INLINE void transpose (batch<uint32_t , A>* matrix_begin, batch<uint32_t , A>* matrix_end, requires_arch<avx>) noexcept
1640+ {
1641+ return transpose (reinterpret_cast <batch<float , A>*>(matrix_begin), reinterpret_cast <batch<float , A>*>(matrix_end), A {});
1642+ }
1643+ template <class A >
1644+ XSIMD_INLINE void transpose (batch<int32_t , A>* matrix_begin, batch<int32_t , A>* matrix_end, requires_arch<avx>) noexcept
1645+ {
1646+ return transpose (reinterpret_cast <batch<float , A>*>(matrix_begin), reinterpret_cast <batch<float , A>*>(matrix_end), A {});
1647+ }
15971648
15981649 // trunc
15991650 template <class A >
0 commit comments