@@ -459,6 +459,7 @@ namespace xsimd
459459 XSIMD_INLINE batch<T, A> shuffle (batch<T, A> const & x, batch<T, A> const & y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
460460 {
461461 constexpr size_t bsize = sizeof ...(Indices);
462+ static_assert (bsize == batch<T, A>::size, " valid shuffle" );
462463
463464 // Detect common patterns
464465 XSIMD_IF_CONSTEXPR (detail::is_swizzle_fst (bsize, Indices...))
@@ -486,14 +487,15 @@ namespace xsimd
486487 return select (batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
487488 }
488489
489- #if defined(__has_builtin)
490- #if __has_builtin(__builtin_shuffle_vector )
491- #define builtin_shuffle __builtin_shuffle_vector
490+ #if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED)
491+ #if __has_builtin(__builtin_shufflevector )
492+ #define builtin_shuffle __builtin_shufflevector
492493#endif
493494#endif
494495
495496#if defined(builtin_shuffle)
496- return builtin_shuffle (x.data , y.data , Indices...);
497+ typedef T vty __attribute__ ((__vector_size__ (sizeof (batch<T, A>))));
498+ return (typename batch<T, A>::register_type)builtin_shuffle ((vty)x.data , (vty)y.data , Indices...);
497499
498500// FIXME: my experiments show that GCC only correctly optimizes this builtin
499501// starting at GCC 13, where it already has __builtin_shufflevector
0 commit comments