@@ -572,64 +572,56 @@ namespace xsimd
572572 }
573573
574574 // load_unaligned<batch_bool>
575- namespace detail
575+
576+ template <class T , class A , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
577+ XSIMD_INLINE batch_bool<T, A> load_unaligned (bool const * mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
576578 {
577- template <class T >
578- XSIMD_INLINE __m256i load_bool_avx2 (bool const * mem) noexcept
579+ XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
580+ {
581+ return { _mm256_sub_epi8 (_mm256_set1_epi8 (0 ), _mm256_loadu_si256 ((__m256i const *)mem)) };
582+ }
583+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
584+ {
585+ auto bpack = _mm_loadu_si128 ((__m128i const *)mem);
586+ return { _mm256_sub_epi16 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi16 (bpack)) };
587+ }
588+ // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
589+ // GCC/Clang/MSVC will turn it into the correct load.
590+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
579591 {
580- XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
581- {
582- return _mm256_sub_epi8 (_mm256_set1_epi8 (0 ), _mm256_loadu_si256 ((__m256i const *)mem));
583- }
584- else XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
585- {
586- auto bpack = _mm_loadu_si128 ((__m128i const *)mem);
587- return _mm256_sub_epi16 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi16 (bpack));
588- }
589- // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
590- // GCC/Clang/MSVC will turn it into the correct load.
591- else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
592- {
593592#if defined(__x86_64__)
594- uint64_t tmp;
595- memcpy (&tmp, mem, sizeof (tmp));
596- auto val = _mm_cvtsi64_si128 (tmp);
593+ uint64_t tmp;
594+ memcpy (&tmp, mem, sizeof (tmp));
595+ auto val = _mm_cvtsi64_si128 (tmp);
597596#else
598- __m128i val;
599- memcpy (&val, mem, sizeof (uint64_t ));
597+ __m128i val;
598+ memcpy (&val, mem, sizeof (uint64_t ));
600599#endif
601- return _mm256_sub_epi32 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi32 (val));
602- }
603- else XSIMD_IF_CONSTEXPR (sizeof (T) == 8 )
604- {
605- uint32_t tmp;
606- memcpy (&tmp, mem, sizeof (tmp));
607- return _mm256_sub_epi64 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi64 (_mm_cvtsi32_si128 (tmp)));
608- }
609- else
610- {
611- assert (false && " unsupported arch/op combination" );
612- return __m256i {};
613- }
600+ return { _mm256_sub_epi32 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi32 (val)) };
601+ }
602+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 8 )
603+ {
604+ uint32_t tmp;
605+ memcpy (&tmp, mem, sizeof (tmp));
606+ return { _mm256_sub_epi64 (_mm256_set1_epi8 (0 ), _mm256_cvtepu8_epi64 (_mm_cvtsi32_si128 (tmp))) };
607+ }
608+ else
609+ {
610+ assert (false && " unsupported arch/op combination" );
611+ return {};
614612 }
615- }
616-
617- template <class T , class A , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
618- XSIMD_INLINE batch_bool<T, A> load_unaligned (bool const * mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
619- {
620- return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
621613 }
622614
623615 template <class A >
624- XSIMD_INLINE batch_bool<float , A> load_unaligned (bool const * mem, batch_bool<float , A>, requires_arch<avx2>) noexcept
616+ XSIMD_INLINE batch_bool<float , A> load_unaligned (bool const * mem, batch_bool<float , A>, requires_arch<avx2> r ) noexcept
625617 {
626- return batch_bool<float , A>( _mm256_castsi256_ps (detail::load_bool_avx2< float >(mem))) ;
618+ return { _mm256_castsi256_ps ( load_unaligned (mem, batch_bool<uint32_t , A> {}, r). data ) } ;
627619 }
628620
629621 template <class A >
630- XSIMD_INLINE batch_bool<double , A> load_unaligned (bool const * mem, batch_bool<double , A>, requires_arch<avx2>) noexcept
622+ XSIMD_INLINE batch_bool<double , A> load_unaligned (bool const * mem, batch_bool<double , A>, requires_arch<avx2> r ) noexcept
631623 {
632- return batch_bool<double , A>( _mm256_castsi256_pd (detail::load_bool_avx2< double >(mem))) ;
624+ return { _mm256_castsi256_pd ( load_unaligned (mem, batch_bool<uint64_t , A> {}, r). data ) } ;
633625 }
634626
635627 // mask
0 commit comments