@@ -873,6 +873,18 @@ namespace xsimd
873873 aligned_mode,
874874 requires_arch<avx>) noexcept
875875 {
876+ int mask_bits = _mm256_movemask_ps (mask.data );
877+ if (mask_bits == 0x0F )
878+ {
879+ __m128i mask128 = _mm_castps_si128 (_mm256_castps256_ps128 (mask.data ));
880+ return _mm256_zextps128_ps256 (_mm_maskload_ps (mem, mask128));
881+ }
882+ else if (mask_bits == 0xF0 )
883+ {
884+ __m128i mask128 = _mm_castps_si128 (_mm256_extractf128_ps (mask.data , 1 ));
885+ __m128 hi = _mm_maskload_ps (mem + 4 , mask128);
886+ return _mm256_insertf128_ps (_mm256_setzero_ps (), hi, 1 );
887+ }
876888 return _mm256_maskload_ps (mem, _mm256_castps_si256 (mask.data ));
877889 }
878890
@@ -883,6 +895,18 @@ namespace xsimd
883895 unaligned_mode,
884896 requires_arch<avx>) noexcept
885897 {
898+ int mask_bits = _mm256_movemask_ps (mask.data );
899+ if (mask_bits == 0x0F )
900+ {
901+ __m128i mask128 = _mm_castps_si128 (_mm256_castps256_ps128 (mask.data ));
902+ return _mm256_zextps128_ps256 (_mm_maskload_ps (mem, mask128));
903+ }
904+ else if (mask_bits == 0xF0 )
905+ {
906+ __m128i mask128 = _mm_castps_si128 (_mm256_extractf128_ps (mask.data , 1 ));
907+ __m128 hi = _mm_maskload_ps (mem + 4 , mask128);
908+ return _mm256_insertf128_ps (_mm256_setzero_ps (), hi, 1 );
909+ }
886910 return _mm256_maskload_ps (mem, _mm256_castps_si256 (mask.data ));
887911 }
888912
@@ -939,6 +963,18 @@ namespace xsimd
939963 aligned_mode,
940964 requires_arch<avx>) noexcept
941965 {
966+ int mask_bits = _mm256_movemask_pd (mask.data );
967+ if (mask_bits == 0x3 )
968+ {
969+ __m128i mask128 = _mm_castpd_si128 (_mm256_castpd256_pd128 (mask.data ));
970+ return _mm256_zextpd128_pd256 (_mm_maskload_pd (mem, mask128));
971+ }
972+ else if (mask_bits == 0xC )
973+ {
974+ __m128i mask128 = _mm_castpd_si128 (_mm256_extractf128_pd (mask.data , 1 ));
975+ __m128d hi = _mm_maskload_pd (mem + 2 , mask128);
976+ return _mm256_insertf128_pd (_mm256_setzero_pd (), hi, 1 );
977+ }
942978 return _mm256_maskload_pd (mem, _mm256_castpd_si256 (mask.data ));
943979 }
944980
@@ -949,6 +985,18 @@ namespace xsimd
949985 unaligned_mode,
950986 requires_arch<avx>) noexcept
951987 {
988+ int mask_bits = _mm256_movemask_pd (mask.data );
989+ if (mask_bits == 0x3 )
990+ {
991+ __m128i mask128 = _mm_castpd_si128 (_mm256_castpd256_pd128 (mask.data ));
992+ return _mm256_zextpd128_pd256 (_mm_maskload_pd (mem, mask128));
993+ }
994+ else if (mask_bits == 0xC )
995+ {
996+ __m128i mask128 = _mm_castpd_si128 (_mm256_extractf128_pd (mask.data , 1 ));
997+ __m128d hi = _mm_maskload_pd (mem + 2 , mask128);
998+ return _mm256_insertf128_pd (_mm256_setzero_pd (), hi, 1 );
999+ }
9521000 return _mm256_maskload_pd (mem, _mm256_castpd_si256 (mask.data ));
9531001 }
9541002
0 commit comments