Skip to content

Commit 1bd8e19

Browse files
committed
Optimize AVX masked loads when the mask selects exactly one 128-bit half
1 parent 53da643 commit 1bd8e19

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,18 @@ namespace xsimd
873873
aligned_mode,
874874
requires_arch<avx>) noexcept
875875
{
876+
int mask_bits = _mm256_movemask_ps(mask.data);
877+
if(mask_bits == 0x0F)
878+
{
879+
__m128i mask128 = _mm_castps_si128(_mm256_castps256_ps128(mask.data));
880+
return _mm256_zextps128_ps256(_mm_maskload_ps(mem, mask128));
881+
}
882+
else if(mask_bits == 0xF0)
883+
{
884+
__m128i mask128 = _mm_castps_si128(_mm256_extractf128_ps(mask.data, 1));
885+
__m128 hi = _mm_maskload_ps(mem + 4, mask128);
886+
return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
887+
}
876888
return _mm256_maskload_ps(mem, _mm256_castps_si256(mask.data));
877889
}
878890

@@ -883,6 +895,18 @@ namespace xsimd
883895
unaligned_mode,
884896
requires_arch<avx>) noexcept
885897
{
898+
int mask_bits = _mm256_movemask_ps(mask.data);
899+
if(mask_bits == 0x0F)
900+
{
901+
__m128i mask128 = _mm_castps_si128(_mm256_castps256_ps128(mask.data));
902+
return _mm256_zextps128_ps256(_mm_maskload_ps(mem, mask128));
903+
}
904+
else if(mask_bits == 0xF0)
905+
{
906+
__m128i mask128 = _mm_castps_si128(_mm256_extractf128_ps(mask.data, 1));
907+
__m128 hi = _mm_maskload_ps(mem + 4, mask128);
908+
return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
909+
}
886910
return _mm256_maskload_ps(mem, _mm256_castps_si256(mask.data));
887911
}
888912

@@ -939,6 +963,18 @@ namespace xsimd
939963
aligned_mode,
940964
requires_arch<avx>) noexcept
941965
{
966+
int mask_bits = _mm256_movemask_pd(mask.data);
967+
if(mask_bits == 0x3)
968+
{
969+
__m128i mask128 = _mm_castpd_si128(_mm256_castpd256_pd128(mask.data));
970+
return _mm256_zextpd128_pd256(_mm_maskload_pd(mem, mask128));
971+
}
972+
else if(mask_bits == 0xC)
973+
{
974+
__m128i mask128 = _mm_castpd_si128(_mm256_extractf128_pd(mask.data, 1));
975+
__m128d hi = _mm_maskload_pd(mem + 2, mask128);
976+
return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
977+
}
942978
return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask.data));
943979
}
944980

@@ -949,6 +985,18 @@ namespace xsimd
949985
unaligned_mode,
950986
requires_arch<avx>) noexcept
951987
{
988+
int mask_bits = _mm256_movemask_pd(mask.data);
989+
if(mask_bits == 0x3)
990+
{
991+
__m128i mask128 = _mm_castpd_si128(_mm256_castpd256_pd128(mask.data));
992+
return _mm256_zextpd128_pd256(_mm_maskload_pd(mem, mask128));
993+
}
994+
else if(mask_bits == 0xC)
995+
{
996+
__m128i mask128 = _mm_castpd_si128(_mm256_extractf128_pd(mask.data, 1));
997+
__m128d hi = _mm_maskload_pd(mem + 2, mask128);
998+
return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
999+
}
9521000
return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask.data));
9531001
}
9541002

0 commit comments

Comments
 (0)