@@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
 #[cfg_attr(test, assert_instr(vphaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
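+        // `even`/`odd` gather the even- and odd-indexed elements of each 128-bit
+        // lane (indices 16..31 select from `b`); their sum is the `vphaddw` result.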
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
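+        // Same even/odd split per 128-bit lane (indices 8..15 select from `b`);
+        // the element-wise sum matches `vphaddd`.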
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
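+        // Same even/odd split as `_mm256_hadd_epi16`; subtracting `odd` from
+        // `even` gives the pairwise differences of `vphsubw`.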
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
@@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
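+        // Same even/odd split as `_mm256_hadd_epi32`; `even - odd` matches `vphsubd`.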
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m25
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
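+        // Sign-extend both inputs to i32 and multiply element-wise, then add each
+        // even-indexed product to its odd-indexed neighbour, as `vpmaddwd` does.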
+        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.phadd.w"]
-    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phadd.d"]
-    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phadd.sw"]
     fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.w"]
-    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.d"]
-    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmadd.wd"]
-    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.maskload.d"]