@@ -891,7 +891,21 @@ pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
 #[cfg_attr(test, assert_instr(vphaddw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
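+        // `even`/`odd` gather the even- and odd-indexed elements of each 128-bit
+        // lane (indices 16..31 select from `b`); their sum is the `vphaddw` result.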
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -902,7 +916,13 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphaddd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
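+        // Same even/odd split per 128-bit lane (indices 8..15 select from `b`);
+        // the element-wise sum matches `vphaddd`.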
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -925,7 +945,21 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
+    let a = a.as_i16x16();
+    let b = b.as_i16x16();
+    unsafe {
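+        // Same even/odd split as `_mm256_hadd_epi16`; subtracting `odd` from
+        // `even` gives the pairwise differences of `vphsubw`.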
+        let even: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
+        );
+        let odd: i16x16 = simd_shuffle!(
+            a,
+            b,
+            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
+        );
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
@@ -936,7 +970,13 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vphsubd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    unsafe {
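+        // Same even/odd split as `_mm256_hadd_epi32`; `even - odd` matches `vphsubd`.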
+        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
+        simd_sub(even, odd).as_m256i()
+    }
 }
 
 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -1714,7 +1754,12 @@ pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m25
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
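+        // Sign-extend both inputs to i32 and multiply element-wise, then add each
+        // even-indexed product to its odd-indexed neighbour, as `vpmaddwd` does.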
+        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3594,20 +3639,10 @@ pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.phadd.w"]
-    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phadd.d"]
-    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phadd.sw"]
     fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.w"]
-    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.phsub.d"]
-    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.pmadd.wd"]
-    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.maskload.d"]