1919//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
2020
2121use crate :: core_arch:: x86:: * ;
22- use crate :: intrinsics:: simd:: { simd_fma, simd_insert, simd_neg} ;
22+ use crate :: intrinsics:: simd:: { simd_fma, simd_insert, simd_neg, simd_shuffle } ;
2323use crate :: intrinsics:: { fmaf32, fmaf64} ;
2424
2525#[ cfg( test) ]
@@ -119,7 +119,9 @@ pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
119119#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
120120#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
121121pub unsafe fn _mm_fmaddsub_pd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d {
122- vfmaddsubpd ( a, b, c)
122+ let add = simd_fma ( a, b, c) ;
123+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
124+ simd_shuffle ! ( add, sub, [ 2 , 1 ] )
123125}
124126
125127/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -132,7 +134,9 @@ pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
132134#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
133135#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
134136pub unsafe fn _mm256_fmaddsub_pd ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d {
135- vfmaddsubpd256 ( a, b, c)
137+ let add = simd_fma ( a, b, c) ;
138+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
139+ simd_shuffle ! ( add, sub, [ 4 , 1 , 6 , 3 ] )
136140}
137141
138142/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -145,7 +149,9 @@ pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d
145149#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
146150#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
147151pub unsafe fn _mm_fmaddsub_ps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 {
148- vfmaddsubps ( a, b, c)
152+ let add = simd_fma ( a, b, c) ;
153+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
154+ simd_shuffle ! ( add, sub, [ 4 , 1 , 6 , 3 ] )
149155}
150156
151157/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -158,7 +164,9 @@ pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
158164#[ cfg_attr( test, assert_instr( vfmaddsub) ) ]
159165#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
160166pub unsafe fn _mm256_fmaddsub_ps ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 {
161- vfmaddsubps256 ( a, b, c)
167+ let add = simd_fma ( a, b, c) ;
168+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
169+ simd_shuffle ! ( add, sub, [ 8 , 1 , 10 , 3 , 12 , 5 , 14 , 7 ] )
162170}
163171
164172/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -255,7 +263,9 @@ pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
255263#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
256264#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
257265pub unsafe fn _mm_fmsubadd_pd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d {
258- vfmsubaddpd ( a, b, c)
266+ let add = simd_fma ( a, b, c) ;
267+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
268+ simd_shuffle ! ( add, sub, [ 0 , 3 ] )
259269}
260270
261271/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -268,7 +278,9 @@ pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
268278#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
269279#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
270280pub unsafe fn _mm256_fmsubadd_pd ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d {
271- vfmsubaddpd256 ( a, b, c)
281+ let add = simd_fma ( a, b, c) ;
282+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
283+ simd_shuffle ! ( add, sub, [ 0 , 5 , 2 , 7 ] )
272284}
273285
274286/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -281,7 +293,9 @@ pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d
281293#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
282294#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
283295pub unsafe fn _mm_fmsubadd_ps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 {
284- vfmsubaddps ( a, b, c)
296+ let add = simd_fma ( a, b, c) ;
297+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
298+ simd_shuffle ! ( add, sub, [ 0 , 5 , 2 , 7 ] )
285299}
286300
287301/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
@@ -294,7 +308,9 @@ pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
294308#[ cfg_attr( test, assert_instr( vfmsubadd) ) ]
295309#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
296310pub unsafe fn _mm256_fmsubadd_ps ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 {
297- vfmsubaddps256 ( a, b, c)
311+ let add = simd_fma ( a, b, c) ;
312+ let sub = simd_fma ( a, b, simd_neg ( c) ) ;
313+ simd_shuffle ! ( add, sub, [ 0 , 9 , 2 , 11 , 4 , 13 , 6 , 15 ] )
298314}
299315
300316/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
@@ -471,26 +487,6 @@ pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 {
471487 )
472488}
473489
474- #[ allow( improper_ctypes) ]
475- extern "C" {
476- #[ link_name = "llvm.x86.fma.vfmaddsub.pd" ]
477- fn vfmaddsubpd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d ;
478- #[ link_name = "llvm.x86.fma.vfmaddsub.pd.256" ]
479- fn vfmaddsubpd256 ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d ;
480- #[ link_name = "llvm.x86.fma.vfmaddsub.ps" ]
481- fn vfmaddsubps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 ;
482- #[ link_name = "llvm.x86.fma.vfmaddsub.ps.256" ]
483- fn vfmaddsubps256 ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 ;
484- #[ link_name = "llvm.x86.fma.vfmsubadd.pd" ]
485- fn vfmsubaddpd ( a : __m128d , b : __m128d , c : __m128d ) -> __m128d ;
486- #[ link_name = "llvm.x86.fma.vfmsubadd.pd.256" ]
487- fn vfmsubaddpd256 ( a : __m256d , b : __m256d , c : __m256d ) -> __m256d ;
488- #[ link_name = "llvm.x86.fma.vfmsubadd.ps" ]
489- fn vfmsubaddps ( a : __m128 , b : __m128 , c : __m128 ) -> __m128 ;
490- #[ link_name = "llvm.x86.fma.vfmsubadd.ps.256" ]
491- fn vfmsubaddps256 ( a : __m256 , b : __m256 , c : __m256 ) -> __m256 ;
492- }
493-
494490#[ cfg( test) ]
495491mod tests {
496492
0 commit comments