11//! Code taken from the `packed_simd` crate.
22//! Run this code with `cargo test --example dot_product`.
33
4- #![ feature( array_chunks) ]
5- #![ feature( slice_as_chunks) ]
64// Add these imports to use the stdsimd library
75#![ feature( portable_simd) ]
86use core_simd:: simd:: prelude:: * ;
@@ -33,7 +31,7 @@ pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
3331}
3432
3533// We now move on to the SIMD implementations: notice the following constructs:
36- // `array_chunks ::<4>`: mapping this over the vector will let use construct SIMD vectors
34+ // `as_chunks ::<4>`: mapping this over the vector will let us construct SIMD vectors
3735// `f32x4::from_array`: construct the SIMD vector from a fixed-size array (`[f32; 4]`)
3836// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
3937// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
@@ -42,9 +40,11 @@ pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
4240pub fn dot_prod_simd_0 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
4341 assert_eq ! ( a. len( ) , b. len( ) ) ;
4442 // TODO handle remainder when a.len() % 4 != 0
45- a. array_chunks :: < 4 > ( )
43+ a. as_chunks :: < 4 > ( )
44+ . 0
45+ . iter ( )
4646 . map ( |& a| f32x4:: from_array ( a) )
47- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
47+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
4848 . map ( |( a, b) | ( a * b) . reduce_sum ( ) )
4949 . sum ( )
5050}
@@ -60,9 +60,11 @@ pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
6060pub fn dot_prod_simd_1 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
6161 assert_eq ! ( a. len( ) , b. len( ) ) ;
6262 // TODO handle remainder when a.len() % 4 != 0
63- a. array_chunks :: < 4 > ( )
63+ a. as_chunks :: < 4 > ( )
64+ . 0
65+ . iter ( )
6466 . map ( |& a| f32x4:: from_array ( a) )
65- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
67+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
6668 . fold ( f32x4:: splat ( 0.0 ) , |acc, zipped| acc + zipped. 0 * zipped. 1 )
6769 . reduce_sum ( )
6870}
@@ -74,9 +76,11 @@ pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
7476 assert_eq ! ( a. len( ) , b. len( ) ) ;
7577 // TODO handle remainder when a.len() % 4 != 0
7678 let mut res = f32x4:: splat ( 0.0 ) ;
77- a. array_chunks :: < 4 > ( )
79+ a. as_chunks :: < 4 > ( )
80+ . 0
81+ . iter ( )
7882 . map ( |& a| f32x4:: from_array ( a) )
79- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
83+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
8084 . for_each ( |( a, b) | {
8185 res = a. mul_add ( b, res) ;
8286 } ) ;
@@ -113,9 +117,11 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
113117// next example.
114118pub fn dot_prod_simd_4 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
115119 let mut sum = a
116- . array_chunks :: < 4 > ( )
120+ . as_chunks :: < 4 > ( )
121+ . 0
122+ . iter ( )
117123 . map ( |& a| f32x4:: from_array ( a) )
118- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
124+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
119125 . map ( |( a, b) | a * b)
120126 . fold ( f32x4:: splat ( 0.0 ) , std:: ops:: Add :: add)
121127 . reduce_sum ( ) ;
@@ -131,9 +137,11 @@ pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
131137// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
132138// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
133139pub fn dot_prod_simd_5 ( a : & [ f32 ] , b : & [ f32 ] ) -> f32 {
134- a. array_chunks :: < 4 > ( )
140+ a. as_chunks :: < 4 > ( )
141+ . 0
142+ . iter ( )
135143 . map ( |& a| f32x4:: from_array ( a) )
136- . zip ( b. array_chunks :: < 4 > ( ) . map ( |& b| f32x4:: from_array ( b) ) )
144+ . zip ( b. as_chunks :: < 4 > ( ) . 0 . iter ( ) . map ( |& b| f32x4:: from_array ( b) ) )
137145 . fold ( f32x4:: splat ( 0. ) , |acc, ( a, b) | a. mul_add ( b, acc) )
138146 . reduce_sum ( )
139147}
0 commit comments