@@ -72,123 +72,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
     FLOAT_V_T_M1 v_res;
     size_t vlmax = VSETVL_MAX_M1();
 
-#ifndef RISCV_0p10_INTRINSICS
-    FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
-    FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
-    FLOAT *a_ptrs[4], *y_ptrs[4];
-#endif
-
     if(inc_x == 1){
-#ifndef RISCV_0p10_INTRINSICS
-        BLASLONG anr = n - n % 4;
-        for(; i < anr; i += 4) {
-            gvl = VSETVL(m);
-            j = 0;
-            for(int l = 0; l < 4; l++) {
-                a_ptrs[l] = a + (i + l) * lda;
-                y_ptrs[l] = y + (i + l) * inc_y;
-            }
-            vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
-            vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
-            vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
-            vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
-            vr0 = VFMVVF_FLOAT(0.0, gvl);
-            vr1 = VFMVVF_FLOAT(0.0, gvl);
-            vr2 = VFMVVF_FLOAT(0.0, gvl);
-            vr3 = VFMVVF_FLOAT(0.0, gvl);
-            for(k = 0; k < m / gvl; k++) {
-                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
-                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
-                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
-                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
 
-                vx = VLEV_FLOAT(x + j, gvl);
-                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
-                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
-                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
-                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
-                // Floating-point addition is not associative: (a + b) + c ≠ a + (b + c),
-                // so each strip must be multiplied and reduced inside the loop body.
-                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
-                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
-                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
-                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
-                j += gvl;
-            }
-            if(j < m) {
-                gvl = VSETVL(m - j);
-                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
-                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
-                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
-                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
-
-                vx = VLEV_FLOAT(x + j, gvl);
-                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
-                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
-                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
-                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
-                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
-                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
-                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
-                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
-            }
-            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
-            *y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
-            *y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
-            *y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
-        }
-        // deal with the tail
-        for(; i < n; i++) {
-            v_res = VFMVVF_FLOAT_M1(0, vlmax);
+        for(i = 0; i < n; i++){
+            v_res = VFMVVF_FLOAT_M1(0, 1);
             gvl = VSETVL(m);
             j = 0;
-            a_ptrs[0] = a + i * lda;
-            y_ptrs[0] = y + i * inc_y;
-            vr0 = VFMVVF_FLOAT(0, gvl);
-            for(k = 0; k < m / gvl; k++) {
-                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
-                vx = VLEV_FLOAT(x + j, gvl);
-                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
-                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
+            vr = VFMVVF_FLOAT(0, gvl);
+            for(k = 0; k < m/gvl; k++){
+                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                vx = VLEV_FLOAT(&x[j], gvl);
+                vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
+                v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
                 j += gvl;
             }
-            if(j < m) {
-                gvl = VSETVL(m - j);
-                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
-                vx = VLEV_FLOAT(x + j, gvl);
-                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
-                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
+            if(j < m){
+                gvl = VSETVL(m - j);
+                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                vx = VLEV_FLOAT(&x[j], gvl);
+                vr = VFMULVV_FLOAT(va, vx, gvl);
+                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
             }
-            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
-        }
-#else
-        for(i = 0; i < n; i++){
-            v_res = VFMVVF_FLOAT_M1(0, 1);
-            gvl = VSETVL(m);
-            j = 0;
-            vr = VFMVVF_FLOAT(0, gvl);
-            for(k = 0; k < m/gvl; k++){
-                va = VLEV_FLOAT(&a_ptr[j], gvl);
-                vx = VLEV_FLOAT(&x[j], gvl);
-                vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
-                v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
-                j += gvl;
-            }
-            if(j < m){
-                gvl = VSETVL(m - j);
-                va = VLEV_FLOAT(&a_ptr[j], gvl);
-                vx = VLEV_FLOAT(&x[j], gvl);
-                vr = VFMULVV_FLOAT(va, vx, gvl);
-                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
-            }
-            temp = (FLOAT)EXTRACT_FLOAT(v_res);
-            y[iy] += alpha * temp;
+            temp = (FLOAT)EXTRACT_FLOAT(v_res);
+            y[iy] += alpha * temp;
 
 
-            iy += inc_y;
-            a_ptr += lda;
-        }
-#endif
+            iy += inc_y;
+            a_ptr += lda;
+        }
     } else {
         BLASLONG stride_x = inc_x * sizeof(FLOAT);
         for(i = 0; i < n; i++){
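
Side note on the comments in both branches of the diff: the reduction stays inside the strip loop because floating-point addition is not associative, and the test suite compares against the scalar kernel's summation order. Below is a minimal scalar sketch of the two groupings being contrasted; the helper names and the lane-buffer trick are illustrative only and do not appear in the OpenBLAS source.

```c
#include <stdio.h>

/* Order the kernel keeps: multiply one strip, reduce it into the scalar
 * accumulator, then move on (reduction inside the loop body). */
static float dot_reduce_per_strip(const float *a, const float *x, int m, int strip)
{
    float acc = 0.0f;
    for (int j = 0; j < m; j += strip) {
        int end = (j + strip < m) ? (j + strip) : m;
        float partial = 0.0f;
        for (int k = j; k < end; k++)
            partial += a[k] * x[k];
        acc += partial;                /* reduce every strip immediately */
    }
    return acc;
}

/* The rejected order (scalar analogue of vfmacc plus one final reduction):
 * accumulate lane-wise across strips, reduce once after the loop. */
static float dot_reduce_once(const float *a, const float *x, int m, int strip)
{
    float lane[64] = {0.0f};           /* stands in for a vector register; assumes strip <= 64 */
    for (int j = 0; j < m; j += strip) {
        int end = (j + strip < m) ? (j + strip) : m;
        for (int k = j; k < end; k++)
            lane[k - j] += a[k] * x[k];
    }
    float acc = 0.0f;
    for (int l = 0; l < strip; l++)
        acc += lane[l];                /* single reduction outside the loop */
    return acc;
}

int main(void)
{
    float a[6] = {1e8f, 1.0f, 1.0f, 1.0f, -1e8f, 1.0f};
    float x[6] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    /* Same data, different grouping of the additions: prints "0 vs 4"
     * in IEEE-754 single precision (the exact sum is 4). */
    printf("%g vs %g\n", dot_reduce_per_strip(a, x, 6, 4),
                         dot_reduce_once(a, x, 6, 4));
    return 0;
}
```

Neither grouping is more correct than the other; the kernel just has to reproduce the reference path's order.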
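For readers without the macro definitions at hand, the surviving loop is plain RVV strip-mining: set the vector length for each pass, multiply, reduce, and let a shorter final pass absorb the tail. Here is a sketch written directly against the ratified __riscv_-prefixed intrinsics; the expansions of VLEV_FLOAT, VFMULVV_FLOAT, and VFREDSUM_FLOAT shown here are assumptions, since the real wrappers also cover the RISCV_0p10_INTRINSICS naming.

```c
#include <riscv_vector.h>

/* Single-precision dot product in the same strip-mined style as the kernel.
 * The upstream code uses a fixed-gvl main loop plus an explicit tail pass;
 * folding both into one loop, as here, walks the same strips under the
 * common vl = min(avl, VLMAX) policy. */
float dot_rvv(const float *a, const float *x, size_t m)
{
    vfloat32m1_t v_res = __riscv_vfmv_v_f_f32m1(0.0f, 1); /* accumulator in element 0 */
    size_t j = 0;
    while (j < m) {
        size_t vl = __riscv_vsetvl_e32m1(m - j);          /* strip length for this pass */
        vfloat32m1_t va = __riscv_vle32_v_f32m1(&a[j], vl);
        vfloat32m1_t vx = __riscv_vle32_v_f32m1(&x[j], vl);
        vfloat32m1_t vr = __riscv_vfmul_vv_f32m1(va, vx, vl);
        /* reduce each strip immediately, as the kernel does */
        v_res = __riscv_vfredusum_vs_f32m1_f32m1(vr, v_res, vl);
        j += vl;
    }
    return __riscv_vfmv_f_s_f32m1_f32(v_res);
}
```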