Skip to content

Commit 43fdff7

Browse files
authored
Merge pull request #5444 from yuanjia111/develop
riscv: Remove the manual unrolling in gemv_t_vector.c to improve performance
2 parents 2a95564 + 826cb45 commit 43fdff7

File tree

1 file changed

+19
-111
lines changed

1 file changed

+19
-111
lines changed

kernel/riscv64/gemv_t_vector.c

Lines changed: 19 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#include "common.h"
2929
#if !defined(DOUBLE)
3030
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
31-
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e32m1)
3231
#define FLOAT_V_T vfloat32m8_t
3332
#define FLOAT_V_T_M1 vfloat32m1_t
3433
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
@@ -44,7 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4443
#define xint_t int
4544
#else
4645
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
47-
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e64m1)
4846
#define FLOAT_V_T vfloat64m8_t
4947
#define FLOAT_V_T_M1 vfloat64m1_t
5048
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
@@ -70,125 +68,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
7068
FLOAT_V_T va, vr, vx;
7169
unsigned int gvl = 0;
7270
FLOAT_V_T_M1 v_res;
73-
size_t vlmax = VSETVL_MAX_M1();
74-
75-
#ifndef RISCV_0p10_INTRINSICS
76-
FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
77-
FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
78-
FLOAT *a_ptrs[4], *y_ptrs[4];
79-
#endif
8071

8172
if(inc_x == 1){
82-
#ifndef RISCV_0p10_INTRINSICS
83-
BLASLONG anr = n - n % 4;
84-
for (; i < anr; i += 4) {
85-
gvl = VSETVL(m);
86-
j = 0;
87-
for (int l = 0; l < 4; l++) {
88-
a_ptrs[l] = a + (i + l) * lda;
89-
y_ptrs[l] = y + (i + l) * inc_y;
90-
}
91-
vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
92-
vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
93-
vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
94-
vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
95-
vr0 = VFMVVF_FLOAT(0.0, gvl);
96-
vr1 = VFMVVF_FLOAT(0.0, gvl);
97-
vr2 = VFMVVF_FLOAT(0.0, gvl);
98-
vr3 = VFMVVF_FLOAT(0.0, gvl);
99-
for (k = 0; k < m / gvl; k++) {
100-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
101-
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
102-
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
103-
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
10473

105-
vx = VLEV_FLOAT(x + j, gvl);
106-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
107-
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
108-
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
109-
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
110-
// Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c),
111-
// so piecewise multiplication and reduction must be performed inside the loop body.
112-
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
113-
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
114-
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
115-
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
116-
j += gvl;
117-
}
118-
if (j < m) {
119-
gvl = VSETVL(m - j);
120-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
121-
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
122-
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
123-
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
124-
125-
vx = VLEV_FLOAT(x + j, gvl);
126-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
127-
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
128-
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
129-
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
130-
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
131-
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
132-
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
133-
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
134-
}
135-
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
136-
*y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
137-
*y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
138-
*y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
139-
}
140-
// deal with the tail
141-
for (; i < n; i++) {
142-
v_res = VFMVVF_FLOAT_M1(0, vlmax);
74+
for(i = 0; i < n; i++){
75+
v_res = VFMVVF_FLOAT_M1(0, 1);
14376
gvl = VSETVL(m);
14477
j = 0;
145-
a_ptrs[0] = a + i * lda;
146-
y_ptrs[0] = y + i * inc_y;
147-
vr0 = VFMVVF_FLOAT(0, gvl);
148-
for (k = 0; k < m / gvl; k++) {
149-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
150-
vx = VLEV_FLOAT(x + j, gvl);
151-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
152-
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
78+
vr = VFMVVF_FLOAT(0, gvl);
79+
for(k = 0; k < m/gvl; k++){
80+
va = VLEV_FLOAT(&a_ptr[j], gvl);
81+
vx = VLEV_FLOAT(&x[j], gvl);
82+
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
83+
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
15384
j += gvl;
15485
}
155-
if (j < m) {
156-
gvl = VSETVL(m - j);
157-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
158-
vx = VLEV_FLOAT(x + j, gvl);
159-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
160-
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
86+
if(j < m){
87+
gvl = VSETVL(m-j);
88+
va = VLEV_FLOAT(&a_ptr[j], gvl);
89+
vx = VLEV_FLOAT(&x[j], gvl);
90+
vr = VFMULVV_FLOAT(va, vx, gvl);
91+
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
16192
}
162-
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
163-
}
164-
#else
165-
for(i = 0; i < n; i++){
166-
v_res = VFMVVF_FLOAT_M1(0, 1);
167-
gvl = VSETVL(m);
168-
j = 0;
169-
vr = VFMVVF_FLOAT(0, gvl);
170-
for(k = 0; k < m/gvl; k++){
171-
va = VLEV_FLOAT(&a_ptr[j], gvl);
172-
vx = VLEV_FLOAT(&x[j], gvl);
173-
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
174-
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
175-
j += gvl;
176-
}
177-
if(j < m){
178-
gvl = VSETVL(m-j);
179-
va = VLEV_FLOAT(&a_ptr[j], gvl);
180-
vx = VLEV_FLOAT(&x[j], gvl);
181-
vr = VFMULVV_FLOAT(va, vx, gvl);
182-
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
183-
}
184-
temp = (FLOAT)EXTRACT_FLOAT(v_res);
185-
y[iy] += alpha * temp;
93+
temp = (FLOAT)EXTRACT_FLOAT(v_res);
94+
y[iy] += alpha * temp;
18695

18796

188-
iy += inc_y;
189-
a_ptr += lda;
190-
}
191-
#endif
97+
iy += inc_y;
98+
a_ptr += lda;
99+
}
192100
} else {
193101
BLASLONG stride_x = inc_x * sizeof(FLOAT);
194102
for(i = 0; i < n; i++){

0 commit comments

Comments
 (0)