Skip to content

Commit 53d7452

Browse files
committed
riscv: gemv_t_vector.c optimize
1 parent 2a95564 commit 53d7452

File tree

1 file changed

+19
-108
lines changed

1 file changed

+19
-108
lines changed

kernel/riscv64/gemv_t_vector.c

Lines changed: 19 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -72,123 +72,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
72 72
FLOAT_V_T_M1 v_res;
73 73
size_t vlmax = VSETVL_MAX_M1();
74 74

75-
#ifndef RISCV_0p10_INTRINSICS
76-
FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
77-
FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
78-
FLOAT *a_ptrs[4], *y_ptrs[4];
79-
#endif
80-
81 75
if(inc_x == 1){
82-
#ifndef RISCV_0p10_INTRINSICS
83-
BLASLONG anr = n - n % 4;
84-
for (; i < anr; i += 4) {
85-
gvl = VSETVL(m);
86-
j = 0;
87-
for (int l = 0; l < 4; l++) {
88-
a_ptrs[l] = a + (i + l) * lda;
89-
y_ptrs[l] = y + (i + l) * inc_y;
90-
}
91-
vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
92-
vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
93-
vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
94-
vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
95-
vr0 = VFMVVF_FLOAT(0.0, gvl);
96-
vr1 = VFMVVF_FLOAT(0.0, gvl);
97-
vr2 = VFMVVF_FLOAT(0.0, gvl);
98-
vr3 = VFMVVF_FLOAT(0.0, gvl);
99-
for (k = 0; k < m / gvl; k++) {
100-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
101-
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
102-
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
103-
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
104 76

105-
vx = VLEV_FLOAT(x + j, gvl);
106-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
107-
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
108-
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
109-
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
110-
// Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c),
111-
// so piecewise multiplication and reduction must be performed inside the loop body.
112-
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
113-
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
114-
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
115-
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
116-
j += gvl;
117-
}
118-
if (j < m) {
119-
gvl = VSETVL(m - j);
120-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
121-
va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
122-
va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
123-
va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
124-
125-
vx = VLEV_FLOAT(x + j, gvl);
126-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
127-
vr1 = VFMULVV_FLOAT(va1, vx, gvl);
128-
vr2 = VFMULVV_FLOAT(va2, vx, gvl);
129-
vr3 = VFMULVV_FLOAT(va3, vx, gvl);
130-
vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
131-
vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
132-
vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
133-
vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
134-
}
135-
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
136-
*y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
137-
*y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
138-
*y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
139-
}
140-
// deal with the tail
141-
for (; i < n; i++) {
142-
v_res = VFMVVF_FLOAT_M1(0, vlmax);
77+
for(i = 0; i < n; i++){
78+
v_res = VFMVVF_FLOAT_M1(0, 1);
143 79
gvl = VSETVL(m);
144 80
j = 0;
145-
a_ptrs[0] = a + i * lda;
146-
y_ptrs[0] = y + i * inc_y;
147-
vr0 = VFMVVF_FLOAT(0, gvl);
148-
for (k = 0; k < m / gvl; k++) {
149-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
150-
vx = VLEV_FLOAT(x + j, gvl);
151-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
152-
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
81+
vr = VFMVVF_FLOAT(0, gvl);
82+
for(k = 0; k < m/gvl; k++){
83+
va = VLEV_FLOAT(&a_ptr[j], gvl);
84+
vx = VLEV_FLOAT(&x[j], gvl);
85+
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
86+
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
153 87
j += gvl;
154 88
}
155-
if (j < m) {
156-
gvl = VSETVL(m - j);
157-
va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
158-
vx = VLEV_FLOAT(x + j, gvl);
159-
vr0 = VFMULVV_FLOAT(va0, vx, gvl);
160-
v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
89+
if(j < m){
90+
gvl = VSETVL(m-j);
91+
va = VLEV_FLOAT(&a_ptr[j], gvl);
92+
vx = VLEV_FLOAT(&x[j], gvl);
93+
vr = VFMULVV_FLOAT(va, vx, gvl);
94+
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
161 95
}
162-
*y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
163-
}
164-
#else
165-
for(i = 0; i < n; i++){
166-
v_res = VFMVVF_FLOAT_M1(0, 1);
167-
gvl = VSETVL(m);
168-
j = 0;
169-
vr = VFMVVF_FLOAT(0, gvl);
170-
for(k = 0; k < m/gvl; k++){
171-
va = VLEV_FLOAT(&a_ptr[j], gvl);
172-
vx = VLEV_FLOAT(&x[j], gvl);
173-
vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
174-
v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
175-
j += gvl;
176-
}
177-
if(j < m){
178-
gvl = VSETVL(m-j);
179-
va = VLEV_FLOAT(&a_ptr[j], gvl);
180-
vx = VLEV_FLOAT(&x[j], gvl);
181-
vr = VFMULVV_FLOAT(va, vx, gvl);
182-
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
183-
}
184-
temp = (FLOAT)EXTRACT_FLOAT(v_res);
185-
y[iy] += alpha * temp;
96+
temp = (FLOAT)EXTRACT_FLOAT(v_res);
97+
y[iy] += alpha * temp;
186 98

187 99

188-
iy += inc_y;
189-
a_ptr += lda;
190-
}
191-
#endif
100+
iy += inc_y;
101+
a_ptr += lda;
102+
}
192 103
} else {
193 104
BLASLONG stride_x = inc_x * sizeof(FLOAT);
194 105
for(i = 0; i < n; i++){

0 commit comments

Comments
 (0)