Skip to content

Commit e5c8361

Browse files
authored
Merge pull request #5451 from ChipKerchner/fixRVVSHGEMM
Fix _Float16 casting issue and reduce LMUL for certain vector instructions from m2 to m1.
2 parents 79a1f38 + a4abf78 commit e5c8361

File tree

2 files changed

+175
-175
lines changed

2 files changed

+175
-175
lines changed

kernel/riscv64/shgemm_kernel_16x8_zvl256b.c

Lines changed: 46 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -295,22 +295,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
295295
BLASLONG bi = n_top * K;
296296

297297
for(BLASLONG k=0; k<K; k++) {
298-
result0+=(float)(A[ai+0]*B[bi+0]);
299-
result1+=(float)(A[ai+1]*B[bi+0]);
300-
result2+=(float)(A[ai+0]*B[bi+1]);
301-
result3+=(float)(A[ai+1]*B[bi+1]);
302-
result4+=(float)(A[ai+0]*B[bi+2]);
303-
result5+=(float)(A[ai+1]*B[bi+2]);
304-
result6+=(float)(A[ai+0]*B[bi+3]);
305-
result7+=(float)(A[ai+1]*B[bi+3]);
306-
result8+=(float)(A[ai+0]*B[bi+4]);
307-
result9+=(float)(A[ai+1]*B[bi+4]);
308-
result10+=(float)(A[ai+0]*B[bi+5]);
309-
result11+=(float)(A[ai+1]*B[bi+5]);
310-
result12+=(float)(A[ai+0]*B[bi+6]);
311-
result13+=(float)(A[ai+1]*B[bi+6]);
312-
result14+=(float)(A[ai+0]*B[bi+7]);
313-
result15+=(float)(A[ai+1]*B[bi+7]);
298+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
299+
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
300+
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
301+
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
302+
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
303+
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
304+
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
305+
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
306+
result8+=(float)(A[ai+0])*(float)(B[bi+4]);
307+
result9+=(float)(A[ai+1])*(float)(B[bi+4]);
308+
result10+=(float)(A[ai+0])*(float)(B[bi+5]);
309+
result11+=(float)(A[ai+1])*(float)(B[bi+5]);
310+
result12+=(float)(A[ai+0])*(float)(B[bi+6]);
311+
result13+=(float)(A[ai+1])*(float)(B[bi+6]);
312+
result14+=(float)(A[ai+0])*(float)(B[bi+7]);
313+
result15+=(float)(A[ai+1])*(float)(B[bi+7]);
314314
ai+=2;
315315
bi+=8;
316316
}
@@ -353,14 +353,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
353353
BLASLONG bi = n_top * K;
354354

355355
for(BLASLONG k=0; k<K; k++) {
356-
result0+=(float)(A[ai+0]*B[bi+0]);
357-
result1+=(float)(A[ai+0]*B[bi+1]);
358-
result2+=(float)(A[ai+0]*B[bi+2]);
359-
result3+=(float)(A[ai+0]*B[bi+3]);
360-
result4+=(float)(A[ai+0]*B[bi+4]);
361-
result5+=(float)(A[ai+0]*B[bi+5]);
362-
result6+=(float)(A[ai+0]*B[bi+6]);
363-
result7+=(float)(A[ai+0]*B[bi+7]);
356+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
357+
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
358+
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
359+
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
360+
result4+=(float)(A[ai+0])*(float)(B[bi+4]);
361+
result5+=(float)(A[ai+0])*(float)(B[bi+5]);
362+
result6+=(float)(A[ai+0])*(float)(B[bi+6]);
363+
result7+=(float)(A[ai+0])*(float)(B[bi+7]);
364364
ai+=1;
365365
bi+=8;
366366
}
@@ -569,14 +569,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
569569
BLASLONG bi = n_top * K;
570570

571571
for(BLASLONG k=0; k<K; k++) {
572-
result0+=(float)(A[ai+0]*B[bi+0]);
573-
result1+=(float)(A[ai+1]*B[bi+0]);
574-
result2+=(float)(A[ai+0]*B[bi+1]);
575-
result3+=(float)(A[ai+1]*B[bi+1]);
576-
result4+=(float)(A[ai+0]*B[bi+2]);
577-
result5+=(float)(A[ai+1]*B[bi+2]);
578-
result6+=(float)(A[ai+0]*B[bi+3]);
579-
result7+=(float)(A[ai+1]*B[bi+3]);
572+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
573+
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
574+
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
575+
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
576+
result4+=(float)(A[ai+0])*(float)(B[bi+2]);
577+
result5+=(float)(A[ai+1])*(float)(B[bi+2]);
578+
result6+=(float)(A[ai+0])*(float)(B[bi+3]);
579+
result7+=(float)(A[ai+1])*(float)(B[bi+3]);
580580
ai+=2;
581581
bi+=4;
582582
}
@@ -607,10 +607,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
607607
BLASLONG bi = n_top * K;
608608

609609
for(BLASLONG k=0; k<K; k++) {
610-
result0+=(float)(A[ai+0]*B[bi+0]);
611-
result1+=(float)(A[ai+0]*B[bi+1]);
612-
result2+=(float)(A[ai+0]*B[bi+2]);
613-
result3+=(float)(A[ai+0]*B[bi+3]);
610+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
611+
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
612+
result2+=(float)(A[ai+0])*(float)(B[bi+2]);
613+
result3+=(float)(A[ai+0])*(float)(B[bi+3]);
614614
ai+=1;
615615
bi+=4;
616616
}
@@ -770,10 +770,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
770770
BLASLONG bi = n_top * K;
771771

772772
for(BLASLONG k=0; k<K; k++) {
773-
result0+=(float)(A[ai+0]*B[bi+0]);
774-
result1+=(float)(A[ai+1]*B[bi+0]);
775-
result2+=(float)(A[ai+0]*B[bi+1]);
776-
result3+=(float)(A[ai+1]*B[bi+1]);
773+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
774+
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
775+
result2+=(float)(A[ai+0])*(float)(B[bi+1]);
776+
result3+=(float)(A[ai+1])*(float)(B[bi+1]);
777777
ai+=2;
778778
bi+=2;
779779
}
@@ -797,8 +797,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
797797
BLASLONG bi = n_top * K;
798798

799799
for(BLASLONG k=0; k<K; k++) {
800-
result0+=(float)(A[ai+0]*B[bi+0]);
801-
result1+=(float)(A[ai+0]*B[bi+1]);
800+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
801+
result1+=(float)(A[ai+0])*(float)(B[bi+1]);
802802
ai+=1;
803803
bi+=2;
804804
}
@@ -930,8 +930,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
930930
BLASLONG bi = n_top * K;
931931

932932
for(BLASLONG k=0; k<K; k++) {
933-
result0+=(float)(A[ai+0]*B[bi+0]);
934-
result1+=(float)(A[ai+1]*B[bi+0]);
933+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
934+
result1+=(float)(A[ai+1])*(float)(B[bi+0]);
935935
ai+=2;
936936
bi+=1;
937937
}
@@ -953,7 +953,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
953953
BLASLONG bi = n_top * K;
954954

955955
for(BLASLONG k=0; k<K; k++) {
956-
result0+=(float)(A[ai+0]*B[bi+0]);
956+
result0+=(float)(A[ai+0])*(float)(B[bi+0]);
957957
ai+=1;
958958
bi+=1;
959959
}
@@ -966,4 +966,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
966966
n_top += 1;
967967
}
968968
return 0;
969-
}
969+
}

0 commit comments

Comments
 (0)