@@ -28,35 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828#include "common.h"
2929
3030#if !defined(DOUBLE )
31- #define VSETVL (n ) __riscv_vsetvl_e32m1(n)
32- #define FLOAT_V_T vfloat32m1_t
33- #define FLOAT_VX2_T vfloat32m1x2_t
34- #define FLOAT_VX4_T vfloat32m1x4_t
35- #define FLOAT_VX8_T vfloat32m1x8_t
36- #define VLEV_FLOAT __riscv_vle32_v_f32m1
37- #define VSEV_FLOAT __riscv_vse32_v_f32m1
38- #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
39- #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
40- #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
41- #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
42- #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
43- #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
31+ #define FLOAT_V_T vfloat32m2_t
32+ #define FLOAT_V_T_HALF vfloat32m1_t
33+ #define VLEV_FLOAT __riscv_vle32_v_f32m2
34+ #define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
35+ #define VSEV_FLOAT __riscv_vse32_v_f32m2
36+ #define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
4437#else
45- #define VSETVL (n ) __riscv_vsetvl_e64m1(n)
46- #define FLOAT_V_T vfloat64m1_t
47- #define FLOAT_VX2_T vfloat64m1x2_t
48- #define FLOAT_VX4_T vfloat64m1x4_t
49- #define FLOAT_VX8_T vfloat64m1x8_t
50- #define VLEV_FLOAT __riscv_vle64_v_f64m1
51- #define VSEV_FLOAT __riscv_vse64_v_f64m1
52- #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
53- #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
54- #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
55- #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
56- #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
57- #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
38+ #define FLOAT_V_T vfloat64m4_t
39+ #define FLOAT_V_T_HALF vfloat64m2_t
40+ #define VLEV_FLOAT __riscv_vle64_v_f64m4
41+ #define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
42+ #define VSEV_FLOAT __riscv_vse64_v_f64m4
43+ #define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
5844#endif
5945
46+
6047int CNAME (BLASLONG m , BLASLONG n , FLOAT * a , BLASLONG lda , FLOAT * b ){
6148
6249 BLASLONG i , j ;
@@ -67,9 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
6754 IFLOAT * boffset , * boffset1 , * boffset2 , * boffset3 ;
6855
6956 FLOAT_V_T v0 ;
70- FLOAT_VX2_T vx2 ;
71- FLOAT_VX4_T vx4 ;
72- FLOAT_VX8_T vx8 ;
57+ FLOAT_V_T_HALF v1 ;
7358
7459 size_t vl ;
7560
@@ -80,86 +65,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
8065 boffset2 = b + 2 * m * (n & ~3 );
8166 boffset3 = b + 2 * m * (n & ~1 );
8267
83- for (j = (m >> 2 ); j > 0 ; j -- ) {
84-
85- aoffset1 = aoffset ;
86- aoffset += 8 * lda ;
87-
88- boffset1 = boffset ;
89- boffset += 32 ;
90-
91- for (i = (n >> 2 ); i > 0 ; i -- ) {
92- vl = 4 ;
93-
94- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
95- VSSEG8_FLOAT (boffset1 , vx8 , vl );
96-
97- aoffset1 += 8 ;
98- boffset1 += m * 8 ;
99- }
100-
101- if (n & 2 ) {
102- vl = 4 ;
103-
104- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
105- VSSEG4_FLOAT (boffset2 , vx4 , vl );
106-
107- aoffset1 += 4 ;
108- boffset2 += 16 ;
109- }
110-
111- if (n & 1 ) {
112- vl = 4 ;
113-
114- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
115- VSSEG2_FLOAT (boffset3 , vx2 , vl );
116-
117- aoffset1 += 2 ;
118- boffset3 += 8 ;
119- }
120- }
121-
122- if (m & 2 ) {
68+ for (j = m ; j > 0 ; j -- ) {
12369 aoffset1 = aoffset ;
124- aoffset += 4 * lda ;
125-
12670 boffset1 = boffset ;
127- boffset += 16 ;
128-
129- for (i = (n >> 2 ); i > 0 ; i -- ) {
130- vl = 2 ;
131-
132- vx8 = VLSSEG8_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
133- VSSEG8_FLOAT (boffset1 , vx8 , vl );
134-
135- aoffset1 += 8 ;
136- boffset1 += m * 8 ;
137- }
138-
139- if (n & 2 ) {
140- vl = 2 ;
141-
142- vx4 = VLSSEG4_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
143- VSSEG4_FLOAT (boffset2 , vx4 , vl );
144-
145- aoffset1 += 4 ;
146- boffset2 += 8 ;
147- }
148-
149- if (n & 1 ) {
150- vl = 2 ;
15171
152- vx2 = VLSSEG2_FLOAT (aoffset1 , lda * sizeof (FLOAT ) * 2 , vl );
153- VSSEG2_FLOAT (boffset3 , vx2 , vl );
154-
155- //aoffset1 += 2;
156- boffset3 += 4 ;
157- }
158- }
159-
160- if (m & 1 ) {
161- aoffset1 = aoffset ;
162- boffset1 = boffset ;
72+ aoffset += 2 * lda ;
73+ boffset += 8 ;
16374
16475 for (i = (n >> 2 ); i > 0 ; i -- ) {
16576 vl = 8 ;
@@ -174,16 +85,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
17485 if (n & 2 ) {
17586 vl = 4 ;
17687
177- v0 = VLEV_FLOAT (aoffset1 , vl );
178- VSEV_FLOAT (boffset2 , v0 , vl );
88+ v1 = VLEV_FLOAT_HALF (aoffset1 , vl );
89+ VSEV_FLOAT_HALF (boffset2 , v1 , vl );
17990
18091 aoffset1 += 4 ;
181- // boffset2 += 4;
92+ boffset2 += 4 ;
18293 }
18394
18495 if (n & 1 ) {
185- * (boffset3 ) = * (aoffset1 );
186- * (boffset3 + 1 ) = * (aoffset1 + 1 );
96+ * (boffset3 ) = * (aoffset1 );
97+ * (boffset3 + 1 ) = * (aoffset1 + 1 );
98+
99+ aoffset1 += 2 ;
100+ boffset3 += 2 ;
187101 }
188102 }
189103
0 commit comments