@@ -37,114 +37,114 @@ double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	vfloat32m2_t vx, vy;
 	unsigned int gvl = 0;
 	vfloat64m1_t v_res, v_z0;
-	gvl = vsetvlmax_e64m1();
-	v_res = vfmv_v_f_f64m1(0, gvl);
-	v_z0 = vfmv_v_f_f64m1(0, gvl);
+	gvl = __riscv_vsetvlmax_e64m1();
+	v_res = __riscv_vfmv_v_f_f64m1(0, gvl);
+	v_z0 = __riscv_vfmv_v_f_f64m1(0, gvl);
 
 	if(inc_x == 1 && inc_y == 1){
-		gvl = vsetvl_e64m4(n);
-		vr = vfmv_v_f_f64m4(0, gvl);
+		gvl = __riscv_vsetvl_e64m4(n);
+		vr = __riscv_vfmv_v_f_f64m4(0, gvl);
 		for(i=0,j=0; i<n/gvl; i++){
-			vx = vle32_v_f32m2(&x[j], gvl);
-			vy = vle32_v_f32m2(&y[j], gvl);
-			vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+			vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+			vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 		}
 		//tail
 		if(j < n){
-			gvl = vsetvl_e64m4(n-j);
-			vx = vle32_v_f32m2(&x[j], gvl);
-			vy = vle32_v_f32m2(&y[j], gvl);
-			vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-			//vr = vfdot_vv_f32m2(vx, vy, gvl);
-			vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			gvl = __riscv_vsetvl_e64m4(n-j);
+			vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+			vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+			vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+			//vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 		}
 	}else if(inc_y == 1){
-		gvl = vsetvl_e64m4(n);
-		vr = vfmv_v_f_f64m4(0, gvl);
+		gvl = __riscv_vsetvl_e64m4(n);
+		vr = __riscv_vfmv_v_f_f64m4(0, gvl);
 		int stride_x = inc_x * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
-			vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-			vy = vle32_v_f32m2(&y[j], gvl);
-			vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+			vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+			vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 		//tail
 		if(j < n){
-			gvl = vsetvl_e64m4(n-j);
-			vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-			vy = vle32_v_f32m2(&y[j], gvl);
-			vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-			//vr = vfdot_vv_f32m2(vx, vy, gvl);
-			vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			gvl = __riscv_vsetvl_e64m4(n-j);
+			vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+			vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+			vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+			//vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 	}else if(inc_x == 1){
-		gvl = vsetvl_e64m4(n);
-		vr = vfmv_v_f_f64m4(0, gvl);
+		gvl = __riscv_vsetvl_e64m4(n);
+		vr = __riscv_vfmv_v_f_f64m4(0, gvl);
 		int stride_y = inc_y * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
-			vx = vle32_v_f32m2(&x[j], gvl);
-			vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-			vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+			vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+			vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 		//tail
 		if(j < n){
-			gvl = vsetvl_e64m4(n-j);
-			vx = vle32_v_f32m2(&x[j], gvl);
-			vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-			vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-			//vr = vfdot_vv_f32m2(vx, vy, gvl);
-			vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			gvl = __riscv_vsetvl_e64m4(n-j);
+			vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+			vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+			vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+			//vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 	}else{
-		gvl = vsetvl_e64m4(n);
-		vr = vfmv_v_f_f64m4(0, gvl);
+		gvl = __riscv_vsetvl_e64m4(n);
+		vr = __riscv_vfmv_v_f_f64m4(0, gvl);
 		int stride_x = inc_x * sizeof(FLOAT);
 		int stride_y = inc_y * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
-			vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-			vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-			vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+			vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+			vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 		//tail
 		if(j < n){
-			gvl = vsetvl_e64m4(n-j);
-			vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-			vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-			vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-			//vr = vfdot_vv_f32m2(vx, vy, gvl);
-			vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-			v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-			dot += (double)vfmv_f_s_f64m1_f64(v_res);
+			gvl = __riscv_vsetvl_e64m4(n-j);
+			vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+			vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+			vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+			//vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+			vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+			v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+			dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
 		}
 	}
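
The hunk above ports the kernel from the pre-1.0 RVV intrinsics to the __riscv_-prefixed v1.0 API. Apart from the prefix, the one signature change is that the unordered reduction vfredusum_vs_f64m4_f64m1 no longer takes a destination/merge operand, so the old v_res argument is dropped. For reference only, a minimal strip-mined dot product using the new call pattern could look like the sketch below; the helper name is hypothetical, it is not part of this commit, and it assumes a toolchain whose riscv_vector.h implements the v1.0 intrinsic API:

#include <riscv_vector.h>
#include <stddef.h>

/* Illustrative sketch, not the OpenBLAS kernel: contiguous single-precision
 * dot product accumulated in double with the __riscv_-prefixed intrinsics. */
static double sdot_unit_stride(const float *x, const float *y, size_t n)
{
	/* zero-valued f64m1 vector used as the reduction seed (like v_z0 above) */
	vfloat64m1_t v_z0 = __riscv_vfmv_v_f_f64m1(0.0, __riscv_vsetvlmax_e64m1());
	double dot = 0.0;

	for (size_t i = 0; i < n; ) {
		size_t vl = __riscv_vsetvl_e32m2(n - i);   /* strip-mining */
		vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
		vfloat32m2_t vy = __riscv_vle32_v_f32m2(&y[i], vl);
		/* widening multiply-accumulate: f32m2 * f32m2 -> f64m4 */
		vfloat64m4_t vr = __riscv_vfwmacc_vv_f64m4(
			__riscv_vfmv_v_f_f64m4(0.0, vl), vx, vy, vl);
		/* v1.0-style unordered reduction: (vector, scalar seed, vl),
		 * with no destination operand, as in the added lines above */
		vfloat64m1_t v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, vl);
		dot += __riscv_vfmv_f_s_f64m1_f64(v_res);
		i += vl;
	}
	return dot;
}

Unlike the kernel, this sketch reduces once per strip-mined iteration instead of carrying a running vfloat64m4_t accumulator across the loop; the kernel's approach performs fewer reductions, while this form keeps the tail handling trivial.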