
Commit 16be28a

committed
dgemm loop unroll and 4x1, 4x2 dgemv implementation
1 parent e5c8361 commit 16be28a

File tree

2 files changed: 243 additions & 2 deletions

kernel/power/dgemm_kernel_power10.c

Lines changed: 198 additions & 2 deletions
@@ -188,7 +188,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       v4sf_t *rowC;
       v4sf_t result[4];
       __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
-      BLASLONG l = 0;
+      BLASLONG l = 1;
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB, rowB1;
       rowB = *((__vector_pair *)((void *)&BO[0]));
@@ -201,7 +201,203 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
       __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
       __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
-      for (l = 1; l < temp; l++)
+      for (l = 1; l + 15 < temp; l += 16)
+        {
+
+          vec_t *rowA0 = (vec_t *)&AO[(l + 0) << 3];
+          __vector_pair rowB0 = *((__vector_pair *)((void *)&BO[(l + 0) << 3]));
+          __vector_pair rowB0_1 = *((__vector_pair *)((void *)&BO[((l + 0) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB0, rowA0[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB0_1, rowA0[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB0, rowA0[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB0_1, rowA0[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB0, rowA0[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB0_1, rowA0[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB0, rowA0[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB0_1, rowA0[3]);
+
+          vec_t *rowA1 = (vec_t *)&AO[(l + 1) << 3];
+          __vector_pair rowB1 = *((__vector_pair *)((void *)&BO[(l + 1) << 3]));
+          __vector_pair rowB1_1 = *((__vector_pair *)((void *)&BO[((l + 1) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB1, rowA1[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB1_1, rowA1[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB1, rowA1[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB1_1, rowA1[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB1, rowA1[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB1_1, rowA1[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB1, rowA1[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB1_1, rowA1[3]);
+
+          vec_t *rowA2 = (vec_t *)&AO[(l + 2) << 3];
+          __vector_pair rowB2 = *((__vector_pair *)((void *)&BO[(l + 2) << 3]));
+          __vector_pair rowB2_1 = *((__vector_pair *)((void *)&BO[((l + 2) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB2, rowA2[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB2_1, rowA2[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB2, rowA2[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB2_1, rowA2[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB2, rowA2[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB2_1, rowA2[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB2, rowA2[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB2_1, rowA2[3]);
+
+          vec_t *rowA3 = (vec_t *)&AO[(l + 3) << 3];
+          __vector_pair rowB3 = *((__vector_pair *)((void *)&BO[(l + 3) << 3]));
+          __vector_pair rowB3_1 = *((__vector_pair *)((void *)&BO[((l + 3) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB3, rowA3[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB3_1, rowA3[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB3, rowA3[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB3_1, rowA3[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB3, rowA3[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB3_1, rowA3[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB3, rowA3[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB3_1, rowA3[3]);
+
+          vec_t *rowA4 = (vec_t *)&AO[(l + 4) << 3];
+          __vector_pair rowB4 = *((__vector_pair *)((void *)&BO[(l + 4) << 3]));
+          __vector_pair rowB4_1 = *((__vector_pair *)((void *)&BO[((l + 4) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB4, rowA4[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB4_1, rowA4[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB4, rowA4[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB4_1, rowA4[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB4, rowA4[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB4_1, rowA4[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB4, rowA4[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB4_1, rowA4[3]);
+
+          vec_t *rowA5 = (vec_t *)&AO[(l + 5) << 3];
+          __vector_pair rowB5 = *((__vector_pair *)((void *)&BO[(l + 5) << 3]));
+          __vector_pair rowB5_1 = *((__vector_pair *)((void *)&BO[((l + 5) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB5, rowA5[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB5_1, rowA5[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB5, rowA5[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB5_1, rowA5[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB5, rowA5[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB5_1, rowA5[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB5, rowA5[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB5_1, rowA5[3]);
+
+          vec_t *rowA6 = (vec_t *)&AO[(l + 6) << 3];
+          __vector_pair rowB6 = *((__vector_pair *)((void *)&BO[(l + 6) << 3]));
+          __vector_pair rowB6_1 = *((__vector_pair *)((void *)&BO[((l + 6) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB6, rowA6[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB6_1, rowA6[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB6, rowA6[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB6_1, rowA6[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB6, rowA6[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB6_1, rowA6[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB6, rowA6[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB6_1, rowA6[3]);
+
+          vec_t *rowA7 = (vec_t *)&AO[(l + 7) << 3];
+          __vector_pair rowB7 = *((__vector_pair *)((void *)&BO[(l + 7) << 3]));
+          __vector_pair rowB7_1 = *((__vector_pair *)((void *)&BO[((l + 7) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB7, rowA7[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB7_1, rowA7[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB7, rowA7[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB7_1, rowA7[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB7, rowA7[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB7_1, rowA7[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB7, rowA7[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB7_1, rowA7[3]);
+
+          vec_t *rowA8 = (vec_t *)&AO[(l + 8) << 3];
+          __vector_pair rowB8 = *((__vector_pair *)((void *)&BO[(l + 8) << 3]));
+          __vector_pair rowB8_1 = *((__vector_pair *)((void *)&BO[((l + 8) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB8, rowA8[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB8_1, rowA8[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB8, rowA8[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB8_1, rowA8[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB8, rowA8[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB8_1, rowA8[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB8, rowA8[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB8_1, rowA8[3]);
+
+          vec_t *rowA9 = (vec_t *)&AO[(l + 9) << 3];
+          __vector_pair rowB9 = *((__vector_pair *)((void *)&BO[(l + 9) << 3]));
+          __vector_pair rowB9_1 = *((__vector_pair *)((void *)&BO[((l + 9) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB9, rowA9[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB9_1, rowA9[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB9, rowA9[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB9_1, rowA9[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB9, rowA9[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB9_1, rowA9[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB9, rowA9[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB9_1, rowA9[3]);
+
+          vec_t *rowA10 = (vec_t *)&AO[(l + 10) << 3];
+          __vector_pair rowB10 = *((__vector_pair *)((void *)&BO[(l + 10) << 3]));
+          __vector_pair rowB10_1 = *((__vector_pair *)((void *)&BO[((l + 10) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB10, rowA10[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB10_1, rowA10[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB10, rowA10[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB10_1, rowA10[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB10, rowA10[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB10_1, rowA10[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB10, rowA10[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB10_1, rowA10[3]);
+
+          vec_t *rowA11 = (vec_t *)&AO[(l + 11) << 3];
+          __vector_pair rowB11 = *((__vector_pair *)((void *)&BO[(l + 11) << 3]));
+          __vector_pair rowB11_1 = *((__vector_pair *)((void *)&BO[((l + 11) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB11, rowA11[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB11_1, rowA11[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB11, rowA11[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB11_1, rowA11[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB11, rowA11[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB11_1, rowA11[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB11, rowA11[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB11_1, rowA11[3]);
+
+          vec_t *rowA12 = (vec_t *)&AO[(l + 12) << 3];
+          __vector_pair rowB12 = *((__vector_pair *)((void *)&BO[(l + 12) << 3]));
+          __vector_pair rowB12_1 = *((__vector_pair *)((void *)&BO[((l + 12) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB12, rowA12[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB12_1, rowA12[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB12, rowA12[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB12_1, rowA12[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB12, rowA12[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB12_1, rowA12[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB12, rowA12[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB12_1, rowA12[3]);
+
+          vec_t *rowA13 = (vec_t *)&AO[(l + 13) << 3];
+          __vector_pair rowB13 = *((__vector_pair *)((void *)&BO[(l + 13) << 3]));
+          __vector_pair rowB13_1 = *((__vector_pair *)((void *)&BO[((l + 13) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB13, rowA13[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB13_1, rowA13[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB13, rowA13[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB13_1, rowA13[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB13, rowA13[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB13_1, rowA13[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB13, rowA13[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB13_1, rowA13[3]);
+
+          vec_t *rowA14 = (vec_t *)&AO[(l + 14) << 3];
+          __vector_pair rowB14 = *((__vector_pair *)((void *)&BO[(l + 14) << 3]));
+          __vector_pair rowB14_1 = *((__vector_pair *)((void *)&BO[((l + 14) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB14, rowA14[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB14_1, rowA14[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB14, rowA14[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB14_1, rowA14[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB14, rowA14[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB14_1, rowA14[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB14, rowA14[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB14_1, rowA14[3]);
+
+          vec_t *rowA15 = (vec_t *)&AO[(l + 15) << 3];
+          __vector_pair rowB15 = *((__vector_pair *)((void *)&BO[(l + 15) << 3]));
+          __vector_pair rowB15_1 = *((__vector_pair *)((void *)&BO[((l + 15) << 3) + 4]));
+          __builtin_mma_xvf64gerpp(&acc0, rowB15, rowA15[0]);
+          __builtin_mma_xvf64gerpp(&acc1, rowB15_1, rowA15[0]);
+          __builtin_mma_xvf64gerpp(&acc2, rowB15, rowA15[1]);
+          __builtin_mma_xvf64gerpp(&acc3, rowB15_1, rowA15[1]);
+          __builtin_mma_xvf64gerpp(&acc4, rowB15, rowA15[2]);
+          __builtin_mma_xvf64gerpp(&acc5, rowB15_1, rowA15[2]);
+          __builtin_mma_xvf64gerpp(&acc6, rowB15, rowA15[3]);
+          __builtin_mma_xvf64gerpp(&acc7, rowB15_1, rowA15[3]);
+
+        }
+      for (; l < temp; l++)
         {
           rowA = (vec_t *) & AO[l << 3];
           rowB = *((__vector_pair *)((void *)&BO[l << 3]));
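
The change itself is a standard unroll-and-remainder transformation: the main loop now advances l by 16 rank-2 MMA updates per trip, guarded by l + 15 < temp, and the original one-step loop is kept as a tail for the leftover iterations. A minimal standalone sketch of the same control flow, with a plain scalar accumulation standing in for the __builtin_mma_xvf64gerpp updates (an illustration only, not the kernel itself):

#include <stdio.h>

int main (void)
{
  long temp = 100, l;
  double v[100], acc;

  for (l = 0; l < temp; l++) v[l] = 1.0;

  acc = v[0];                          /* peeled first iteration (xvf64ger) */
  for (l = 1; l + 15 < temp; l += 16)  /* unrolled body: 16 updates per trip */
    for (long u = 0; u < 16; u++)
      acc += v[l + u];
  for (; l < temp; l++)                /* tail loop covers (temp - 1) % 16 */
    acc += v[l];

  printf ("%g\n", acc);                /* prints 100 */
  return 0;
}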

kernel/power/dgemv_n_microk_power10.c

Lines changed: 45 additions & 0 deletions
@@ -25,8 +25,53 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

+#define HAVE_KERNEL_4x2 1
+#define HAVE_KERNEL_4x1 1
 #define HAVE_KERNEL_4x4 1

+static void dgemv_kernel_4x2 (BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
+{
+  FLOAT x0, x1;
+  x0 = xo[0] * alpha;
+  x1 = xo[1] * alpha;
+  __vector double v_x0 = {x0, x0};
+  __vector double v_x1 = {x1, x1};
+  __vector double *v_y = (__vector double *)y;
+  __vector double *va0 = (__vector double *)a0;
+  __vector double *va1 = (__vector double *)a1;
+  for (int i = 0; i < n/2; i += 2)
+    {
+      v_y[i]   += va0[i]   * v_x0 + va1[i]   * v_x1;
+      v_y[i+1] += va0[i+1] * v_x0 + va1[i+1] * v_x1;
+    }
+}
+
+static void dgemv_kernel_4x1 (BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
+{
+  FLOAT x0;
+  x0 = xo[0] * alpha;
+  __vector double v_x0 = {x0, x0};
+  __vector double *v_y = (__vector double *)y;
+  __vector double *va0 = (__vector double *)a0;
+  for (int i = 0; i < n/2; i += 2)
+    {
+      v_y[i]   += va0[i]   * v_x0;
+      v_y[i+1] += va0[i+1] * v_x0;
+    }
+}
+

 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
 {
 	double *a0;
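
For reference, the two new kernels compute a two-column (4x2) and a one-column (4x1) slice of y += alpha * A * x: each __vector double operation covers two rows, and each loop trip covers four. A scalar equivalent of dgemv_kernel_4x2 that can serve as a correctness check (a hypothetical helper, not part of the commit; it assumes n is a multiple of 4, as the 4x naming implies):

static void dgemv_kernel_4x2_ref (BLASLONG n, FLOAT *a0, FLOAT *a1,
                                  FLOAT *xo, FLOAT *y, FLOAT alpha)
{
  /* one row at a time: y[i] += alpha * (a0[i]*x[0] + a1[i]*x[1]) */
  for (BLASLONG i = 0; i < n; i++)
    y[i] += alpha * (a0[i] * xo[0] + a1[i] * xo[1]);
}

The 4x1 case is the same computation with the a1/xo[1] term dropped.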
