@@ -188,7 +188,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       v4sf_t *rowC;
       v4sf_t result[4];
       __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-      BLASLONG l = 0;
+      BLASLONG l = 1;
       vec_t *rowA = (vec_t *) & AO[0];
       __vector_pair rowB, rowB1;
       rowB = *((__vector_pair *)((void *)&BO[0]));
@@ -201,7 +201,203 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
       __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
       __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
-      for (l = 1; l < temp; l++)
+      for (l = 1; l + 15 < temp; l += 16)
+      {
+
+        vec_t *rowA0 = (vec_t *)&AO[(l + 0) << 3];
+        __vector_pair rowB0 = *((__vector_pair *)((void *)&BO[(l + 0) << 3]));
+        __vector_pair rowB0_1 = *((__vector_pair *)((void *)&BO[((l + 0) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB0, rowA0[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB0_1, rowA0[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB0, rowA0[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB0_1, rowA0[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB0, rowA0[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB0_1, rowA0[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB0, rowA0[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB0_1, rowA0[3]);
+
+        vec_t *rowA1 = (vec_t *)&AO[(l + 1) << 3];
+        __vector_pair rowB1 = *((__vector_pair *)((void *)&BO[(l + 1) << 3]));
+        __vector_pair rowB1_1 = *((__vector_pair *)((void *)&BO[((l + 1) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB1, rowA1[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB1_1, rowA1[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB1, rowA1[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB1_1, rowA1[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB1, rowA1[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB1_1, rowA1[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB1, rowA1[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB1_1, rowA1[3]);
+
+        vec_t *rowA2 = (vec_t *)&AO[(l + 2) << 3];
+        __vector_pair rowB2 = *((__vector_pair *)((void *)&BO[(l + 2) << 3]));
+        __vector_pair rowB2_1 = *((__vector_pair *)((void *)&BO[((l + 2) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB2, rowA2[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB2_1, rowA2[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB2, rowA2[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB2_1, rowA2[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB2, rowA2[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB2_1, rowA2[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB2, rowA2[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB2_1, rowA2[3]);
+
+        vec_t *rowA3 = (vec_t *)&AO[(l + 3) << 3];
+        __vector_pair rowB3 = *((__vector_pair *)((void *)&BO[(l + 3) << 3]));
+        __vector_pair rowB3_1 = *((__vector_pair *)((void *)&BO[((l + 3) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB3, rowA3[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB3_1, rowA3[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB3, rowA3[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB3_1, rowA3[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB3, rowA3[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB3_1, rowA3[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB3, rowA3[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB3_1, rowA3[3]);
+
+        vec_t *rowA4 = (vec_t *)&AO[(l + 4) << 3];
+        __vector_pair rowB4 = *((__vector_pair *)((void *)&BO[(l + 4) << 3]));
+        __vector_pair rowB4_1 = *((__vector_pair *)((void *)&BO[((l + 4) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB4, rowA4[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB4_1, rowA4[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB4, rowA4[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB4_1, rowA4[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB4, rowA4[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB4_1, rowA4[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB4, rowA4[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB4_1, rowA4[3]);
+
+        vec_t *rowA5 = (vec_t *)&AO[(l + 5) << 3];
+        __vector_pair rowB5 = *((__vector_pair *)((void *)&BO[(l + 5) << 3]));
+        __vector_pair rowB5_1 = *((__vector_pair *)((void *)&BO[((l + 5) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB5, rowA5[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB5_1, rowA5[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB5, rowA5[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB5_1, rowA5[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB5, rowA5[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB5_1, rowA5[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB5, rowA5[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB5_1, rowA5[3]);
+
+        vec_t *rowA6 = (vec_t *)&AO[(l + 6) << 3];
+        __vector_pair rowB6 = *((__vector_pair *)((void *)&BO[(l + 6) << 3]));
+        __vector_pair rowB6_1 = *((__vector_pair *)((void *)&BO[((l + 6) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB6, rowA6[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB6_1, rowA6[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB6, rowA6[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB6_1, rowA6[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB6, rowA6[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB6_1, rowA6[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB6, rowA6[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB6_1, rowA6[3]);
+
+        vec_t *rowA7 = (vec_t *)&AO[(l + 7) << 3];
+        __vector_pair rowB7 = *((__vector_pair *)((void *)&BO[(l + 7) << 3]));
+        __vector_pair rowB7_1 = *((__vector_pair *)((void *)&BO[((l + 7) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB7, rowA7[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB7_1, rowA7[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB7, rowA7[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB7_1, rowA7[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB7, rowA7[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB7_1, rowA7[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB7, rowA7[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB7_1, rowA7[3]);
+
+        vec_t *rowA8 = (vec_t *)&AO[(l + 8) << 3];
+        __vector_pair rowB8 = *((__vector_pair *)((void *)&BO[(l + 8) << 3]));
+        __vector_pair rowB8_1 = *((__vector_pair *)((void *)&BO[((l + 8) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB8, rowA8[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB8_1, rowA8[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB8, rowA8[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB8_1, rowA8[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB8, rowA8[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB8_1, rowA8[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB8, rowA8[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB8_1, rowA8[3]);
+
+        vec_t *rowA9 = (vec_t *)&AO[(l + 9) << 3];
+        __vector_pair rowB9 = *((__vector_pair *)((void *)&BO[(l + 9) << 3]));
+        __vector_pair rowB9_1 = *((__vector_pair *)((void *)&BO[((l + 9) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB9, rowA9[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB9_1, rowA9[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB9, rowA9[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB9_1, rowA9[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB9, rowA9[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB9_1, rowA9[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB9, rowA9[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB9_1, rowA9[3]);
+
+        vec_t *rowA10 = (vec_t *)&AO[(l + 10) << 3];
+        __vector_pair rowB10 = *((__vector_pair *)((void *)&BO[(l + 10) << 3]));
+        __vector_pair rowB10_1 = *((__vector_pair *)((void *)&BO[((l + 10) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB10, rowA10[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB10_1, rowA10[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB10, rowA10[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB10_1, rowA10[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB10, rowA10[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB10_1, rowA10[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB10, rowA10[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB10_1, rowA10[3]);
+
+        vec_t *rowA11 = (vec_t *)&AO[(l + 11) << 3];
+        __vector_pair rowB11 = *((__vector_pair *)((void *)&BO[(l + 11) << 3]));
+        __vector_pair rowB11_1 = *((__vector_pair *)((void *)&BO[((l + 11) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB11, rowA11[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB11_1, rowA11[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB11, rowA11[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB11_1, rowA11[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB11, rowA11[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB11_1, rowA11[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB11, rowA11[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB11_1, rowA11[3]);
+
+        vec_t *rowA12 = (vec_t *)&AO[(l + 12) << 3];
+        __vector_pair rowB12 = *((__vector_pair *)((void *)&BO[(l + 12) << 3]));
+        __vector_pair rowB12_1 = *((__vector_pair *)((void *)&BO[((l + 12) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB12, rowA12[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB12_1, rowA12[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB12, rowA12[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB12_1, rowA12[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB12, rowA12[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB12_1, rowA12[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB12, rowA12[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB12_1, rowA12[3]);
+
+        vec_t *rowA13 = (vec_t *)&AO[(l + 13) << 3];
+        __vector_pair rowB13 = *((__vector_pair *)((void *)&BO[(l + 13) << 3]));
+        __vector_pair rowB13_1 = *((__vector_pair *)((void *)&BO[((l + 13) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB13, rowA13[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB13_1, rowA13[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB13, rowA13[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB13_1, rowA13[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB13, rowA13[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB13_1, rowA13[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB13, rowA13[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB13_1, rowA13[3]);
+
+        vec_t *rowA14 = (vec_t *)&AO[(l + 14) << 3];
+        __vector_pair rowB14 = *((__vector_pair *)((void *)&BO[(l + 14) << 3]));
+        __vector_pair rowB14_1 = *((__vector_pair *)((void *)&BO[((l + 14) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB14, rowA14[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB14_1, rowA14[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB14, rowA14[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB14_1, rowA14[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB14, rowA14[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB14_1, rowA14[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB14, rowA14[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB14_1, rowA14[3]);
+
+        vec_t *rowA15 = (vec_t *)&AO[(l + 15) << 3];
+        __vector_pair rowB15 = *((__vector_pair *)((void *)&BO[(l + 15) << 3]));
+        __vector_pair rowB15_1 = *((__vector_pair *)((void *)&BO[((l + 15) << 3) + 4]));
+        __builtin_mma_xvf64gerpp (&acc0, rowB15, rowA15[0]);
+        __builtin_mma_xvf64gerpp (&acc1, rowB15_1, rowA15[0]);
+        __builtin_mma_xvf64gerpp (&acc2, rowB15, rowA15[1]);
+        __builtin_mma_xvf64gerpp (&acc3, rowB15_1, rowA15[1]);
+        __builtin_mma_xvf64gerpp (&acc4, rowB15, rowA15[2]);
+        __builtin_mma_xvf64gerpp (&acc5, rowB15_1, rowA15[2]);
+        __builtin_mma_xvf64gerpp (&acc6, rowB15, rowA15[3]);
+        __builtin_mma_xvf64gerpp (&acc7, rowB15_1, rowA15[3]);
+
+      }
+      for (; l < temp; l++)
       {
         rowA = (vec_t *) & AO[l << 3];
         rowB = *((__vector_pair *)((void *)&BO[l << 3]));
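
Each of the 16 unrolled steps added above performs the same three loads and the same eight __builtin_mma_xvf64gerpp accumulations. The sketch below is only an illustration of that repeated pattern, assuming the declarations already in scope in this kernel (AO, BO, temp, l, vec_t and the acc0..acc7 accumulators); the KERNEL_STEP macro and the rA/rB/rB_1 names are hypothetical and are not part of the patch.

    /* Hypothetical helper, for illustration only: one k-step of the unrolled body. */
    #define KERNEL_STEP(i)                                                            \
      do {                                                                            \
        vec_t *rA = (vec_t *)&AO[(l + (i)) << 3];                                     \
        __vector_pair rB = *((__vector_pair *)((void *)&BO[(l + (i)) << 3]));         \
        __vector_pair rB_1 = *((__vector_pair *)((void *)&BO[((l + (i)) << 3) + 4])); \
        __builtin_mma_xvf64gerpp (&acc0, rB, rA[0]);                                  \
        __builtin_mma_xvf64gerpp (&acc1, rB_1, rA[0]);                                \
        __builtin_mma_xvf64gerpp (&acc2, rB, rA[1]);                                  \
        __builtin_mma_xvf64gerpp (&acc3, rB_1, rA[1]);                                \
        __builtin_mma_xvf64gerpp (&acc4, rB, rA[2]);                                  \
        __builtin_mma_xvf64gerpp (&acc5, rB_1, rA[2]);                                \
        __builtin_mma_xvf64gerpp (&acc6, rB, rA[3]);                                  \
        __builtin_mma_xvf64gerpp (&acc7, rB_1, rA[3]);                                \
      } while (0)

    /* Main loop: 16 k-steps per trip, equivalent to the unrolled body above. */
    for (l = 1; l + 15 < temp; l += 16)
      {
        KERNEL_STEP(0);  KERNEL_STEP(1);  KERNEL_STEP(2);  KERNEL_STEP(3);
        KERNEL_STEP(4);  KERNEL_STEP(5);  KERNEL_STEP(6);  KERNEL_STEP(7);
        KERNEL_STEP(8);  KERNEL_STEP(9);  KERNEL_STEP(10); KERNEL_STEP(11);
        KERNEL_STEP(12); KERNEL_STEP(13); KERNEL_STEP(14); KERNEL_STEP(15);
      }
    /* Remainder loop: the k-steps left over when temp - 1 is not a multiple of 16. */
    for (; l < temp; l++)
      KERNEL_STEP(0);

Unrolling by 16 with a scalar remainder loop keeps many independent xvf64gerpp operations in flight per trip, which presumably reduces loop overhead and gives the compiler more room to schedule the __vector_pair loads ahead of the MMA instructions.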