|
51 | 51 | #ifdef ALPHA_ONE |
52 | 52 | #define UPDATE_C(PG16, PG32, PTR, SRC) \ |
53 | 53 | do { \ |
54 | | - tmp32 = svreinterpret_f32_u32(svld1uh_u32((PG16), (uint16_t*)PTR)); \ |
| 54 | + tmp16 = svld1_bf16((PG16), (PTR)); \ |
| 55 | + tmp32 = svreinterpret_f32(svzip1_bf16(zeros, tmp16)); \ |
55 | 56 | tmp32 = svadd_z((PG32), SRC, tmp32); \ |
56 | 57 | tmp16 = svcvt_bf16_f32_z((PG32), tmp32); \ |
57 | 58 | tmp16 = svuzp1_bf16(tmp16, tmp16); \ |
|
60 | 61 | #else |
61 | 62 | #define UPDATE_C(PG16, PG32, PTR, SRC) \ |
62 | 63 | do { \ |
63 | | - tmp32 = svreinterpret_f32_u32(svld1uh_u32((PG16), (uint16_t*)PTR)); \ |
| 64 | + tmp16 = svld1_bf16((PG16), (PTR)); \ |
| 65 | + tmp32 = svreinterpret_f32(svzip1_bf16(zeros, tmp16)); \ |
64 | 66 | tmp32 = svmad_z((PG32), svalpha, SRC, tmp32); \ |
65 | 67 | tmp16 = svcvt_bf16_f32_z((PG32), tmp32); \ |
66 | 68 | tmp16 = svuzp1_bf16(tmp16, tmp16); \ |
@@ -121,6 +123,7 @@ static int gemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOA |
121 | 123 | #ifdef BGEMM |
122 | 124 | svbool_t pg16_first_2 = svdupq_b16(1, 1, 0, 0, 0, 0, 0, 0); |
123 | 125 | svbool_t pg16_first_1 = svdupq_b16(1, 0, 0, 0, 0, 0, 0, 0); |
| 126 | + svbfloat16_t zeros = svdup_n_bf16(vcvth_bf16_f32(0.0)); |
124 | 127 | #endif |
125 | 128 |
|
126 | 129 | bfloat16_t *ptr_a = (bfloat16_t *)A; |
|
0 commit comments