diff --git a/filelist.json b/filelist.json index 81bd5ef9c8..35fa008c46 100644 --- a/filelist.json +++ b/filelist.json @@ -1807,6 +1807,7 @@ "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp", diff --git a/src/BUILD.bazel b/src/BUILD.bazel index 89e632ddd4..c4d77f435c 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -282,6 +282,7 @@ filegroup( "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp", + "core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp", "core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp", "core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3a8815d836..fd4672d686 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -251,6 +251,7 @@ target_sources( core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp + 
core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp @@ -1081,4 +1082,4 @@ target_sources( cpu/kernels/select/generic/neon/fp16.cpp cpu/kernels/softmax/generic/neon/fp16.cpp cpu/kernels/sub/neon/fp16.cpp -) \ No newline at end of file +) diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 5da7161671..4e283b0cf2 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -56,6 +56,11 @@ #include "kernels/sve_ffinterleaved_fp32_mla_8x3VL.hpp" #include "kernels/sve_ffinterleaved_bf16fp32_mmla_8x3VL.hpp" #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS + +#ifdef ARM_COMPUTE_ENABLE_SME +#include "kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp" +#endif // ARM_COMPUTE_ENABLE_SME + #ifdef ARM_COMPUTE_ENABLE_SME2 #include "kernels/sme2_gemv_fp32_mla_16VL.hpp" #include "kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp" @@ -118,6 +123,15 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { return new GemmHybridIndirect(args); } ), #endif // ARM_COMPUTE_ENABLE_BF16 +#ifdef ARM_COMPUTE_ENABLE_SME +{ + GemmMethod::GEMM_INTERLEAVED, + "sme1_interleaved_nomerge_fp32_mopa_2VLx2VL", + [](const GemmArgs &args) { return args._ci->has_sme() && !args._accumulate; }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleavedNoMerge(args); } +}, +#endif // ARM_COMPUTE_ENABLE_SME #ifdef ARM_COMPUTE_ENABLE_SVE #ifdef ARM_COMPUTE_ENABLE_SME2 // SME kernels @@ -187,6 +201,7 @@ GemmImplementation::with_estimate( nullptr, [](const GemmArgs &args) { return new GemmInterleavedNoMerge(args); } }, + #endif // ARM_COMPUTE_ENABLE_SME2 #ifdef ARM_COMPUTE_ENABLE_BF16 GemmImplementation::with_estimate( diff 
--git a/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp new file mode 100644 index 0000000000..0eb06f0485 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// + Changes from Qualcomm Technologies, Inc. are provided under the following license: +// + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+// + SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SME
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+// Kernel entry point, defined in sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp. ldc is counted in floats; bias may be
+// null; C == nullptr requests a store to accumulator_buffer; accumulate requests an initial fill of the accumulators from it.
+void sme1_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+// Kernel descriptor for the fp32 SME outer-product kernel declared above: float operands and result, an output tile of
+// two vector lengths by two vector lengths, K unroll of 1. All queries are static; the object only carries the kernel pointer.
+class cls_sme1_interleaved_nomerge_fp32_mopa_2VLx2VL
+{
+public:
+  typedef float lhs_operand_type;
+  typedef float rhs_operand_type;
+  typedef float result_type;
+
+  typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+  /* Kernel blocking parameters: output tile height and width (runtime values, two vector lengths each) and K unroll */
+  static unsigned int out_height()
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static unsigned int out_width()
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static constexpr unsigned int k_unroll()
+  {
+    return 1;
+  }
+  // NOTE(review): the dispatch entry this patch adds to gemm_fp32.cpp only selects the kernel when !args._accumulate.
+  static constexpr bool supports_accumulate()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_bias()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_activation()
+  {
+    return true;
+  }
+
+  static constexpr bool is_sme()
+  {
+    return true;
+  }
+
+  // Kernel function pointer; initialised to the generic implementation (the only one this header declares)
+  kern_type kernel = sme1_interleaved_nomerge_fp32_mopa_2VLx2VL;
+
+  StdTransformsSME<lhs_operand_type, result_type, 2, 2, 1> transforms = {};
+
+  cls_sme1_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *)
+  {
+  }
+};
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp new file mode 100644 index 0000000000..74fb679820 --- /dev/null +++ 
b/src/core/NEON/kernels/arm_gemm/kernels/sme1_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp @@ -0,0 +1,788 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// + Changes from Qualcomm Technologies, Inc. are provided under the following license: +// + Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+// + SPDX-License-Identifier: MIT +// +#ifdef ARM_COMPUTE_ENABLE_SME + +#include "arm_gemm.hpp" + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sme1_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer) +{ + struct KernelArgs + { + KernelArgs( + const float *const A, + const float *const B, + float *const C, const int ldc, + const int M, const int N, const int K, + const float *const bias, + const Activation act, + bool accumulate, + float *const accumulator_buffer + ) : A(A), + B(B), kstride_bytes(K * sizeof(float)), + C(C), ldcb(ldc * sizeof(float)), + M(M), N(N), K(K), + min(-std::numeric_limits::infinity()), + max(std::numeric_limits::infinity()), + bias(bias), + accumulator_buffer(accumulator_buffer), + flags(0x0) + { + if (accumulate) + { + flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER + } + if (C == nullptr) + { + flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER + } + if (act.type == Activation::Type::None) + { + flags |= 1 << 2; // SKIP_ACTIVATION + } + + // Initialise the activation values + switch (act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + this->max = static_cast(act.param1); + /* fall through */ + case Activation::Type::ReLU: + this->min = static_cast(0); + break; + } + } + + const float *const A; + const float *const B; + const long kstride_bytes; + float *const C; + const long ldcb; + const long M, N, K; + float min = -std::numeric_limits::infinity(); + float max = std::numeric_limits::infinity(); + + const float *const bias; + + + float *const accumulator_buffer; + uint64_t flags; + }; + + // Construct arguments for this kernel + KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer); + + __asm__ __volatile__( + "ldr x16, 
[%x[args], %[offsetof_flags]]\n" + ".inst 0xd503477f // SMSTART\n" + "ptrue p0.b\n" + "ptrue p1.b\n" + "ldr x15, [%x[args], %[offsetof_accumulator_buffer]]\n" + "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n" + "tbz x16, #0, 2f\n" + "mov x12, #0x0\n" + "cntw x20\n" + "1:" // Initial accumulator load from buffer: Loop + ".inst 0xa540a5f8 // ld1w z24.s, p1/Z, [x15]\n" + ".inst 0xa541a5f9 // ld1w z25.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5fa // ld1w z26.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5fb // ld1w z27.s, p1/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0xa540a5ec // ld1w z12.s, p1/Z, [x15]\n" + ".inst 0xa541a5ed // ld1w z13.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5ee // ld1w z14.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5ef // ld1w z15.s, p1/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0xa540a5e0 // ld1w z0.s, p1/Z, [x15]\n" + ".inst 0xa541a5e1 // ld1w z1.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5e2 // ld1w z2.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5e3 // ld1w z3.s, p1/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0xa540a5f0 // ld1w z16.s, p1/Z, [x15]\n" + ".inst 0xa541a5f1 // ld1w z17.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5f2 // ld1w z18.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5f3 // ld1w z19.s, p1/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0xc0800700 // mova za0h.s[w12, 0], p1/M, z24.s\n" + ".inst 0xc0800721 // mova za0h.s[w12, 1], p1/M, z25.s\n" + ".inst 0xc0800742 // mova za0h.s[w12, 2], p1/M, z26.s\n" + ".inst 0xc0800763 // mova za0h.s[w12, 3], p1/M, z27.s\n" + ".inst 0xc0800584 // mova za1h.s[w12, 0], p1/M, z12.s\n" + ".inst 0xc08005a5 // mova za1h.s[w12, 1], p1/M, z13.s\n" + ".inst 0xc08005c6 // mova za1h.s[w12, 2], p1/M, z14.s\n" + ".inst 0xc08005e7 // mova za1h.s[w12, 3], p1/M, z15.s\n" + ".inst 0xc0800408 // mova za2h.s[w12, 0], p1/M, z0.s\n" + ".inst 0xc0800429 // mova za2h.s[w12, 1], p1/M, z1.s\n" + ".inst 0xc080044a // mova za2h.s[w12, 2], 
p1/M, z2.s\n" + ".inst 0xc080046b // mova za2h.s[w12, 3], p1/M, z3.s\n" + ".inst 0xc080060c // mova za3h.s[w12, 0], p1/M, z16.s\n" + ".inst 0xc080062d // mova za3h.s[w12, 1], p1/M, z17.s\n" + ".inst 0xc080064e // mova za3h.s[w12, 2], p1/M, z18.s\n" + ".inst 0xc080066f // mova za3h.s[w12, 3], p1/M, z19.s\n" + "add x12, x12, #0x4\n" + "cmp x12, x20\n" + "blt 1b\n" + "2:" // Initial accumulator load from buffer: End + "ldr w13, [%x[args], %[offsetof_M]]\n" + "mov x11, #0x0\n" + "mov x10, #0x0\n" + "ldr w9, [%x[args], %[offsetof_N]]\n" + "ldr x28, [%x[args], %[offsetof_A]]\n" + "3:" // M and N loop + "mov x27, x28\n" + ".inst 0x25a91542 // whilelt p2.s, x10, x9\n" + "incw x10\n" + ".inst 0x25a91543 // whilelt p3.s, x10, x9\n" + "decw x10\n" + "tbnz x16, #0, 4f\n" + "ldr x20, [%x[args], %[offsetof_bias]]\n" + ".inst 0xc00800ff // zero {za}\n" + "cbz x20, 5f\n" + "fmov z17.s, #1.0\n" + "add x21, x20, x10, LSL #2\n" + ".inst 0xa540aaaa // ld1w z10.s, p2/Z, [x21]\n" + ".inst 0xa541aeab // ld1w z11.s, p3/Z, [x21, #1, MUL VL]\n" + ".inst 0x808a0220 // fmopa za0.s, p0/M, p0/M, z17.s, z10.s\n" + ".inst 0x808b0221 // fmopa za1.s, p0/M, p0/M, z17.s, z11.s\n" + ".inst 0x808a0222 // fmopa za2.s, p0/M, p0/M, z17.s, z10.s\n" + ".inst 0x808b0223 // fmopa za3.s, p0/M, p0/M, z17.s, z11.s\n" + "4:" // Prepare accumulators: Test for last block + "mov x20, x10\n" + "mov x21, x11\n" + "incw x20, ALL, MUL #2\n" + "incw x21, ALL, MUL #2\n" + "cmp x20, x9\n" + "mov x20, x16\n" + "csel x21, x11, x21, LT\n" + "bfm x16, XZR, #0x0, #0x0 // bfc x16, #0x0, #0x1\n" + "cmp x21, x13\n" + "csel x16, x20, x16, LT\n" + "5:" // Prepare accumulators: End + "ldr x20, [%x[args], %[offsetof_K]]\n" + "ldr x23, [%x[args], %[offsetof_B]]\n" + "ldr x22, [%x[args], %[offsetof_kstride_bytes]]\n" + "lsr x21, x20, #0x2\n" + "and x20, x20, #0x3\n" + "madd x23, x10, x22, x23\n" // bptr = B + n * kstride_bytes + "cbz x21, 8f\n" + "subs x21, x21, #0x1\n" + ".inst 0xa540a776 // ld1w z22.s, p1/Z, [x27]\n" + ".inst 
0xa541a777 // ld1w z23.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6e7 // ld1w z7.s, p1/Z, [x23]\n" + ".inst 0xa541a6ef // ld1w z15.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0xa540a766 // ld1w z6.s, p1/Z, [x27]\n" + ".inst 0xa541a76e // ld1w z14.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6f4 // ld1w z20.s, p1/Z, [x23]\n" + ".inst 0xa541a6f5 // ld1w z21.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0xa540a762 // ld1w z2.s, p1/Z, [x27]\n" + ".inst 0xa541a76a // ld1w z10.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6e3 // ld1w z3.s, p1/Z, [x23]\n" + ".inst 0xa541a6eb // ld1w z11.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0xa540a761 // ld1w z1.s, p1/Z, [x27]\n" + ".inst 0xa541a769 // ld1w z9.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6e4 // ld1w z4.s, p1/Z, [x23]\n" + ".inst 0xa541a6e5 // ld1w z5.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "ble 7f\n" + "6:" // K loop + ".inst 0x808702c0 // fmopa za0.s, p0/M, p0/M, z22.s, z7.s\n" + "subs x21, x21, #0x1\n" + ".inst 0x808f02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z15.s\n" + ".inst 0x808702e2 // fmopa za2.s, p0/M, p0/M, z23.s, z7.s\n" + ".inst 0x808f02e3 // fmopa za3.s, p0/M, p0/M, z23.s, z15.s\n" + ".inst 0xa540a776 // ld1w z22.s, p1/Z, [x27]\n" + ".inst 0xa541a777 // ld1w z23.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0x809400c0 // fmopa za0.s, p0/M, p0/M, z6.s, z20.s\n" + ".inst 0xa540a6e7 // ld1w z7.s, p1/Z, [x23]\n" + ".inst 0xa541a6ef // ld1w z15.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0x809500c1 // fmopa za1.s, p0/M, p0/M, z6.s, z21.s\n" + ".inst 0x809401c2 // fmopa za2.s, p0/M, p0/M, z14.s, z20.s\n" + ".inst 0x809501c3 // fmopa za3.s, p0/M, p0/M, z14.s, z21.s\n" + ".inst 0xa540a766 // ld1w z6.s, p1/Z, [x27]\n" + ".inst 0xa541a76e // ld1w z14.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, 
#2\n" + ".inst 0x80830040 // fmopa za0.s, p0/M, p0/M, z2.s, z3.s\n" + ".inst 0xa540a6f4 // ld1w z20.s, p1/Z, [x23]\n" + ".inst 0xa541a6f5 // ld1w z21.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0x808b0041 // fmopa za1.s, p0/M, p0/M, z2.s, z11.s\n" + ".inst 0x80830142 // fmopa za2.s, p0/M, p0/M, z10.s, z3.s\n" + ".inst 0x808b0143 // fmopa za3.s, p0/M, p0/M, z10.s, z11.s\n" + ".inst 0xa540a762 // ld1w z2.s, p1/Z, [x27]\n" + ".inst 0xa541a76a // ld1w z10.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6e3 // ld1w z3.s, p1/Z, [x23]\n" + ".inst 0xa541a6eb // ld1w z11.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0x80840020 // fmopa za0.s, p0/M, p0/M, z1.s, z4.s\n" + ".inst 0x80850021 // fmopa za1.s, p0/M, p0/M, z1.s, z5.s\n" + ".inst 0x80840122 // fmopa za2.s, p0/M, p0/M, z9.s, z4.s\n" + ".inst 0x80850123 // fmopa za3.s, p0/M, p0/M, z9.s, z5.s\n" + ".inst 0xa540a761 // ld1w z1.s, p1/Z, [x27]\n" + ".inst 0xa541a769 // ld1w z9.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + ".inst 0xa540a6e4 // ld1w z4.s, p1/Z, [x23]\n" + ".inst 0xa541a6e5 // ld1w z5.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + "bgt 6b\n" + "7:" // K loop tail + ".inst 0x808702c0 // fmopa za0.s, p0/M, p0/M, z22.s, z7.s\n" + ".inst 0x808f02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z15.s\n" + ".inst 0x808702e2 // fmopa za2.s, p0/M, p0/M, z23.s, z7.s\n" + ".inst 0x808f02e3 // fmopa za3.s, p0/M, p0/M, z23.s, z15.s\n" + ".inst 0x809400c0 // fmopa za0.s, p0/M, p0/M, z6.s, z20.s\n" + ".inst 0x809500c1 // fmopa za1.s, p0/M, p0/M, z6.s, z21.s\n" + ".inst 0x809401c2 // fmopa za2.s, p0/M, p0/M, z14.s, z20.s\n" + ".inst 0x809501c3 // fmopa za3.s, p0/M, p0/M, z14.s, z21.s\n" + ".inst 0x80830040 // fmopa za0.s, p0/M, p0/M, z2.s, z3.s\n" + ".inst 0x808b0041 // fmopa za1.s, p0/M, p0/M, z2.s, z11.s\n" + ".inst 0x80830142 // fmopa za2.s, p0/M, p0/M, z10.s, z3.s\n" + ".inst 0x808b0143 // fmopa za3.s, p0/M, p0/M, z10.s, z11.s\n" + ".inst 0x80840020 
// fmopa za0.s, p0/M, p0/M, z1.s, z4.s\n" + ".inst 0x80850021 // fmopa za1.s, p0/M, p0/M, z1.s, z5.s\n" + ".inst 0x80840122 // fmopa za2.s, p0/M, p0/M, z9.s, z4.s\n" + ".inst 0x80850123 // fmopa za3.s, p0/M, p0/M, z9.s, z5.s\n" + "8:" // K oddments + "cbz x20, 10f\n" + "9:" // K oddments: Loop + ".inst 0xa540a76a // ld1w z10.s, p1/Z, [x27]\n" + ".inst 0xa541a76b // ld1w z11.s, p1/Z, [x27, #1, MUL VL]\n" + "addvl x27, x27, #2\n" + "subs x20, x20, #0x1\n" + ".inst 0xa540a6ee // ld1w z14.s, p1/Z, [x23]\n" + ".inst 0xa541a6ef // ld1w z15.s, p1/Z, [x23, #1, MUL VL]\n" + "addvl x23, x23, #2\n" + ".inst 0x808e0140 // fmopa za0.s, p0/M, p0/M, z10.s, z14.s\n" + ".inst 0x808f0141 // fmopa za1.s, p0/M, p0/M, z10.s, z15.s\n" + ".inst 0x808e0162 // fmopa za2.s, p0/M, p0/M, z11.s, z14.s\n" + ".inst 0x808f0163 // fmopa za3.s, p0/M, p0/M, z11.s, z15.s\n" + "bgt 9b\n" + "10:" // K oddments: End + "tbz x16, #1, 14f\n" + "tbz x16, #0, 12f\n" + "mov x12, #0x0\n" + "cntw x20\n" + "11:" // Store to partial result buffer: Store and refill: Loop + ".inst 0xa540a5e0 // ld1w z0.s, p1/Z, [x15]\n" + ".inst 0xa541a5e1 // ld1w z1.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5e2 // ld1w z2.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5e3 // ld1w z3.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xc0820414 // mova z20.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820435 // mova z21.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820456 // mova z22.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820477 // mova z23.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc082049c // mova z28.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204bd // mova z29.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204de // mova z30.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204ff // mova z31.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0xa540a5e4 // ld1w z4.s, p1/Z, [x15]\n" + ".inst 0xa541a5e5 // ld1w z5.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5e6 // ld1w z6.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5e7 // ld1w z7.s, p1/Z, 
[x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xc0820508 // mova z8.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc0820529 // mova z9.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc082054a // mova z10.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc082056b // mova z11.s, p1/M, za2h.s[w12, 3]\n" + ".inst 0xc082058c // mova z12.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205ad // mova z13.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205ce // mova z14.s, p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205ef // mova z15.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0xa540a5f0 // ld1w z16.s, p1/Z, [x15]\n" + ".inst 0xa541a5f1 // ld1w z17.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5f2 // ld1w z18.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5f3 // ld1w z19.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xa540a5f8 // ld1w z24.s, p1/Z, [x15]\n" + ".inst 0xa541a5f9 // ld1w z25.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5fa // ld1w z26.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5fb // ld1w z27.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xc0800400 // mova za0h.s[w12, 0], p1/M, z0.s\n" + ".inst 0xc0800421 // mova za0h.s[w12, 1], p1/M, z1.s\n" + ".inst 0xc0800442 // mova za0h.s[w12, 2], p1/M, z2.s\n" + ".inst 0xc0800463 // mova za0h.s[w12, 3], p1/M, z3.s\n" + ".inst 0xc0800484 // mova za1h.s[w12, 0], p1/M, z4.s\n" + ".inst 0xc08004a5 // mova za1h.s[w12, 1], p1/M, z5.s\n" + ".inst 0xc08004c6 // mova za1h.s[w12, 2], p1/M, z6.s\n" + ".inst 0xc08004e7 // mova za1h.s[w12, 3], p1/M, z7.s\n" + ".inst 0xe540e5d4 // st1w z20.s, p1, [x14]\n" + ".inst 0xe541e5d5 // st1w z21.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5d6 // st1w z22.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5d7 // st1w z23.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0xc0800608 // mova za2h.s[w12, 0], p1/M, z16.s\n" + ".inst 0xc0800629 // mova za2h.s[w12, 1], p1/M, z17.s\n" + ".inst 0xc080064a // mova za2h.s[w12, 2], 
p1/M, z18.s\n" + ".inst 0xc080066b // mova za2h.s[w12, 3], p1/M, z19.s\n" + ".inst 0xe540e5dc // st1w z28.s, p1, [x14]\n" + ".inst 0xe541e5dd // st1w z29.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5de // st1w z30.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5df // st1w z31.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0xc080070c // mova za3h.s[w12, 0], p1/M, z24.s\n" + ".inst 0xc080072d // mova za3h.s[w12, 1], p1/M, z25.s\n" + ".inst 0xc080074e // mova za3h.s[w12, 2], p1/M, z26.s\n" + ".inst 0xc080076f // mova za3h.s[w12, 3], p1/M, z27.s\n" + ".inst 0x9100118c // add x12, x12, #0x4\n" + ".inst 0xe540e5c8 // st1w z8.s, p1, [x14]\n" + ".inst 0xe541e5c9 // st1w z9.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5ca // st1w z10.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5cb // st1w z11.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0xeb14019f // cmp x12, x20\n" + ".inst 0xe540e5cc // st1w z12.s, p1, [x14]\n" + ".inst 0xe541e5cd // st1w z13.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5ce // st1w z14.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5cf // st1w z15.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + "blt 11b\n" + "b 30f\n" + "12:" // Store to partial result buffer: Store only + "mov x12, #0x0\n" + "cntw x20\n" + "13:" // Store to partial result buffer: Store only: Loop + ".inst 0xc0820400 // mova z0.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820421 // mova z1.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820442 // mova z2.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820463 // mova z3.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc0820490 // mova z16.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204b1 // mova z17.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204d2 // mova z18.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204f3 // mova z19.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0xc082051c // mova z28.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc082053d // mova z29.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc082055e // mova 
z30.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc082057f // mova z31.s, p1/M, za2h.s[w12, 3]\n" + ".inst 0xc0820594 // mova z20.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205b5 // mova z21.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205d6 // mova z22.s, p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205f7 // mova z23.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0xe540e5c0 // st1w z0.s, p1, [x14]\n" + ".inst 0xe541e5c1 // st1w z1.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5c2 // st1w z2.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5c3 // st1w z3.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0x9100118c // add x12, x12, #0x4\n" + ".inst 0xe540e5d0 // st1w z16.s, p1, [x14]\n" + ".inst 0xe541e5d1 // st1w z17.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5d2 // st1w z18.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5d3 // st1w z19.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0xeb14019f // cmp x12, x20\n" + ".inst 0xe540e5dc // st1w z28.s, p1, [x14]\n" + ".inst 0xe541e5dd // st1w z29.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5de // st1w z30.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5df // st1w z31.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + ".inst 0xe540e5d4 // st1w z20.s, p1, [x14]\n" + ".inst 0xe541e5d5 // st1w z21.s, p1, [x14, #1, MUL VL]\n" + ".inst 0xe542e5d6 // st1w z22.s, p1, [x14, #2, MUL VL]\n" + ".inst 0xe543e5d7 // st1w z23.s, p1, [x14, #3, MUL VL]\n" + ".inst 0x042e508e // addvl x14, x14, #4\n" + "blt 13b\n" + "b 30f\n" + "14:" // Store to output array + "ldr x26, [%x[args], %[offsetof_C]]\n" + "sub x25, x13, x11\n" + "ldr x24, [%x[args], %[offsetof_ldcb]]\n" + "add x26, x26, x10, LSL #2\n" // C += n + "madd x26, x11, x24, x26\n" // C += m * ldc + "tbz x16, #2, 21f\n" + "cntw x23\n" + "mov x12, #0x0\n" + "cmp x25, x23\n" + "csel x22, x25, x23, LT\n" + "lsr x21, x22, #0x2\n" + "and x20, x22, #0x3\n" + "cbz x21, 16f\n" + "15:" // Store to output array: Skip activation: Accumulator row 0 
loop + ".inst 0xc0820404 // mova z4.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820425 // mova z5.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820446 // mova z6.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820467 // mova z7.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc082048c // mova z12.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204ad // mova z13.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204ce // mova z14.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204ef // mova z15.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0xe540eb44 // st1w z4.s, p2, [x26]\n" + ".inst 0xe541ef4c // st1w z12.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0x9100118c // add x12, x12, #0x4\n" + ".inst 0xe540eb45 // st1w z5.s, p2, [x26]\n" + ".inst 0xe541ef4d // st1w z13.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xeb15099f // cmp x12, x21, LSL #2\n" + ".inst 0xe540eb46 // st1w z6.s, p2, [x26]\n" + ".inst 0xe541ef4e // st1w z14.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb47 // st1w z7.s, p2, [x26]\n" + ".inst 0xe541ef4f // st1w z15.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "blt 15b\n" + "16:" // Store to output array: Skip activation: Accumulator row 0 oddments + "cbz x20, 17f\n" + "subs x20, x20, #0x1\n" + ".inst 0xc0820400 // mova z0.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820421 // mova z1.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820442 // mova z2.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820463 // mova z3.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc0820488 // mova z8.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204a9 // mova z9.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204ca // mova z10.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204eb // mova z11.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0xe540eb40 // st1w z0.s, p2, [x26]\n" + ".inst 0xe541ef48 // st1w z8.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 17f\n" + "subs x20, x20, #0x1\n" + ".inst 0xe540eb41 // st1w z1.s, p2, [x26]\n" + ".inst 0xe541ef49 // st1w 
z9.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 17f\n" + ".inst 0xe540eb42 // st1w z2.s, p2, [x26]\n" + ".inst 0xe541ef4a // st1w z10.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End + "subs x25, x25, x22\n" + "beq 21f\n" + "cmp x25, x23\n" + "mov x12, #0x0\n" + "csel x22, x25, x23, LT\n" + "lsr x21, x22, #0x2\n" + "and x20, x22, #0x3\n" + "cbz x21, 19f\n" + "18:" // Store to output array: Skip activation: Accumulator row 1 loop + ".inst 0xc0820504 // mova z4.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc0820525 // mova z5.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc0820546 // mova z6.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc0820567 // mova z7.s, p1/M, za2h.s[w12, 3]\n" + ".inst 0xc082058c // mova z12.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205ad // mova z13.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205ce // mova z14.s, p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205ef // mova z15.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0xe540eb44 // st1w z4.s, p2, [x26]\n" + ".inst 0xe541ef4c // st1w z12.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "add x12, x12, #0x4\n" + ".inst 0xe540eb45 // st1w z5.s, p2, [x26]\n" + ".inst 0xe541ef4d // st1w z13.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xeb15099f // cmp x12, x21, LSL #2\n" + ".inst 0xe540eb46 // st1w z6.s, p2, [x26]\n" + ".inst 0xe541ef4e // st1w z14.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb47 // st1w z7.s, p2, [x26]\n" + ".inst 0xe541ef4f // st1w z15.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "blt 18b\n" + "19:" // Store to output array: Skip activation: Accumulator row 1 oddments + "cbz x20, 20f\n" + "subs x20, x20, #0x1\n" + ".inst 0xc0820504 // mova z4.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc0820525 // mova z5.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc0820546 // mova z6.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc0820567 // mova z7.s, p1/M, za2h.s[w12, 
3]\n" + ".inst 0xc082058c // mova z12.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205ad // mova z13.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205ce // mova z14.s, p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205ef // mova z15.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0xe540eb44 // st1w z4.s, p2, [x26]\n" + ".inst 0xe541ef4c // st1w z12.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 20f\n" + "subs x20, x20, #0x1\n" + ".inst 0xe540eb45 // st1w z5.s, p2, [x26]\n" + ".inst 0xe541ef4d // st1w z13.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 20f\n" + ".inst 0xe540eb46 // st1w z6.s, p2, [x26]\n" + ".inst 0xe541ef4e // st1w z14.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End + "subs x25, x25, x22\n" + "beq 21f\n" + "b 28f\n" + "21:" // Store to output array: Skip activation: End + "cntw x23\n" + "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n" + "mov x12, #0x0\n" + "cmp x25, x23\n" + "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n" + "csel x22, x25, x23, LT\n" + "lsr x21, x22, #0x2\n" + "and x20, x22, #0x3\n" + "cbz x21, 23f\n" + "22:" // Store to output array: Accumulator row 0 loop + ".inst 0xc0820410 // mova z16.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820431 // mova z17.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820452 // mova z18.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820473 // mova z19.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc0820498 // mova z24.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204b9 // mova z25.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204da // mova z26.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204fb // mova z27.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0x65878690 // fmin z16.s, p1/M, z16.s, z20.s\n" + ".inst 0x65878691 // fmin z17.s, p1/M, z17.s, z20.s\n" + ".inst 0x65878692 // fmin z18.s, p1/M, z18.s, z20.s\n" + ".inst 0x65878693 // fmin z19.s, p1/M, z19.s, z20.s\n" + ".inst 0x658686b0 // fmax z16.s, p1/M, z16.s, z21.s\n" + ".inst 0x658686b1 
// fmax z17.s, p1/M, z17.s, z21.s\n" + ".inst 0x658686b2 // fmax z18.s, p1/M, z18.s, z21.s\n" + ".inst 0x658686b3 // fmax z19.s, p1/M, z19.s, z21.s\n" + ".inst 0x65878698 // fmin z24.s, p1/M, z24.s, z20.s\n" + ".inst 0x65878699 // fmin z25.s, p1/M, z25.s, z20.s\n" + ".inst 0x6587869a // fmin z26.s, p1/M, z26.s, z20.s\n" + ".inst 0x6587869b // fmin z27.s, p1/M, z27.s, z20.s\n" + ".inst 0x658686b8 // fmax z24.s, p1/M, z24.s, z21.s\n" + ".inst 0x658686b9 // fmax z25.s, p1/M, z25.s, z21.s\n" + ".inst 0x658686ba // fmax z26.s, p1/M, z26.s, z21.s\n" + ".inst 0x658686bb // fmax z27.s, p1/M, z27.s, z21.s\n" + ".inst 0x9100118c // add x12, x12, #0x4\n" + ".inst 0xeb15099f // cmp x12, x21, LSL #2\n" + ".inst 0xe540eb50 // st1w z16.s, p2, [x26]\n" + ".inst 0xe541ef58 // st1w z24.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb51 // st1w z17.s, p2, [x26]\n" + ".inst 0xe541ef59 // st1w z25.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + "st1w z18.s, p2, [x26]\n" + "st1w z26.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "st1w z19.s, p2, [x26]\n" + "st1w z27.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "blt 22b\n" + "23:" // Store to output array: Accumulator row 0 oddments + "cbz x20, 24f\n" + ".inst 0xc0820410 // mova z16.s, p1/M, za0h.s[w12, 0]\n" + ".inst 0xc0820431 // mova z17.s, p1/M, za0h.s[w12, 1]\n" + ".inst 0xc0820452 // mova z18.s, p1/M, za0h.s[w12, 2]\n" + ".inst 0xc0820473 // mova z19.s, p1/M, za0h.s[w12, 3]\n" + ".inst 0xc0820498 // mova z24.s, p1/M, za1h.s[w12, 0]\n" + ".inst 0xc08204b9 // mova z25.s, p1/M, za1h.s[w12, 1]\n" + ".inst 0xc08204da // mova z26.s, p1/M, za1h.s[w12, 2]\n" + ".inst 0xc08204fb // mova z27.s, p1/M, za1h.s[w12, 3]\n" + ".inst 0xf1000694 // subs x20, x20, #0x1\n" + ".inst 0x65878690 // fmin z16.s, p1/M, z16.s, z20.s\n" + ".inst 0x65878691 // fmin z17.s, p1/M, z17.s, z20.s\n" + ".inst 0x65878692 // fmin z18.s, p1/M, z18.s, z20.s\n" + ".inst 0x65878693 // fmin 
z19.s, p1/M, z19.s, z20.s\n" + ".inst 0x658686b0 // fmax z16.s, p1/M, z16.s, z21.s\n" + ".inst 0x658686b1 // fmax z17.s, p1/M, z17.s, z21.s\n" + ".inst 0x658686b2 // fmax z18.s, p1/M, z18.s, z21.s\n" + ".inst 0x658686b3 // fmax z19.s, p1/M, z19.s, z21.s\n" + ".inst 0x65878698 // fmin z24.s, p1/M, z24.s, z20.s\n" + ".inst 0x65878699 // fmin z25.s, p1/M, z25.s, z20.s\n" + ".inst 0x6587869a // fmin z26.s, p1/M, z26.s, z20.s\n" + ".inst 0x6587869b // fmin z27.s, p1/M, z27.s, z20.s\n" + ".inst 0x658686b8 // fmax z24.s, p1/M, z24.s, z21.s\n" + ".inst 0x658686b9 // fmax z25.s, p1/M, z25.s, z21.s\n" + ".inst 0x658686ba // fmax z26.s, p1/M, z26.s, z21.s\n" + ".inst 0x658686bb // fmax z27.s, p1/M, z27.s, z21.s\n" + ".inst 0xe540eb50 // st1w z16.s, p2, [x26]\n" + ".inst 0xe541ef58 // st1w z24.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + "beq 24f\n" + "subs x20, x20, #0x1\n" + ".inst 0xe540eb51 // st1w z17.s, p2, [x26]\n" + ".inst 0xe541ef59 // st1w z25.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 24f\n" + ".inst 0xe540eb52 // st1w z18.s, p2, [x26]\n" + ".inst 0xe541ef5a // st1w z26.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "24:" // Store to output array: Accumulator row 0 oddments: End + "subs x25, x25, x22\n" + "beq 28f\n" + "cmp x25, x23\n" + "mov x12, #0x0\n" + "csel x20, x25, x23, LT\n" + "lsr x21, x20, #0x2\n" + "and x20, x20, #0x3\n" + "cbz x21, 26f\n" + "25:" // Store to output array: Accumulator row 1 loop + ".inst 0xc0820500 // mova z0.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc0820521 // mova z1.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc0820542 // mova z2.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc0820563 // mova z3.s, p1/M, za2h.s[w12, 3]\n" + ".inst 0xc0820588 // mova z8.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205a9 // mova z9.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205ca // mova z10.s, p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205eb // mova z11.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0x65878680 // fmin z0.s, p1/M, z0.s, 
z20.s\n" + ".inst 0x65878681 // fmin z1.s, p1/M, z1.s, z20.s\n" + ".inst 0x65878682 // fmin z2.s, p1/M, z2.s, z20.s\n" + ".inst 0x65878683 // fmin z3.s, p1/M, z3.s, z20.s\n" + ".inst 0x658686a0 // fmax z0.s, p1/M, z0.s, z21.s\n" + ".inst 0x658686a1 // fmax z1.s, p1/M, z1.s, z21.s\n" + ".inst 0x658686a2 // fmax z2.s, p1/M, z2.s, z21.s\n" + ".inst 0x658686a3 // fmax z3.s, p1/M, z3.s, z21.s\n" + ".inst 0x65878688 // fmin z8.s, p1/M, z8.s, z20.s\n" + ".inst 0x65878689 // fmin z9.s, p1/M, z9.s, z20.s\n" + ".inst 0x6587868a // fmin z10.s, p1/M, z10.s, z20.s\n" + ".inst 0x6587868b // fmin z11.s, p1/M, z11.s, z20.s\n" + ".inst 0x658686a8 // fmax z8.s, p1/M, z8.s, z21.s\n" + ".inst 0x658686a9 // fmax z9.s, p1/M, z9.s, z21.s\n" + ".inst 0x658686aa // fmax z10.s, p1/M, z10.s, z21.s\n" + ".inst 0x658686ab // fmax z11.s, p1/M, z11.s, z21.s\n" + ".inst 0x9100118c // add x12, x12, #0x4\n" + ".inst 0xeb15099f // cmp x12, x21, LSL #2\n" + ".inst 0xe540eb40 // st1w z0.s, p2, [x26]\n" + ".inst 0xe541ef48 // st1w z8.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb41 // st1w z1.s, p2, [x26]\n" + ".inst 0xe541ef49 // st1w z9.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb42 // st1w z2.s, p2, [x26]\n" + ".inst 0xe541ef4a // st1w z10.s, p3, [x26, #1, MUL VL]\n" + ".inst 0x8b18035a // add x26, x26, x24\n" + ".inst 0xe540eb43 // st1w z3.s, p2, [x26]\n" + ".inst 0xe541ef4b // st1w z11.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "blt 25b\n" + "26:" // Store to output array: Accumulator row 1 oddments + "cbz x20, 27f\n" + ".inst 0xc0820510 // mova z16.s, p1/M, za2h.s[w12, 0]\n" + ".inst 0xc0820531 // mova z17.s, p1/M, za2h.s[w12, 1]\n" + ".inst 0xc0820552 // mova z18.s, p1/M, za2h.s[w12, 2]\n" + ".inst 0xc0820573 // mova z19.s, p1/M, za2h.s[w12, 3]\n" + ".inst 0xc0820598 // mova z24.s, p1/M, za3h.s[w12, 0]\n" + ".inst 0xc08205b9 // mova z25.s, p1/M, za3h.s[w12, 1]\n" + ".inst 0xc08205da // mova z26.s, 
p1/M, za3h.s[w12, 2]\n" + ".inst 0xc08205fb // mova z27.s, p1/M, za3h.s[w12, 3]\n" + ".inst 0xf1000694 // subs x20, x20, #0x1\n" + ".inst 0x65878690 // fmin z16.s, p1/M, z16.s, z20.s\n" + ".inst 0x65878691 // fmin z17.s, p1/M, z17.s, z20.s\n" + ".inst 0x65878692 // fmin z18.s, p1/M, z18.s, z20.s\n" + ".inst 0x65878693 // fmin z19.s, p1/M, z19.s, z20.s\n" + ".inst 0x658686b0 // fmax z16.s, p1/M, z16.s, z21.s\n" + ".inst 0x658686b1 // fmax z17.s, p1/M, z17.s, z21.s\n" + ".inst 0x658686b2 // fmax z18.s, p1/M, z18.s, z21.s\n" + ".inst 0x658686b3 // fmax z19.s, p1/M, z19.s, z21.s\n" + ".inst 0x65878698 // fmin z24.s, p1/M, z24.s, z20.s\n" + ".inst 0x65878699 // fmin z25.s, p1/M, z25.s, z20.s\n" + ".inst 0x6587869a // fmin z26.s, p1/M, z26.s, z20.s\n" + ".inst 0x6587869b // fmin z27.s, p1/M, z27.s, z20.s\n" + ".inst 0x658686b8 // fmax z24.s, p1/M, z24.s, z21.s\n" + ".inst 0x658686b9 // fmax z25.s, p1/M, z25.s, z21.s\n" + ".inst 0x658686ba // fmax z26.s, p1/M, z26.s, z21.s\n" + ".inst 0x658686bb // fmax z27.s, p1/M, z27.s, z21.s\n" + ".inst 0xe540eb50 // st1w z16.s, p2, [x26]\n" + ".inst 0xe541ef58 // st1w z24.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 27f\n" + "subs x20, x20, #0x1\n" + ".inst 0xe540eb51 // st1w z17.s, p2, [x26]\n" + ".inst 0xe541ef59 // st1w z25.s, p3, [x26, #1, MUL VL]\n" + "add x26, x26, x24\n" + "beq 27f\n" + ".inst 0xe540eb52 // st1w z18.s, p2, [x26]\n" + ".inst 0xe541ef5a // st1w z26.s, p3, [x26, #1, MUL VL]\n" + "27:" // Store to output array: Accumulator row 1 oddments: End + "28:" // Store to output array: End + "tbz x16, #0, 30f\n" + "mov x12, #0x0\n" + "cntw x20\n" + "29:" // Store to output array: Refill accumulators: Loop + ".inst 0xa540a5e8 // ld1w z8.s, p1/Z, [x15]\n" + ".inst 0xa541a5e9 // ld1w z9.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5ea // ld1w z10.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5eb // ld1w z11.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xa540a5e0 // ld1w 
z0.s, p1/Z, [x15]\n" + ".inst 0xa541a5e1 // ld1w z1.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5e2 // ld1w z2.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5e3 // ld1w z3.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xa540a5e4 // ld1w z4.s, p1/Z, [x15]\n" + ".inst 0xa541a5e5 // ld1w z5.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5e6 // ld1w z6.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5e7 // ld1w z7.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xa540a5ec // ld1w z12.s, p1/Z, [x15]\n" + ".inst 0xa541a5ed // ld1w z13.s, p1/Z, [x15, #1, MUL VL]\n" + ".inst 0xa542a5ee // ld1w z14.s, p1/Z, [x15, #2, MUL VL]\n" + ".inst 0xa543a5ef // ld1w z15.s, p1/Z, [x15, #3, MUL VL]\n" + ".inst 0x042f508f // addvl x15, x15, #4\n" + ".inst 0xc0800500 // mova za0h.s[w12, 0], p1/M, z8.s\n" + ".inst 0xc0800521 // mova za0h.s[w12, 1], p1/M, z9.s\n" + ".inst 0xc0800542 // mova za0h.s[w12, 2], p1/M, z10.s\n" + ".inst 0xc0800563 // mova za0h.s[w12, 3], p1/M, z11.s\n" + ".inst 0xc0800404 // mova za1h.s[w12, 0], p1/M, z0.s\n" + ".inst 0xc0800425 // mova za1h.s[w12, 1], p1/M, z1.s\n" + ".inst 0xc0800446 // mova za1h.s[w12, 2], p1/M, z2.s\n" + ".inst 0xc0800467 // mova za1h.s[w12, 3], p1/M, z3.s\n" + ".inst 0xc0800488 // mova za2h.s[w12, 0], p1/M, z4.s\n" + ".inst 0xc08004a9 // mova za2h.s[w12, 1], p1/M, z5.s\n" + ".inst 0xc08004ca // mova za2h.s[w12, 2], p1/M, z6.s\n" + ".inst 0xc08004eb // mova za2h.s[w12, 3], p1/M, z7.s\n" + ".inst 0xc080058c // mova za3h.s[w12, 0], p1/M, z12.s\n" + ".inst 0xc08005ad // mova za3h.s[w12, 1], p1/M, z13.s\n" + ".inst 0xc08005ce // mova za3h.s[w12, 2], p1/M, z14.s\n" + ".inst 0xc08005ef // mova za3h.s[w12, 3], p1/M, z15.s\n" + "add x12, x12, #0x4\n" + "cmp x12, x20\n" + "blt 29b\n" + "30:" // End block + "incw x10, ALL, MUL #2\n" + "cmp x10, x9\n" + "blt 3b\n" + "incw x11, ALL, MUL #2\n" + "mov x10, #0x0\n" + "cmp x11, x13\n" + "mov x28, x27\n" + "blt 3b\n" + ".inst 0xd503467f 
// SMSTOP\n" + : + : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm + +#endif // ARM_COMPUTE_ENABLE_SME