From b15ba6ba54f5d2b79d626b6c73624b8e3d7819d7 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 8 Sep 2025 11:53:55 -0400 Subject: [PATCH 1/6] Added optimized ppc64le support functions for ML-KEM. The supported native functions include: 1. MLK_USE_NATIVE_NTT (ntt_ppc.S) 2. MLK_USE_NATIVE_INTT (intt_ppc.S) 3. MLK_USE_NATIVE_POLY_REDUCE (reduce.S) 4. MLK_USE_NATIVE_POLY_TOMONT (poly_tomont.S) And other interface functions and headers. Signed-off-by: Danny Tsen --- BIBLIOGRAPHY.md | 1 + dev/ppc64le/README.md | 6 + dev/ppc64le/meta.h | 53 ++ dev/ppc64le/src/arith_native_ppc64le.h | 24 + dev/ppc64le/src/consts.c | 155 ++++ dev/ppc64le/src/consts.h | 26 + dev/ppc64le/src/intt_ppc.S | 713 ++++++++++++++++++ dev/ppc64le/src/ntt_ppc.S | 515 +++++++++++++ dev/ppc64le/src/poly_tomont.S | 192 +++++ dev/ppc64le/src/reduce.S | 242 ++++++ integration/liboqs/ML-KEM-1024_META.yml | 19 + integration/liboqs/ML-KEM-512_META.yml | 19 + integration/liboqs/ML-KEM-768_META.yml | 19 + integration/liboqs/config_ppc64le.h | 266 +++++++ mlkem/mlkem_native.S | 27 + mlkem/mlkem_native.c | 27 + mlkem/src/native/meta.h | 4 + mlkem/src/native/ppc64le/README.md | 6 + mlkem/src/native/ppc64le/meta.h | 53 ++ .../native/ppc64le/src/arith_native_ppc64le.h | 24 + mlkem/src/native/ppc64le/src/consts.c | 155 ++++ mlkem/src/native/ppc64le/src/consts.h | 26 + mlkem/src/native/ppc64le/src/intt_ppc.S | 711 +++++++++++++++++ mlkem/src/native/ppc64le/src/ntt_ppc.S | 513 +++++++++++++ mlkem/src/native/ppc64le/src/poly_tomont.S | 190 +++++ mlkem/src/native/ppc64le/src/reduce.S | 240 ++++++ test/mk/components.mk | 1 + 27 files changed, 4227 insertions(+) create mode 100644 dev/ppc64le/README.md create mode 100644 dev/ppc64le/meta.h create mode 100644 dev/ppc64le/src/arith_native_ppc64le.h create mode 100644 dev/ppc64le/src/consts.c create mode 100644 dev/ppc64le/src/consts.h create mode 100644 dev/ppc64le/src/intt_ppc.S create mode 100644 dev/ppc64le/src/ntt_ppc.S create mode 100644 dev/ppc64le/src/poly_tomont.S create mode 100644 dev/ppc64le/src/reduce.S create mode 100644 integration/liboqs/config_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/README.md create mode 100644 mlkem/src/native/ppc64le/meta.h create mode 100644 mlkem/src/native/ppc64le/src/arith_native_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/src/consts.c create mode 100644 mlkem/src/native/ppc64le/src/consts.h create mode 100644 mlkem/src/native/ppc64le/src/intt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/ntt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/poly_tomont.S create mode 100644 mlkem/src/native/ppc64le/src/reduce.S diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index f10a15f6ec..d75d368ef1 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -29,6 +29,7 @@ source code and documentation. 
- [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..34f8cbec66 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..1c75346689 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..4c2fbdf61a --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 
+1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 
961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset 
IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..d424601ac1 --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include 
"../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..4fc49edcd6 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,713 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, 
ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+12, 11, 10, 3 + xxpermdi 32+22, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+16, 11, 10, 3 + xxpermdi 32+23, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+20, 11, 10, 3 + xxpermdi 32+24, 11, 10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
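+  # Reference C sketch of the Barrett step completed below, where
+  # 20159 ~= 2^26 / MLKEM_Q (illustrative only, not the exact C source):
+  #   t = ((int32_t)20159 * a + (1 << 25)) >> 26;
+  #   a = a - t * MLKEM_Q;  /* centered representative of a mod q */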
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 +.endm + +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + 
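+  # v13/v18/v23/v28 now hold fqmul(zeta, r[j+len] - t); PermWriteL24 below
+  # merges them with the Barrett-reduced sums saved in vs10-vs13 above and
+  # stores the coefficients back in their interleaved (len = 2) layout.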
PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + # init vectors and constants + # Setup for Montgomery reduce + lxvx 0, 0, 4 + + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 8, 4 # loops + mtctr 8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + +.align 4 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 8, 4 # loops + mtctr 8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 + +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 + + INTT_REDUCE_4X 5, 64, 64 + + li 5, 16 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 256 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 272 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + +.align 4 + # + # 5. 
len = 32, start = 0, 64, 128, 192 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # 6. len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # 7. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..c8dba7b27e --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,515 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +.macro Load_4Coeffs start next step + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - 
rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+18, 4, 3, 3 + xxpermdi 32+17, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+23, 2, 1, 3 + xxpermdi 32+22, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+28, 4, 3, 3 + xxpermdi 32+27, 4, 3, 0 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + +.endm + +.macro Load_4Aj + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] +.endm + +.macro Compute_4Coeffs + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
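+  # Reference C sketch of the forward butterfly finished below, applied
+  # lane-wise to 8 coefficients at a time (illustrative only, not the
+  # exact C source):
+  #   int16_t t = fqmul(zeta, r[j + len]);
+  #   r[j + len] = r[j] - t;
+  #   r[j]       = r[j] + t;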
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + +.macro Write_One + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 +.endm + +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 +.endm + +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro Load_next_4zetas + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 + +.align 4 + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 3. 
len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + mr 5, 3 + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + li 8, 4 + mtctr 8 + mr 5, 3 + li 7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..72c6310f28 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,192 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 0000000000..b7c6235b9a --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,242 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + 
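+  # r4-r11 are set to the negative offsets -128 ... -16: each BREDUCE_4X
+  # advances r3 by 64, so after two calls Write_8X stores the reduced
+  # coefficients back into the 128 bytes just read.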
li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c6..9c7fe672ab 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3f..f46dbfdbf1 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478a..1b01c4d426 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 0000000000..2fa1cdbcf6 --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - 
MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance-critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md).
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to be provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature.
+ * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + * native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made to fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through a macro + * from libOQS.
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index bff0400796..6f2a8b221f 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -458,6 +458,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 74c1f93877..74903ed1da 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -445,6 +445,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b7..e391883231 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
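+## Reduction reference (illustrative)
+
+The assembly in `src/` vectorizes the scalar Barrett and Montgomery reductions
+used throughout mlkem-native (see the `BREDUCE_4X`/`MREDUCE_4X` macros and the
+-Q/QINV/20159 rows of `mlk_ppc_qdata` in `consts.c`). The following is a rough,
+non-normative C sketch of the per-coefficient operation these macros implement;
+the helper names (`mont_reduce_sketch`, `barrett_reduce_sketch`) are for
+exposition only and are not part of the backend or the build.
+
+```c
+#include <stdint.h>
+
+#define MLKEM_Q 3329
+#define QINV -3327 /* q^-1 mod 2^16, as stored in mlk_ppc_qdata */
+
+/* Montgomery reduction: for |a| < 2^15 * q, returns a value congruent to
+ * a * 2^-16 mod q with absolute value below q. Mirrors the
+ * "t = a * QINV; t = (a - (int32_t)t*_MLKEM_Q) >> 16" comment in ntt_ppc.S
+ * and poly_tomont.S. */
+static int16_t mont_reduce_sketch(int32_t a)
+{
+  int16_t t = (int16_t)a * (int16_t)QINV;              /* low 16 bits only */
+  return (int16_t)((a - (int32_t)t * MLKEM_Q) >> 16);
+}
+
+/* Barrett reduction with 20159 = round(2^26 / q): returns a centered
+ * representative congruent to a mod q. The vector code uses the same 2^25
+ * rounding addend and right shift by 26 (V_25/V_26). */
+static int16_t barrett_reduce_sketch(int16_t a)
+{
+  int16_t t = (int16_t)(((int32_t)20159 * a + (1 << 25)) >> 26);
+  return (int16_t)(a - t * MLKEM_Q);
+}
+```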
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..54b3ddd9c6 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..dbcee3e3ee --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..4c2fbdf61a --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 
1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 
1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, 
-853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..49f519d0c3 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..07663c4950 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,711 @@ +/* + * Copyright (c) The mlkem-native 
project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# 
rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+12, 11, 10, 3 + xxpermdi 32+22, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+16, 11, 10, 3 + xxpermdi 32+23, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+20, 11, 10, 3 + xxpermdi 32+24, 11, 10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 +.endm + +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + 
PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + # init vectors and constants + # Setup for Montgomery reduce + lxvx 0, 0, 4 + + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 + + # Setup for Barrett reduce + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 + + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 8, 4 # loops + mtctr 8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + +.align 4 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 8, 4 # loops + mtctr 8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 + +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 + + INTT_REDUCE_4X 5, 64, 64 + + li 5, 16 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 256 + INTT_REDUCE_4X 5, 64, 64 + + li 5, 272 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 + +.align 4 + # + # 5. 
len = 32, start = 0, 64, 128, 192 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # 6. len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # 7. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..dbe7c82fa5 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,513 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +.macro Load_4Coeffs start next step + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# 
+.macro Load_L44Coeffs + lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+18, 4, 3, 3 + xxpermdi 32+17, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+23, 2, 1, 3 + xxpermdi 32+22, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+28, 4, 3, 3 + xxpermdi 32+27, 4, 3, 0 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + +.endm + +.macro Load_4Aj + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] +.endm + +.macro Compute_4Coeffs + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + +.macro Write_One + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 +.endm + +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 +.endm + +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 +.endm + +.macro Load_next_4zetas + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + # get MLKEM_Q + lvx V_NMKQ,0,4 + + # zetas array + addi 14, 4, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 + +.align 4 + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 3. 
len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + mr 5, 3 + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 + + bdnz ntt_ppc__Len4 + + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + + li 8, 4 + mtctr 8 + mr 5, 3 + li 7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..765ef91763 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,190 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 0000000000..40c7a4cef5 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,240 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + 
li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/test/mk/components.mk b/test/mk/components.mk index cdcc3eb5d1..88158f7036 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,6 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif From 08dbe9cff384a0252c3805784a71d9740821b63a Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Thu, 11 Sep 2025 09:03:15 +0100 Subject: [PATCH 2/6] autogen: Prepare for rv64 and ppc64le backends This commit prepares scripts/autogen and scripts/cfify for the work-in-progress addition of riscv64 and ppc64le backends. Specifically, simpasm needs to be invoked with the right cross compiler for those architectures, and scripts/cfify needs to accept riscv64 and ppc64le architecture parameters. 
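For orientation, the toolchain selection added to scripts/autogen boils down to the following mapping. This is an illustrative Python sketch only (the helper name cross_gcc_for is made up here); the authoritative logic is in the hunk that follows.

    ARCH_TO_CROSS_PREFIX = {
        "aarch64": "aarch64-unknown-linux-gnu-",
        "x86_64": "x86_64-unknown-linux-gnu-",
        "ppc64le": "powerpc64le-unknown-linux-gnu-",
        "riscv64": "riscv64-unknown-linux-gnu-",
    }

    def cross_gcc_for(source_arch, native_arch):
        # Return the cross compiler simpasm should be driven with, or None
        # when the source architecture matches the host.
        if source_arch == native_arch:
            return None
        return ARCH_TO_CROSS_PREFIX[source_arch] + "gcc"

    # e.g. on an x86_64 host, a ppc64le backend file resolves to
    # "powerpc64le-unknown-linux-gnu-gcc"; scripts/cfify likewise now
    # accepts --arch=ppc64le and --arch=riscv64.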
Signed-off-by: Hanno Becker --- scripts/autogen | 112 ++++++++++++++++++++++++++++-------------------- scripts/cfify | 15 ++++++- scripts/simpasm | 10 ++++- 3 files changed, 88 insertions(+), 49 deletions(-) diff --git a/scripts/autogen b/scripts/autogen index 819f00def3..74c68b6507 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1431,6 +1431,10 @@ def gen_monolithic_source_file(dry_run=False): for c in filter(native_arith_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -1766,6 +1770,10 @@ def update_via_simpasm( source_arch = "aarch64" elif "x86_64" in infile_full: source_arch = "x86_64" + elif "ppc64le" in infile_full: + source_arch = "ppc64le" + elif "riscv64" in infile_full: + source_arch = "riscv64" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -1775,7 +1783,14 @@ def update_via_simpasm( native_arch = "x86_64" if native_arch != source_arch: - cross_prefix = f"{source_arch}-unknown-linux-gnu-" + arch_to_cross_prefix = { + "aarch64": "aarch64-unknown-linux-gnu-", + "x86_64": "x86_64-unknown-linux-gnu-", + "ppc64le": "powerpc64le-unknown-linux-gnu-", + "riscv64": "riscv64-unknown-linux-gnu-", + } + + cross_prefix = arch_to_cross_prefix[source_arch] cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: @@ -1788,13 +1803,12 @@ def update_via_simpasm( with tempfile.NamedTemporaryFile(suffix=".S") as tmp: try: # Determine architecture from filename - arch = "aarch64" if "aarch64" in infile_full else "x86_64" cmd = [ "./scripts/simpasm", "--objdump=llvm-objdump", "--cfify", - "--arch=" + arch, + "--arch=" + source_arch, "-i", infile_full, "-o", @@ -2058,49 +2072,55 @@ def synchronize_backends( ), ) - synchronize_backend( - f"dev/aarch64_{ty}/src", - "mlkem/src/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/native/aarch64/src", - ) - synchronize_backend( - "dev/fips202/aarch64/src", - "mlkem/src/fips202/native/aarch64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/fips202/aarch64", - "mlkem/src/fips202/native/aarch64", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - cflags="-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", - ) - synchronize_backend( - "dev/x86_64/src", - "mlkem/src/native/x86_64/src", - dry_run=dry_run, - delete=delete, - force_cross=force_cross, - no_simplify=no_simplify, - # Turn off control-flow protection (CET) explicitly. Newer versions of - # clang turn it on by default and insert endbr64 instructions at every - # global symbol. - # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL - # macro. - # This leads to duplicate endbr64 instructions causing a failure when - # comparing the object code before and after simplification. 
- cflags="-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", - ) + # Triples of + # - input backend directory under dev/ + # - output backend directory under mlkem/ + # - cflags + worklist = [ + ( + f"dev/aarch64_{ty}/src", + "mlkem/src/native/aarch64/src", + "-Imlkem/src/native/aarch64/src", + ), + ( + "dev/fips202/aarch64/src", + "mlkem/src/fips202/native/aarch64/src", + "-Imlkem/src/fips202/native/aarch64/src -march=armv8.4-a+sha3", + ), + ( + "dev/fips202/aarch64", + "mlkem/src/fips202/native/aarch64", + "-Imlkem/src/fips202/native/aarch64 -march=armv8.4-a+sha3", + ), + ( + "dev/x86_64/src", + "mlkem/src/native/x86_64/src", + # Turn off control-flow protection (CET) explicitly. Newer versions of + # clang turn it on by default and insert endbr64 instructions at every + # global symbol. + # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL + # macro. + # This leads to duplicate endbr64 instructions causing a failure when + # comparing the object code before and after simplification. + "-Imlkem/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", + ), + ( + "dev/ppc64le/src", + "mlkem/src/native/ppc64le/src", + "-Imlkem/src/native/ppc64le/src -mvsx", + ), + ] + + for in_dir, out_dir, cflags in worklist: + synchronize_backend( + in_dir, + out_dir, + dry_run=dry_run, + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags=cflags, + ) def adjust_header_guard_for_filename(content, header_file): diff --git a/scripts/cfify b/scripts/cfify index a08d237079..fca0381fdf 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -226,6 +226,19 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "riscv64": + # No special handling of riscv64 for now + pass + elif arch == "ppc64le": + # ppc64le: blr -> .cfi_endproc after blr + match = re.match(r"(\s*)blr\s*$", line, re.IGNORECASE) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -246,7 +259,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64"], + choices=["aarch64", "x86_64", "riscv64", "ppc64le"], default="aarch64", help="Target architecture (default: aarch64)", ) diff --git a/scripts/simpasm b/scripts/simpasm index 5afa6bd9ac..5a02221d66 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -246,7 +246,7 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug(f"Using raw global symbol {sym} going forward ...") cmd = [args.objdump, "--disassemble", tmp_objfile0] - if platform.system() == "Darwin": + if platform.system() == "Darwin" and args.arch == "aarch64": cmd += ["--triple=aarch64"] logger.debug(f"Disassembling temporary object file {tmp_objfile0} ...") @@ -255,6 +255,12 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Patching up disassembly ...") simplified = patchup_disasm(disasm, cfify=args.cfify) + # On ppc64le we're using 16 byte alignment + if args.arch == "ppc64le": + align = 16 + else: + align = 4 + autogen_header = [ "", "/*", @@ -264,7 +270,7 @@ def simplify(logger, args, asm_input, asm_output=None): "", "", ".text", - ".balign 4", + f".balign {align}", ] if args.preserve_preprocessor_directives is False: From dfe3983174a7ced91643146212850d120774e70c Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sat, 20 Sep 2025 05:17:22 +0100 Subject: [PATCH 3/6] ppc64le: `untabify` assembly Signed-off-by: Hanno Becker --- dev/ppc64le/src/intt_ppc.S | 860 ++++++++++----------- 
dev/ppc64le/src/ntt_ppc.S | 656 ++++++++-------- dev/ppc64le/src/poly_tomont.S | 254 +++--- dev/ppc64le/src/reduce.S | 338 ++++---- mlkem/src/native/ppc64le/src/intt_ppc.S | 860 ++++++++++----------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 656 ++++++++-------- mlkem/src/native/ppc64le/src/poly_tomont.S | 254 +++--- mlkem/src/native/ppc64le/src/reduce.S | 338 ++++---- 8 files changed, 2108 insertions(+), 2108 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 4fc49edcd6..51cae8e621 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -37,111 +37,111 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm .macro Compute_4Coeffs - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t .endm .macro Load_4Coeffs start next step - mr 9, 
\start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - - Compute_4Coeffs + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + + Compute_4Coeffs .endm # @@ -202,99 +202,99 @@ .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
+ vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm #----------------------------------- # MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 .endm .macro Load_next_4zetas @@ -316,53 +316,53 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - 
xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 stxvd2x 32+10, 0, 5 stxvd2x 32+11, 10, 5 stxvd2x 32+12, 11, 5 @@ -374,10 +374,10 @@ .endm .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 vmrgew 10, 13, 14 vmrgow 11, 13, 14 vmrgew 12, 18, 19 @@ -397,42 +397,42 @@ .endm .macro INTT_REDUCE_L24 - Load_L24Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 .endm .macro INTT_REDUCE_L44 - Load_L44Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm .macro INTT_REDUCE_4X start next step - Load_4Coeffs \start, \next, \step - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -448,34 +448,34 @@ .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - SAVE_REGS + SAVE_REGS - # init vectors and constants - # Setup for Montgomery reduce - lxvx 0, 0, 4 + # init vectors and constants + # Setup for Montgomery reduce + lxvx 0, 0, 4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 - # Setup for Barrett reduce - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 # V_MKQ - lxvx 32+V20159, 11, 4 # V20159 + # Setup for Barrett reduce + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + vspltisw 9, 1 + vsubuwm 10, 
8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 li 10, 16 li 11, 32 @@ -505,193 +505,193 @@ intt_ppc__Loopf: addi 3, 3, -512 .align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 - li 8, 4 - mtctr 8 - mr 5, 3 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 intt_ppc__Loop2: - INTT_REDUCE_L24 - addi 5, 5, 128 - bdnz intt_ppc__Loop2 + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 .align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - mr 5, 3 - li 7, 8 - li 8, 4 # loops - mtctr 8 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 8, 4 # loops + mtctr 8 intt_ppc__Loop4: - INTT_REDUCE_L44 - addi 5, 5, 128 - bdnz intt_ppc__Loop4 + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 .align 4 - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - li 7, 16 - li 5, 0 - li 15, 4 # loops - mtctr 15 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 intt_ppc__Loop8: - INTT_REDUCE_4X 5, 32, 32 - addi 5, 5, 128 - bdnz intt_ppc__Loop8 + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - li 5, 0 - li 7, 32 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 5, 64, 64 li 5, 16 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. 
len = 32, start = 0, 64, 128, 192 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # 6. len = 64, start = 0, 128 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 6. 
len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # 7. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - - RESTORE_REGS - blr + # 7. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index c8dba7b27e..0a7a3eed58 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -19,103 +19,103 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm .macro Load_4Coeffs start next step - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] .endm # @@ -184,55 +184,55 @@ # MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - 
vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 .endm .macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] .endm .macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t .endm .macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 @@ -243,34 +243,34 @@ .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm .macro PermWriteL44 - Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm .macro PermWriteL24 @@ -283,32 +283,32 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm .macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm # @@ -318,191 +318,191 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - SAVE_REGS + SAVE_REGS - # get MLKEM_Q - lvx V_NMKQ,0,4 + # get MLKEM_Q + lvx V_NMKQ,0,4 - # zetas array - addi 14, 4, ZETA_NTT_OFFSET + # zetas array + addi 14, 4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. 
len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One .align 4 - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 mr 5, 3 - li 7, 8 + li 7, 8 - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas + Load_next_4zetas + Perm_4zetas Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas - li 8, 4 - mtctr 8 - mr 5, 3 - li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas + Load_next_4zetas Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 - bdnz ntt_ppc__Len2 + bdnz ntt_ppc__Len2 - RESTORE_REGS - blr + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 72c6310f28..b1f7f8c725 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -39,141 +39,141 @@ # MREDUCE_4X(_v0, _v1, _v2, _v3) # .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 
+ + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index b7c6235b9a..45fef03f2a 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -34,66 +34,66 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 
9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm # @@ -104,124 +104,124 @@ lxvd2x 32+13, 14, 3 lxvd2x 32+14, 15, 3 lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + 
BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 07663c4950..5d7aa86e38 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -36,111 +36,111 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + 
ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm .macro Compute_4Coeffs - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t .endm .macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - - Compute_4Coeffs + mr 9, \start # j + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + + Compute_4Coeffs .endm # @@ -201,99 +201,99 @@ .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. 
+ vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm #----------------------------------- # MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 .endm .macro Load_next_4zetas @@ -315,53 +315,53 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 
3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 stxvd2x 32+10, 0, 5 stxvd2x 32+11, 10, 5 stxvd2x 32+12, 11, 5 @@ -373,10 +373,10 @@ .endm .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 vmrgew 10, 13, 14 vmrgow 11, 13, 14 vmrgew 12, 18, 19 @@ -396,42 +396,42 @@ .endm .macro INTT_REDUCE_L24 - Load_L24Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 .endm .macro INTT_REDUCE_L44 - Load_L44Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm .macro INTT_REDUCE_4X start next step - Load_4Coeffs \start, \next, \step - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -447,34 +447,34 @@ .align 4 
MLK_ASM_FN_SYMBOL(intt_ppc) - SAVE_REGS + SAVE_REGS - # init vectors and constants - # Setup for Montgomery reduce - lxvx 0, 0, 4 + # init vectors and constants + # Setup for Montgomery reduce + lxvx 0, 0, 4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 - # Setup for Barrett reduce - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 # V_MKQ - lxvx 32+V20159, 11, 4 # V20159 + # Setup for Barrett reduce + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 li 10, 16 li 11, 32 @@ -504,193 +504,193 @@ intt_ppc__Loopf: addi 3, 3, -512 .align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 - li 8, 4 - mtctr 8 - mr 5, 3 + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 intt_ppc__Loop2: - INTT_REDUCE_L24 - addi 5, 5, 128 - bdnz intt_ppc__Loop2 + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 .align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - mr 5, 3 - li 7, 8 - li 8, 4 # loops - mtctr 8 + # + # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + mr 5, 3 + li 7, 8 + li 8, 4 # loops + mtctr 8 intt_ppc__Loop4: - INTT_REDUCE_L44 - addi 5, 5, 128 - bdnz intt_ppc__Loop4 + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 .align 4 - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - li 7, 16 - li 5, 0 - li 15, 4 # loops - mtctr 15 + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + li 7, 16 + li 5, 0 + li 15, 4 # loops + mtctr 15 intt_ppc__Loop8: - INTT_REDUCE_4X 5, 32, 32 - addi 5, 5, 128 - bdnz intt_ppc__Loop8 + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - li 5, 0 - li 7, 32 + # + # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + li 5, 0 + li 7, 32 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 5, 64, 64 li 5, 16 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. 
len = 32, start = 0, 64, 128, 192 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + li 5, 0 + li 7, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # 6. len = 64, start = 0, 128 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + # + # 6. 
len = 64, start = 0, 128 + li 5, 0 + li 7, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # 7. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - - RESTORE_REGS - blr + # 7. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
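For orientation, the BREDUCE_4X macro used throughout intt_ppc.S (and reduce.S) vectorizes a standard Barrett reduction. The following scalar C sketch is illustrative only and assumes reference-style names (barrett_reduce, MLKEM_Q); the constants mirror the table in consts.c: 20159 == (2^26 + q/2)/q (C20159_OFFSET), the 2^25 rounding term (V_25) and the arithmetic shift by 26 (V_26).

  #define MLKEM_Q 3329
  /* Illustrative scalar equivalent of BREDUCE_4X: returns a small
   * representative of a mod MLKEM_Q. */
  static int16_t barrett_reduce(int16_t a)
  {
    const int16_t v = (int16_t)(((1 << 26) + MLKEM_Q / 2) / MLKEM_Q); /* 20159 */
    int16_t t = (int16_t)(((int32_t)v * a + (1 << 25)) >> 26);        /* round(a/q) */
    return (int16_t)(a - t * MLKEM_Q);                                /* a - round(a/q)*q */
  }

The vector code forms the same quotient with vmulesh/vmulosh plus the V_25 add and V_26 shift, negates it (vsubuhm from zero) and folds it back in with vmladduhm against V_MKQ; in reduce_ppc the result is then mapped to the unsigned canonical range [0, q) by the To_unsigned_16 pass, which conditionally adds q to negative values via the vsrh/vcmpequh/xxsel sequence.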
*/ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index dbe7c82fa5..3d65856ad9 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -18,103 +18,103 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm .macro Load_4Coeffs start next step - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] + mr 9, \start + add 10, 7, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] .endm # @@ -183,55 +183,55 @@ # MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - 
vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 + # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 .endm .macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] .endm .macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. 
+ # Finally to complete the final update of the results with add/sub + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t .endm .macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 @@ -242,34 +242,34 @@ .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm .macro PermWriteL44 - Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm .macro PermWriteL24 @@ -282,32 +282,32 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm .macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm # @@ -317,191 +317,191 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - SAVE_REGS + SAVE_REGS - # get MLKEM_Q - lvx V_NMKQ,0,4 + # get MLKEM_Q + lvx V_NMKQ,0,4 - # zetas array - addi 14, 4, ZETA_NTT_OFFSET + # zetas array + addi 14, 4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. 
len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 7, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One .align 4 - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 mr 5, 3 - li 7, 8 + li 7, 8 - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas + Load_next_4zetas + Perm_4zetas Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + # + # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas - li 8, 4 - mtctr 8 - mr 5, 3 - li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas + Load_next_4zetas Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 - bdnz ntt_ppc__Len2 + bdnz ntt_ppc__Len2 - RESTORE_REGS - blr + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
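For orientation, the MREDUCE_4X macro that performs the NTT butterflies above is a vectorized Montgomery multiplication by a zeta. The scalar C sketch below is illustrative and assumes reference-style names (fqmul, montgomery_reduce, MLKEM_Q, QINV); the constants match consts.c, where QINV = -3327 == q^-1 mod 2^16 and -q is the NQ_OFFSET entry.

  #define MLKEM_Q 3329
  #define QINV (-3327) /* q^-1 mod 2^16 */
  /* Illustrative scalar equivalent of one MREDUCE_4X lane:
   * montgomery_reduce(a) computes a * 2^-16 mod q, bounded by q in
   * absolute value, and fqmul multiplies by a (Montgomery-form) zeta. */
  static int16_t montgomery_reduce(int32_t a)
  {
    int16_t m = (int16_t)((int16_t)a * QINV);           /* low 16 bits of a*q^-1 */
    return (int16_t)((a - (int32_t)m * MLKEM_Q) >> 16); /* high 16 bits of a - m*q */
  }
  static int16_t fqmul(int16_t a, int16_t zeta)
  {
    return montgomery_reduce((int32_t)a * zeta);
  }

The vector code obtains the high half with vmhraddshs (a rounded >>15 of the product) and the low half with vmladduhm, multiplies the low half by V_QINV, folds in V_NMKQ with a second vmhraddshs, and finishes with vsrah by 1, which appears to account for the remaining factor of 2 so that each lane ends up equal to the scalar montgomery_reduce result; the surrounding Compute_4Coeffs then applies the usual butterfly r[j+len] = r[j] - t, r[j] = r[j] + t.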
*/ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 765ef91763..c0170f6015 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -38,141 +38,141 @@ # MREDUCE_4X(_v0, _v1, _v2, _v3) # .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 32+30, 9, 1 - mtlr 
0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 40c7a4cef5..c0cb022050 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -33,66 +33,66 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 
14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm # @@ -103,124 +103,124 @@ lxvd2x 32+13, 14, 3 lxvd2x 32+14, 15, 3 lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + 
Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ From a962197d281350348f86e20257ce4385aa244736 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sat, 20 Sep 2025 08:23:09 +0100 Subject: [PATCH 4/6] Various minor adjustments to PPC64 backend setup Signed-off-by: Hanno Becker --- dev/ppc64le/meta.h | 2 +- dev/ppc64le/src/consts.c | 202 +++++++++-------------------- dev/ppc64le/src/consts.h | 3 + dev/ppc64le/src/consts_intt.inc | 90 +++++++++++++ dev/ppc64le/src/consts_ntt.inc | 45 +++++++ dev/ppc64le/src/intt_ppc.S | 149 ++++++++++------------ dev/ppc64le/src/ntt_ppc.S | 216 ++++++++++++++++---------------- dev/ppc64le/src/poly_tomont.S | 64 ++++------ dev/ppc64le/src/reduce.S | 59 ++++----- 9 files changed, 426 insertions(+), 404 deletions(-) create mode 100644 dev/ppc64le/src/consts_intt.inc create mode 100644 dev/ppc64le/src/consts_ntt.inc diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 34f8cbec66..8fec0c2ad6 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -8,7 +8,7 @@ /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. 
*/ -#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#define MLK_ARITH_BACKEND_PPC64LE #define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 4c2fbdf61a..c9c869a607 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -5,151 +5,73 @@ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { /* -Q */ - -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, /* QINV */ - -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, /* Q */ - 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, - /* const 20159 for reduce.S and intt */ - 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, - /* const 1441 for intt */ - 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, - /* for poly_tomont.S */ - 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, - /* zetas */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 
126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, 
-147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, 
-758}; + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index d424601ac1..59de765cf0 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -7,6 +7,8 @@ #define MLK_DEV_PPC64LE_SRC_CONSTS_H #include "../../../common.h" +/* Offsets into the constant table */ +/* check-magic: off */ #define NQ_OFFSET 0 #define QINV_OFFSET 16 #define Q_OFFSET 32 @@ -17,6 +19,7 @@ #define ZETA_NTT_OFFSET64 1104 #define IZETA_NTT_OFFSET127 1616 #define IZETA_NTT_OFFSET63 2128 +/* check-magic: on */ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..7cd95fcd08 --- /dev/null +++ b/dev/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, 
-1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 
573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..bfb64e722c --- /dev/null +++ b/dev/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 
830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 51cae8e621..2d604a1fbb 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -3,30 +3,28 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ #include "consts.h" -.machine "any" -.text - -# Barrett reduce constatnts +// Barrett reduce constants #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -# Montgomery reduce constatnts +// Montgomery reduce constants #define V_QINV 2 #define V_NMKQ 5 #define V_Z0 7 @@ -123,18 +121,18 @@ .endm .macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 + mr 9, \start // j + add 10, 7, 9 // J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next addi 19, 17, \step addi 20, 18, \next addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] + lxvd2x 32+8, 3, 10 // r[j+len] + lxvd2x 32+12, 3, 17 // r[j+len] + lxvd2x 32+16, 3, 19 // r[j+len] + lxvd2x 32+20, 3, 21 // r[j+len] lxvd2x 32+21, 3, 9 lxvd2x 32+22, 3, 16 @@ -144,14 +142,14 @@ Compute_4Coeffs .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load Coeffients and setup vectors + * aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 + * aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 + * + * a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 + * a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 + */ .macro Load_L24Coeffs lxvd2x 32+25, 0, 5 # a[j], r[j+len] lxvd2x 32+26, 10, 5 # a[j], r[j+len] @@ -171,15 +169,15 @@ vmrgow 24, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Permute + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * to + * rjlen4 - rjlen7, rjlen12 - rjlen15 + * rj0 - rj4, rj8 - rj11 + */ .macro Load_L44Coeffs lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, # rjlen4, rjlen5, rjlen6, rjlen7 @@ -206,8 +204,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 
xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. + // Multiply Odd/Even signed halfword; + // Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -232,8 +230,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value + // Right shift and pack lower halfword, + // results bound by 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -250,25 +248,23 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. + // Modulo multify-Low unsigned halfword; + // results bound by 2^16 * q in abs value. vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 vmladduhm \_v3, 17, V_MKQ, 20 .endm -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) -# +/* MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value + // Modular multiplication bound by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -284,17 +280,17 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 30, 30, V_NMKQ, 29 - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + vsrah \_vo0, 15, 4 // >> 1 + vsrah \_vo1, 20, 4 // >> 1 + vsrah \_vo2, 25, 4 // >> 1 + vsrah \_vo3, 30, 4 // >> 1 .endm .macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 // V_NMKQ + xxlor 32+2, 2, 2 // V_QINV + xxlor 32+3, 3, 3 // 0 + xxlor 32+4, 4, 4 // 1 .endm .macro Load_next_4zetas @@ -435,23 +431,15 @@ Write_M4C 32+13, 32+18, 32+23, 32+28 .endm -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); - -# -# mlk_intt_ppc(r) -# +.text .global MLK_ASM_NAMESPACE(intt_ppc) -.align 4 +.balign 16 MLK_ASM_FN_SYMBOL(intt_ppc) SAVE_REGS - # init vectors and constants - # Setup for Montgomery reduce + /* init vectors and constants + * Setup for Montgomery reduce */ lxvx 0, 0, 4 li 10, QINV_OFFSET @@ -462,7 +450,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 3, 32+3, 32+3 # 0 xxlor 4, 32+4, 32+4 # 1 - # Setup for Barrett reduce + /* Setup for Barrett reduce */ li 10, Q_OFFSET li 11, C20159_OFFSET lxvx 6, 10, 4 # V_MKQ @@ -505,9 +493,10 @@ intt_ppc__Loopf: addi 3, 3, -512 .align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + */ addi 14, 4, ZETA_INTT_OFFSET li 7, 4 li 8, 4 @@ -519,8 +508,7 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + /* 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 */ mr 5, 3 li 7, 8 li 8, 4 # loops @@ -531,7 +519,7 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 - # 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + /* 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 */ li 7, 16 li 5, 0 li 15, 4 # loops @@ -543,8 +531,7 @@ intt_ppc__Loop8: bdnz intt_ppc__Loop8 .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + /* 4. len = 16, start = 0, 32, 64,,...160, 192, 224 */ li 5, 0 li 7, 32 @@ -562,8 +549,7 @@ intt_ppc__Loop8: INTT_REDUCE_4X 5, 64, 64 .align 4 - # - # 5. len = 32, start = 0, 64, 128, 192 + /* 5. len = 32, start = 0, 64, 128, 192 */ li 5, 0 li 7, 64 @@ -607,8 +593,7 @@ intt_ppc__Loop8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # 6. len = 64, start = 0, 128 + /* 6. len = 64, start = 0, 128 */ li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -649,8 +634,7 @@ intt_ppc__Loop8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # 7. len = 128, start = 0 - # + /* 7. len = 128, start = 0 */ li 5, 0 # start li 7, 256 # len * 2 @@ -709,5 +693,4 @@ intt_ppc__Loop8: #undef V1441 /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 0a7a3eed58..d48ba3aba9 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -3,15 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ @@ -25,8 +26,12 @@ #define V_Z3 10 #define V_ZETA 10 -.machine "any" -.text +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) .macro SAVE_REGS stdu 1, -352(1) @@ -105,62 +110,62 @@ .macro Load_4Coeffs start next step mr 9, \start - add 10, 7, 9 # J + len*2 + add 10, 7, 9 // J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next addi 19, 17, \step addi 20, 18, \next addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] + lxvd2x 32+13, 3, 10 // r[j+len] + lxvd2x 32+18, 3, 17 // r[j+len] + lxvd2x 32+23, 3, 19 // r[j+len] + lxvd2x 32+28, 3, 21 // r[j+len] .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load Coeffients and setup vectors + * aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 + * aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 + * + * a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 + * a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 + */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + lxvd2x 32+25, 0, 5 // a[j], r[j+len] + lxvd2x 32+26, 10, 5 // a[j], r[j+len] vmrgew 13, 25, 26 vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + 
lxvd2x 32+25, 11, 5 // a[j], r[j+len] + lxvd2x 32+26, 12, 5 // a[j], r[j+len] vmrgew 18, 25, 26 vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 32+25, 15, 5 // a[j], r[j+len] + lxvd2x 32+26, 16, 5 // a[j], r[j+len] vmrgew 23, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 32+25, 17, 5 // a[j], r[j+len] + lxvd2x 32+26, 18, 5 // a[j], r[j+len] vmrgew 28, 25, 26 vmrgow 27, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Permute + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * to + * rjlen4 - rjlen7, rjlen12 - rjlen15 + * rj0 - rj4, rj8 - rj11 + */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 1, 0, 5 // rj0, rj1, rj2, rj3, + // rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 2, 10, 5 // rj8, rj9, rj10, rj11 + // rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 3 // rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 0 // rj0 - rj4, rj8 - rj11 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 xxpermdi 32+18, 4, 3, 3 @@ -175,23 +180,23 @@ xxpermdi 32+27, 4, 3, 0 .endm -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) -# +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + *----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value + // fqmul = zeta * coefficient + // Modular multification bound by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value + // Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -207,32 +212,32 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 30, 30, V_NMKQ, 29 - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 + vsrah 13, 15, 4 // >> 1 + vsrah 18, 20, 4 // >> 1 + vsrah 23, 25, 4 // >> 1 + vsrah 28, 30, 4 // >> 1 .endm .macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] + lxvd2x 32+12, 3, 9 // r[j] + lxvd2x 32+17, 3, 16 // r[j] + lxvd2x 32+22, 3, 18 // r[j] + lxvd2x 32+27, 3, 20 // r[j] .endm .macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t + /* Since the result of the Montgomery multiplication is bounded + * by q in absolute value. 
+ * Finally to complete the final update of the results with add/sub */ + vsubuhm 16, 12, 13 // r - t + vadduhm 15, 13, 12 // r + t + vsubuhm 21, 17, 18 // r - t + vadduhm 20, 18, 17 // r + t + vsubuhm 26, 22, 23 // r - t + vadduhm 25, 23, 22 // r + t + vsubuhm 31, 27, 28 // r - t + vadduhm 30, 28, 27 // r + t .endm .macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 @@ -311,19 +316,17 @@ xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm -# -# mlk_ntt_ppc(int16_t *r) -# +.text .global MLK_ASM_NAMESPACE(ntt_ppc) -.align 4 +.balign 16 MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS - # get MLKEM_Q + // get MLKEM_Q lvx V_NMKQ,0,4 - # zetas array + // zetas array addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 @@ -333,12 +336,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lvx V_QINV, 10, 4 .align 4 - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ li 5, 0 # start li 7, 256 # len * 2 lvx V_ZETA, 0, 14 @@ -357,9 +360,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 + /* 2. len = 64, start = 0, 128 + * k += 2 */ li 5, 0 li 7, 128 lvx V_ZETA, 0, 14 @@ -380,9 +382,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 - # - # 3. len = 32, start = 0, 64, 128, 192 - # k += 4 + /* 3. len = 32, start = 0, 64, 128, 192 + * k += 4 */ li 5, 0 li 7, 64 lvx V_ZETA, 0, 14 @@ -409,9 +410,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 + /* 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 */ li 5, 0 li 7, 32 Load_next_4zetas @@ -430,9 +430,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 + /* 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 */ li 5, 0 li 7, 16 Load_next_4zetas @@ -454,9 +453,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 + // 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + // k += 32 li 15, 4 # loops mtctr 15 mr 5, 3 @@ -481,10 +479,9 @@ ntt_ppc__Len4: bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + // 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + // k += 64 + // Update zetas vectors, each vector has 2 zetas li 8, 4 mtctr 8 @@ -508,8 +505,11 @@ ntt_ppc__Len2: * Don't modify by hand -- this is auto-generated by scripts/autogen. */ #undef V_QINV #undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 #undef V_ZETA /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index b1f7f8c725..66117f3679 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -3,21 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. 
-# -#=================================================================================== -# Written by Danny Tsen -# - -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ @@ -27,17 +27,13 @@ #define V_QINV 2 #define V_NMKQ 5 -.machine "any" -.text +// montgomery_reduce +// t = a * QINV +// t = (a - (int32_t)t*_MLKEM_Q) >> 16 +// +//----------------------------------- +// MREDUCE_4X(_v0, _v1, _v2, _v3) -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# .macro MREDUCE_4X _v0 _v1 _v2 _v3 lxvd2x 32+13, 0, 3 addi 3, 3, 16 @@ -68,10 +64,10 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 9, 9, V_NMKQ, 8 - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + vsrah \_v0, 15, 4 // >> 1 + vsrah \_v1, 20, 4 // >> 1 + vsrah \_v2, 25, 4 // >> 1 + vsrah \_v3, 9, 4 // >> 1 .endm .macro Write_8X @@ -85,8 +81,9 @@ stxvd2x 32+7, 11, 3 .endm -.align 4 -.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +.text +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stdu 1, -320(1) mflr 0 @@ -182,11 +179,4 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) #undef V_NMKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V1353 -#undef V_QINV -#undef V_NMKQ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 45fef03f2a..bb82a158b1 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -3,36 +3,32 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. 
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) /* simpasm: header-end */ #include "consts.h" -# Barrett reduce constatnts +// poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +// for details of the Barrett reduction +// +// Arguments: *r: pointer to input/output polynomial + +// Barrett reduce constatnts #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -.machine "any" -.text - .macro BREDUCE_4X _v0 _v1 _v2 _v3 lxvd2x 32+8, 0, 3 lxvd2x 32+12, 14, 3 @@ -96,9 +92,9 @@ stxvd2x 32+17, 11, 3 .endm -# -# Conditional addition to get unsigned canonical representative -# +// +// Conditional addition to get unsigned canonical representative +// .macro To_unsigned_16 lxvd2x 32+12, 0, 3 lxvd2x 32+13, 14, 3 @@ -127,8 +123,9 @@ stxvd2x 32+0, 9, 3 .endm -.align 4 -.globl MLK_ASM_NAMESPACE(reduce_ppc) +.text +.global MLK_ASM_NAMESPACE(reduce_ppc) +.balign 16 MLK_ASM_FN_SYMBOL(reduce_ppc) stdu 1, -224(1) mflr 0 @@ -188,10 +185,10 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) BREDUCE_4X 4, 9, 13, 17 Write_8X - # - # To unsigned canonical - # .align 4 + // + // To unsigned canonical + // addi 3, 3, -512 vxor 9, 9, 9 vspltish 10, 15 @@ -231,12 +228,4 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) #undef V_MKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ From d3c1dcc7a4ce80a708d0e446480bbe9047d576ee Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Fri, 12 Sep 2025 21:06:40 +0100 Subject: [PATCH 5/6] Add CI test for PPC64LE backend Signed-off-by: Hanno Becker --- .github/actions/multi-functest/action.yml | 2 +- .github/workflows/ci.yml | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 252931918a..64ccfebd1c 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -119,7 +119,7 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE" + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mvsx" cross_prefix: powerpc64le-unknown-linux-gnu- exec_wrapper: qemu-ppc64le opt: ${{ inputs.opt }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f6f6fd395..b51ab25e21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,16 +134,15 @@ jobs: runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - name: build + test + - name: build + test (no-opt) uses: ./.github/actions/multi-functest with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} gh_token: ${{ secrets.GITHUB_TOKEN }} compile_mode: ${{ matrix.target.mode }} - # There is no native code yet on PPC64LE, R-V or AArch64_be, so no point running opt tests - opt: ${{ (matrix.target.arch != 'ppc64le' && 
matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') && 'all' || 'no_opt' }} - - name: build + test (+debug+memsan+ubsan) + opt: 'no_opt' + - name: build + test (+debug+memsan+ubsan, native) uses: ./.github/actions/multi-functest if: ${{ matrix.target.mode == 'native' }} with: @@ -151,6 +150,17 @@ jobs: compile_mode: native cflags: "-DMLKEM_DEBUG -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" check_namespace: 'false' + - name: build + test (+debug, cross, opt) + uses: ./.github/actions/multi-functest + # There is no native code yet on riscv64, riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv64' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + with: + nix-shell: ${{ matrix.target.nix_shell }} + nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} + gh_token: ${{ secrets.GITHUB_TOKEN }} + compile_mode: ${{ matrix.target.mode }} + cflags: "-DMLKEM_DEBUG" + opt: 'opt' backend_tests: name: AArch64 FIPS202 backends (${{ matrix.backend }}) strategy: From 07e9c81e96db8f28aaf85f3b89d48021814afc21 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Thu, 11 Sep 2025 09:50:51 +0100 Subject: [PATCH 6/6] Rerun autogen to update mlkem/* sources for PPC64Le Signed-off-by: Hanno Becker --- dev/ppc64le/src/arith_native_ppc64le.h | 4 +- dev/ppc64le/src/consts.h | 4 +- mlkem/mlkem_native.S | 12 +- mlkem/mlkem_native.c | 9 +- mlkem/src/native/ppc64le/meta.h | 2 +- .../native/ppc64le/src/arith_native_ppc64le.h | 4 +- mlkem/src/native/ppc64le/src/consts.c | 202 +- mlkem/src/native/ppc64le/src/consts.h | 7 +- mlkem/src/native/ppc64le/src/consts_intt.inc | 90 + mlkem/src/native/ppc64le/src/consts_ntt.inc | 45 + mlkem/src/native/ppc64le/src/intt_ppc.S | 2980 +++++++++++++---- mlkem/src/native/ppc64le/src/ntt_ppc.S | 1856 +++++++--- mlkem/src/native/ppc64le/src/poly_tomont.S | 512 ++- mlkem/src/native/ppc64le/src/reduce.S | 890 +++-- scripts/autogen | 12 + 15 files changed, 4936 insertions(+), 1693 deletions(-) create mode 100644 mlkem/src/native/ppc64le/src/consts_intt.inc create mode 100644 mlkem/src/native/ppc64le/src/consts_ntt.inc diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 1c75346689..282b3566cd 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index 59de765cf0..c861ddec6c 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -16,9 +16,7 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 /* check-magic: on */ #ifndef __ASSEMBLER__ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index 6f2a8b221f..f5ea7c8b40 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -85,6 +85,12 @@ #include "mlkem/src/native/x86_64/src/rej_uniform_asm.S" #include "mlkem/src/native/x86_64/src/tomont.S" #endif /* MLK_SYS_X86_64 */ +#if defined(MLK_SYS_PPC64LE) 
+#include "mlkem/src/native/ppc64le/src/intt_ppc.S" +#include "mlkem/src/native/ppc64le/src/ntt_ppc.S" +#include "mlkem/src/native/ppc64le/src/poly_tomont.S" +#include "mlkem/src/native/ppc64le/src/reduce.S" +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -460,7 +466,7 @@ #undef MLK_NATIVE_META_H /* mlkem/src/native/ppc64le/meta.h */ #undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_ARITH_BACKEND_PPC64LE #undef MLK_NATIVE_PPC64LE_META_H #undef MLK_USE_NATIVE_INTT #undef MLK_USE_NATIVE_NTT @@ -476,14 +482,12 @@ #undef C1353_OFFSET #undef C1441_OFFSET #undef C20159_OFFSET -#undef IZETA_NTT_OFFSET127 -#undef IZETA_NTT_OFFSET63 #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H #undef NQ_OFFSET #undef QINV_OFFSET #undef Q_OFFSET +#undef ZETA_INTT_OFFSET #undef ZETA_NTT_OFFSET -#undef ZETA_NTT_OFFSET64 #undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 74903ed1da..d1652161e3 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -84,6 +84,9 @@ #include "src/native/x86_64/src/consts.c" #include "src/native/x86_64/src/rej_uniform_table.c" #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/consts.c" +#endif #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -447,7 +450,7 @@ #undef MLK_NATIVE_META_H /* mlkem/src/native/ppc64le/meta.h */ #undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_ARITH_BACKEND_PPC64LE #undef MLK_NATIVE_PPC64LE_META_H #undef MLK_USE_NATIVE_INTT #undef MLK_USE_NATIVE_NTT @@ -463,14 +466,12 @@ #undef C1353_OFFSET #undef C1441_OFFSET #undef C20159_OFFSET -#undef IZETA_NTT_OFFSET127 -#undef IZETA_NTT_OFFSET63 #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H #undef NQ_OFFSET #undef QINV_OFFSET #undef Q_OFFSET +#undef ZETA_INTT_OFFSET #undef ZETA_NTT_OFFSET -#undef ZETA_NTT_OFFSET64 #undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 54b3ddd9c6..c5694f9c26 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -8,7 +8,7 @@ /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. 
*/ -#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#define MLK_ARITH_BACKEND_PPC64LE #define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index dbcee3e3ee..7ab3226c48 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 4c2fbdf61a..c9c869a607 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -5,151 +5,73 @@ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { /* -Q */ - -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, /* QINV */ - -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, /* Q */ - 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, - /* const 20159 for reduce.S and intt */ - 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, - /* const 1441 for intt */ - 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, - /* for poly_tomont.S */ - 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, - /* zetas */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 
652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, 
-1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, 
-1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 49f519d0c3..90ad7b51cf 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -7,6 +7,8 @@ #define MLK_NATIVE_PPC64LE_SRC_CONSTS_H #include "../../../common.h" +/* Offsets into the constant table */ +/* check-magic: off */ #define NQ_OFFSET 0 #define QINV_OFFSET 16 #define Q_OFFSET 32 @@ -14,9 +16,8 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 +/* check-magic: on */ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..7cd95fcd08 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -0,0 +1,90 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, 
-874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, 
-552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..bfb64e722c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -0,0 +1,45 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 
1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 5d7aa86e38..0b62d997d3 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -3,244 +3,231 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
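Both include files above store each twiddle factor pre-replicated so a single load fills a whole vector of 16-bit lanes: the inverse-NTT len = 2 blocks repeat each zeta four times, while the "Len=4 and others" block and the forward-NTT table repeat each zeta eight times. A short generation sketch follows; it is illustrative only, and the scalar table zetas and helper emit_splatted are assumptions of the sketch, not code from this patch.

/* Illustrative only: expand a scalar twiddle table into the lane-replicated
 * rows used by consts_ntt.inc / consts_intt.inc. rep is 8 when one zeta
 * fills a whole vector and 4 when two zetas share one 128-bit vector. */
#include <stdint.h>
#include <stdio.h>

static void emit_splatted(const int16_t *zetas, unsigned count, unsigned rep)
{
  unsigned i, j;
  for (i = 0; i < count; i++)
  {
    for (j = 0; j < rep; j++)
      printf("%d, ", (int)zetas[i]);
    printf("\n");
  }
}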
+ * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" - -.machine "any" -.text - -# Barrett reduce constatnts -#define V20159 0 -#define V_25 1 -#define V_26 2 -#define V_MKQ 3 - -# Montgomery reduce constatnts -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 -#define V1441 10 - -.macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 -.endm - -.macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 -.endm - -.macro Compute_4Coeffs - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t -.endm - -.macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. 
+ */ - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - Compute_4Coeffs -.endm +.text +.balign 16 +.global MLK_ASM_NAMESPACE(intt_ppc) +MLK_ASM_FN_SYMBOL(intt_ppc) -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# -.macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 52, 10, 1 + stxvx 53, 11, 1 + stxvx 54, 12, 1 + stxvx 55, 14, 1 + stxvx 56, 15, 1 + stxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 58, 10, 1 + stxvx 59, 11, 1 + stxvx 60, 12, 1 + stxvx 61, 14, 1 + stxvx 62, 15, 1 + stxvx 63, 16, 1 + lxvx 0, 0, 4 + li 10, 16 + lxvx 34, 10, 4 + xxlxor 35, 35, 35 + vspltish 4, 1 + xxlor 2, 34, 34 + xxlor 3, 35, 35 + xxlor 4, 36, 36 + li 10, 32 + li 11, 48 + lxvx 6, 10, 4 + lxvx 32, 11, 4 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 40, 40 + vspltisw 9, 1 + vsubuwm 10, 8, 9 + vslw 9, 9, 10 + xxlor 7, 41, 41 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + addi 14, 4, 64 + lvx 10, 0, 14 + li 8, 4 + mtctr 8 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + +Lintt_ppc__Loopf: + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + addi 3, 3, -128 + stxvd2x 38, 0, 3 + stxvd2x 39, 10, 3 + stxvd2x 40, 11, 3 + stxvd2x 41, 12, 3 + stxvd2x 45, 15, 3 + stxvd2x 50, 16, 3 + stxvd2x 55, 17, 3 + stxvd2x 60, 18, 3 + addi 3, 3, 128 + bdnz Lintt_ppc__Loopf + addi 3, 3, -512 + nop + ori 2, 2, 0 + addi 14, 4, 1104 + li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 + +Lintt_ppc__Loop2: + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 vmrgew 8, 25, 26 vmrgow 21, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 vmrgew 12, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 vmrgew 16, 25, 26 vmrgow 23, 25, 26 - 
lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 vmrgew 20, 25, 26 vmrgow 24, 25, 26 -.endm - -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# -.macro Load_L44Coeffs - lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 - lxvd2x 10, 11, 5 - lxvd2x 11, 12, 5 - xxpermdi 32+12, 11, 10, 3 - xxpermdi 32+22, 11, 10, 0 - lxvd2x 10, 15, 5 - lxvd2x 11, 16, 5 - xxpermdi 32+16, 11, 10, 3 - xxpermdi 32+23, 11, 10, 0 - lxvd2x 10, 17, 5 - lxvd2x 11, 18, 5 - xxpermdi 32+20, 11, 10, 3 - xxpermdi 32+24, 11, 10, 0 -.endm - -.macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 vpkuwum 4, 5, 4 vsubuhm 4, 7, 4 vpkuwum 9, 10, 9 @@ -249,134 +236,50 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. 
- vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 -.endm - -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) -# -.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 -.endm - -.macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 -.endm - -.macro Load_next_4zetas - li 8, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 8, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 -.endm - -.macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 -.endm - -.macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 -.endm - -.macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 -.endm - -.macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 -.endm - -.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 -.endm - -.macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 -.endm - -.macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 
2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 vmrgew 10, 13, 14 vmrgow 11, 13, 14 vmrgew 12, 18, 19 @@ -385,327 +288,2058 @@ vmrgow 15, 23, 24 vmrgew 16, 28, 29 vmrgow 17, 28, 29 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 -.endm - -.macro INTT_REDUCE_L24 - Load_L24Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL24 -.endm - -.macro INTT_REDUCE_L44 - Load_L44Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL44 -.endm - -.macro INTT_REDUCE_4X start next step - Load_4Coeffs \start, \next, \step - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 -.endm - -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); - -# -# mlk_intt_ppc(r) -# -.global MLK_ASM_NAMESPACE(intt_ppc) -.align 4 -MLK_ASM_FN_SYMBOL(intt_ppc) - - SAVE_REGS - - # init vectors and constants - # Setup for Montgomery reduce - lxvx 0, 0, 4 - - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 - - # Setup for Barrett reduce - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 # V_MKQ - lxvx 32+V20159, 11, 4 # V20159 - - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 - - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 - - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - li 8, 4 # loops - mtctr 8 - - Set_mont_consts -intt_ppc__Loopf: - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - bdnz intt_ppc__Loopf - - addi 3, 3, -512 - -.align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 - li 8, 4 - mtctr 8 - mr 5, 3 -intt_ppc__Loop2: - INTT_REDUCE_L24 - addi 5, 5, 128 - bdnz intt_ppc__Loop2 - -.align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - mr 5, 3 - li 7, 8 - li 8, 4 # loops - mtctr 8 -intt_ppc__Loop4: - INTT_REDUCE_L44 - addi 5, 5, 128 - bdnz intt_ppc__Loop4 - -.align 4 - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - li 7, 16 - li 5, 0 - li 15, 4 # loops - mtctr 15 - -intt_ppc__Loop8: - INTT_REDUCE_4X 5, 32, 32 - addi 5, 5, 128 - bdnz intt_ppc__Loop8 - -.align 4 - # - # 4. 
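The deleted comment block above ("t = r[j]; r[j] = barrett_reduce(t + r[j+len]); r[j+len] = r[j+len] - t; r[j+len] = fqmul(zeta, r[j+len])") is the scalar butterfly that both the old macro code and the new flat listing vectorize four vectors at a time, with the 1441 = mont^2/128 scaling applied up front in the Loopf pass. For orientation only, here is a plain C model of that structure; the Barrett/Montgomery helpers follow the usual reference-style formulas and, like the flattened zetas view, are assumptions of this sketch rather than code from the backend.

/* Illustrative scalar model (not part of the patch) of the inverse-NTT
 * pass structure implemented above. */
#include <stdint.h>

#define MLKEM_N 256
#define MLKEM_Q 3329
#define QINV -3327 /* q^-1 mod 2^16, signed */

static int16_t montgomery_reduce(int32_t a)
{
  int16_t t = (int16_t)((int16_t)a * QINV);
  return (int16_t)((a - (int32_t)t * MLKEM_Q) >> 16);
}

static int16_t fqmul(int16_t a, int16_t b)
{
  return montgomery_reduce((int32_t)a * b);
}

static int16_t barrett_reduce(int16_t a)
{
  const int32_t v = 20159; /* round(2^26 / MLKEM_Q), see consts.c */
  int16_t t = (int16_t)(((int32_t)v * a + (1 << 25)) >> 26);
  return (int16_t)(a - t * MLKEM_Q);
}

/* zetas: one scalar inverse-NTT twiddle per butterfly group, in the order
 * the backend walks its table (a flattened view of consts_intt.inc). */
static void invntt_model(int16_t r[MLKEM_N], const int16_t *zetas)
{
  unsigned len, start, j, k = 0;
  const int16_t f = 1441; /* 2^(32-7) mod q, multiplied in up front */

  for (j = 0; j < MLKEM_N; j++)
    r[j] = fqmul(r[j], f);

  for (len = 2; len <= MLKEM_N / 2; len <<= 1)
  {
    for (start = 0; start < MLKEM_N; start = j + len)
    {
      int16_t zeta = zetas[k++];
      for (j = start; j < start + len; j++)
      {
        int16_t t = r[j];
        r[j] = barrett_reduce((int16_t)(t + r[j + len]));
        r[j + len] = (int16_t)(r[j + len] - t);
        r[j + len] = fqmul(zeta, r[j + len]);
      }
    }
  }
}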
len = 16, start = 0, 32, 64,,...160, 192, 224 - li 5, 0 - li 7, 32 - - INTT_REDUCE_4X 5, 64, 64 - - li 5, 16 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 - - li 5, 256 - INTT_REDUCE_4X 5, 64, 64 - - li 5, 272 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 - -.align 4 - # - # 5. len = 32, start = 0, 64, 128, 192 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 - # - # 6. len = 64, start = 0, 128 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - -.align 4 - # 7. 
len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - - RESTORE_REGS + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + bdnz Lintt_ppc__Loop2 + ori 2, 2, 0 + mr 5, 3 + li 7, 8 + li 8, 4 + mtctr 8 + +Lintt_ppc__Loop4: + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + xxswapd 39, 39 + xxswapd 40, 40 + xxswapd 41, 41 + xxswapd 42, 42 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 
+ vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + bdnz Lintt_ppc__Loop4 + nop + ori 2, 2, 0 + li 7, 16 + li 5, 0 + li 15, 4 + mtctr 15 + +Lintt_ppc__Loop8: + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 5, 5, 128 + bdnz Lintt_ppc__Loop8 + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + 
xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 16 + addi 14, 14, -64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 
5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 272 + addi 14, 14, -64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + 
vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 
3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 
3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 384 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 
50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 
19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 
3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + addi 14, 14, 16 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + xxlor 9, 42, 42 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 
42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 
16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + xxlor 42, 9, 9 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 52, 10, 1 + lxvx 53, 11, 1 + lxvx 54, 12, 1 + lxvx 55, 14, 1 + lxvx 56, 15, 1 + lxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 58, 10, 1 + lxvx 59, 11, 1 + lxvx 60, 12, 1 + lxvx 61, 14, 1 + lxvx 62, 15, 1 + lxvx 63, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ -#undef V_QINV -#undef V_NMKQ -#undef V_Z0 -#undef V_Z1 -#undef V_Z2 -#undef V_Z3 -#undef V_ZETA -#undef V1441 - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 3d65856ad9..d5e9a9fed5 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -3,265 +3,1284 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. 
-# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. + */ -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 -.machine "any" .text +.balign 16 +.global MLK_ASM_NAMESPACE(ntt_ppc) +MLK_ASM_FN_SYMBOL(ntt_ppc) -.macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 -.endm - -.macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 -.endm - -.macro Load_4Coeffs start next step - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] -.endm - -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# -.macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] - vmrgew 13, 25, 26 - vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] - vmrgew 18, 25, 26 - vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] - vmrgew 23, 25, 26 - vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] - vmrgew 28, 25, 26 - vmrgow 27, 25, 26 -.endm - -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# -.macro Load_L44Coeffs - lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 
1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 - lxvd2x 3, 11, 5 - lxvd2x 4, 12, 5 - xxpermdi 32+18, 4, 3, 3 - xxpermdi 32+17, 4, 3, 0 - lxvd2x 1, 15, 5 - lxvd2x 2, 16, 5 - xxpermdi 32+23, 2, 1, 3 - xxpermdi 32+22, 2, 1, 0 - lxvd2x 3, 17, 5 - lxvd2x 4, 18, 5 - xxpermdi 32+28, 4, 3, 3 - xxpermdi 32+27, 4, 3, 0 -.endm - -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - -.endm - -.macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] -.endm - -.macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t -.endm - -.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - Load_4Coeffs \start, \next, \step - MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 - Load_4Aj - Compute_4Coeffs -.endm - -.macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 -.endm + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 52, 10, 1 + stxvx 53, 11, 1 + stxvx 54, 12, 1 + stxvx 55, 14, 1 + stxvx 56, 15, 1 + stxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 58, 10, 1 + stxvx 59, 11, 1 + stxvx 60, 12, 1 + stxvx 61, 14, 1 + stxvx 62, 15, 1 + stxvx 63, 16, 1 + lvx 5, 0, 4 + addi 14, 4, 96 + vxor 3, 3, 3 + vspltish 4, 1 + li 10, 16 + lvx 2, 10, 4 + li 5, 0 + li 7, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + 
vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 128 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 192 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 
12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + ori 2, 2, 0 + li 5, 0 + li 7, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 
31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 320 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 64 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 
17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 384 + lvx 10, 0, 14 + addi 14, 14, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 32 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 
+ stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 16 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 5, 256 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 272 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 
21 + nop + nop + ori 2, 2, 0 + li 5, 0 + li 7, 16 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 256 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 
23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 5, 384 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + mr 9, 5 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 15, 4 + mtctr 15 + mr 5, 3 + li 7, 8 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 -.macro PermWriteL44 - Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 +Lntt_ppc__Len4: + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + xxswapd 39, 39 + xxswapd 40, 40 + xxswapd 41, 41 + xxswapd 42, 42 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 stxvd2x 0, 0, 5 stxvd2x 1, 10, 5 stxvd2x 2, 11, 5 @@ -270,10 +1289,68 @@ stxvd2x 5, 16, 5 stxvd2x 6, 17, 5 stxvd2x 7, 18, 5 -.endm + addi 5, 5, 128 + bdnz Lntt_ppc__Len4 + li 8, 4 + mtctr 8 + mr 5, 3 + li 7, 4 + nop + ori 2, 2, 0 -.macro PermWriteL24 - Compute_4Coeffs +Lntt_ppc__Len2: + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 
41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 vmrgew 10, 16, 15 vmrgow 11, 16, 15 vmrgew 12, 21, 20 @@ -282,232 +1359,51 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 -.endm - -.macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 -.endm - -.macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 -.endm - -# -# mlk_ntt_ppc(int16_t *r) -# -.global MLK_ASM_NAMESPACE(ntt_ppc) -.align 4 -MLK_ASM_FN_SYMBOL(ntt_ppc) - - SAVE_REGS - - # get MLKEM_Q - lvx V_NMKQ,0,4 - - # zetas array - addi 14, 4, ZETA_NTT_OFFSET - - vxor 3, 3, 3 - vspltish 4, 1 - - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 - -.align 4 - # - # Compute coefficients of the NTT based on the following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - -.align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - -.align 4 - # - # 3. 
len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - -.align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - -.align 4 - # - # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - mr 5, 3 - li 7, 8 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 - -.align 4 -ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas - Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 - - bdnz ntt_ppc__Len4 - - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas - - li 8, 4 - mtctr 8 - mr 5, 3 - li 7, 4 - -.align 4 -ntt_ppc__Len2: - Load_next_4zetas - Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 - - bdnz ntt_ppc__Len2 - - RESTORE_REGS + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + bdnz Lntt_ppc__Len2 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 52, 10, 1 + lxvx 53, 11, 1 + lxvx 54, 12, 1 + lxvx 55, 14, 1 + lxvx 56, 15, 1 + lxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 58, 10, 1 + lxvx 59, 11, 1 + lxvx 60, 12, 1 + lxvx 61, 14, 1 + lxvx 62, 15, 1 + lxvx 63, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ -#undef V_QINV -#undef V_NMKQ -#undef V_ZETA - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index c0170f6015..080b152e5b 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -3,188 +3,356 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +// Poly_tomont: Inplace conversion of all coefficients of a polynomial +// from normal domain to Montgomery domain +// +// Arguments:*r: pointer to input/output polynomial #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. + */ -#define V1353 0 -#define V_QINV 2 -#define V_NMKQ 5 -.machine "any" .text - -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# -.macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 -.endm - -.macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 -.endm - -.align 4 -.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - vxor 3, 3, 3 + .cfi_startproc + stdu 1, -320(1) + mflr 0 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 
12, 224 + stxvx 52, 6, 1 + stxvx 53, 7, 1 + stxvx 54, 8, 1 + stxvx 55, 9, 1 + stxvx 56, 10, 1 + stxvx 57, 11, 1 + stxvx 58, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 59, 6, 1 + stxvx 60, 7, 1 + stxvx 61, 8, 1 + stxvx 62, 9, 1 + li 6, 0 + li 7, 16 + li 8, 80 + lxvx 37, 6, 4 + lxvx 34, 7, 4 + lxvx 32, 8, 4 + vxor 3, 3, 3 vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + 
vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 52, 6, 1 + lxvx 53, 7, 1 + lxvx 54, 8, 1 + lxvx 55, 9, 1 + lxvx 56, 10, 1 + lxvx 57, 11, 1 + lxvx 58, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 59, 6, 1 + lxvx 60, 7, 1 + lxvx 61, 8, 1 + lxvx 62, 9, 1 + mtlr 0 + addi 1, 1, 320 blr + .cfi_endproc -/* To facilitate single-compilation-unit (SCU) 
builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V1353 -#undef V_QINV -#undef V_NMKQ - -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V1353 -#undef V_QINV -#undef V_NMKQ +#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index c0cb022050..8f5a5308cb 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -3,73 +3,103 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + *=================================================================================== + * Written by Danny Tsen + * + */ #include "../../../common.h" -#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ +#if defined(MLK_ARITH_BACKEND_PPC64LE) && \ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#include "consts.h" +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. + */ -# Barrett reduce constatnts -#define V20159 0 -#define V_25 1 -#define V_26 2 -#define V_MKQ 3 -.machine "any" .text +.balign 16 +.global MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) -.macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 + .cfi_startproc + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 52, 6, 1 + stxvx 53, 7, 1 + stxvx 54, 8, 1 + stxvx 55, 9, 1 + stxvx 56, 10, 1 + vxor 7, 7, 7 + li 6, 32 + li 7, 48 + lxvx 35, 6, 4 + lxvx 32, 7, 4 + vspltisw 2, 13 + vadduwm 2, 2, 2 + vspltisw 4, 1 + vsubuwm 5, 2, 4 + vslw 1, 4, 5 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + li 14, 16 + li 15, 32 + li 16, 48 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 
37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 vpkuwum 4, 5, 4 vsubuhm 4, 7, 4 vpkuwum 9, 10, 9 @@ -78,36 +108,398 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 -.endm - -.macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 -.endm - -# -# Conditional addition to get unsigned canonical representative -# -.macro To_unsigned_16 - lxvd2x 32+12, 0, 3 - lxvd2x 32+13, 14, 3 - lxvd2x 32+14, 15, 3 - lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 
19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 
4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 vadduhm 7, 12, 11 vadduhm 8, 13, 11 vadduhm 5, 14, 11 @@ -116,125 +508,205 @@ vcmpequh 0, 0, 9 vcmpequh 3, 3, 9 vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 -.endm - -.align 4 -.globl MLK_ASM_NAMESPACE(reduce_ppc) -MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # -.align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 
35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 
128
+        li      7, 144
+        li      8, 160
+        li      9, 176
+        li      10, 192
+        lxvx    52, 6, 1
+        lxvx    53, 7, 1
+        lxvx    54, 8, 1
+        lxvx    55, 9, 1
+        lxvx    56, 10, 1
+        mtlr    0
+        addi    1, 1, 224
         blr
+        .cfi_endproc
 
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef V20159
-#undef V_25
-#undef V_26
-#undef V_MKQ
-
-#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \
-          !MLK_CONFIG_MULTILEVEL_NO_SHARED */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef V20159
-#undef V_25
-#undef V_26
-#undef V_MKQ
+#endif /* MLK_ARITH_BACKEND_PPC64LE && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/scripts/autogen b/scripts/autogen
index 74c68b6507..d80d0b724f 100755
--- a/scripts/autogen
+++ b/scripts/autogen
@@ -1220,6 +1220,10 @@ def x86_64(c):
     return "/x86_64/" in c
 
 
+def ppc64le(c):
+    return "/ppc64le/" in c
+
+
 def native_fips202(c):
     return native(c) and fips202(c)
 
@@ -1252,6 +1256,10 @@ def native_arith_x86_64(c):
     return native_arith(c) and x86_64(c)
 
 
+def native_arith_ppc64le(c):
+    return native_arith(c) and ppc64le(c)
+
+
 def native_arith_core(c):
     return (
         native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c)
@@ -1519,6 +1527,10 @@ def gen_monolithic_asm_file(dry_run=False):
     for c in filter(native_arith_x86_64, asm_sources):
         yield f'#include "{c}"'
     yield "#endif"
+    yield "#if defined(MLK_SYS_PPC64LE)"
+    for c in filter(native_arith_ppc64le, asm_sources):
+        yield f'#include "{c}"'
+    yield "#endif"
     yield "#endif"
     yield ""
     yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)"