Skip to content

Commit 1915e47

Browse files
committed
SLOTHY: Superoptimize AArch64 INTT
Resolves #206

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent c707054 commit 1915e47

File tree

5 files changed

+1837
-494
lines changed

5 files changed

+1837
-494
lines changed

dev/aarch64_clean/src/intt.S

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -230,17 +230,6 @@ MLD_ASM_FN_SYMBOL(intt_asm)
230230
movk wtmp, #0x7f, lsl #16
231231
dup consts.4s, wtmp
232232

233-
/* check-magic: 16382 == pow(2, 32-8, MLDSA_Q) */
234-
// Load scaling factor 1/256*R
235-
mov wtmp, #16382 // 2^(32 - 8) mod Q
236-
dup ninv.4s, wtmp
237-
238-
/* check-magic: 4197891 == round(16382 * 2**31 / MLDSA_Q) */
239-
// Load Barrett constant for 1/256*R
240-
movz wtmp, #0x0e03
241-
movk wtmp, #0x40, lsl #16
242-
dup ninv_tw.4s, wtmp
243-
244233
mov inp, in
245234
mov count, #16
246235

@@ -308,6 +297,17 @@ intt_layer5678_start:
308297

309298
// ---------------------------------------------------------------------
310299

300+
/* check-magic: 16382 == pow(2, 32-8, MLDSA_Q) */
301+
// Load scaling factor 1/256*R
302+
mov wtmp, #16382 // 2^(32 - 8) mod Q
303+
dup ninv.4s, wtmp
304+
305+
/* check-magic: 4197891 == round(16382 * 2**31 / MLDSA_Q) */
306+
// Load Barrett constant for 1/256*R
307+
movz wtmp, #0x0e03
308+
movk wtmp, #0x40, lsl #16
309+
dup ninv_tw.4s, wtmp
310+
311311
mov count, #4
312312
load_roots_1234
313313

dev/aarch64_opt/src/Makefile

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@ SLOTHY_FLAGS_SPLIT= -c inputs_are_outputs \
2828
-c variable_size \
2929
-c constraints.stalls_first_attempt=64 \
3030
-c split_heuristic=true \
31-
-c split_heuristic_factor=1.5 \
3231
-c split_heuristic_repeat=2 \
3332
-c sw_pipelining.enabled=true \
3433
-c sw_pipelining.halving_heuristic=True \
@@ -67,11 +66,15 @@ ntt.S: ../../aarch64_clean/src/ntt.S
6766
$(eval TMPFILE := $(shell mktemp))
6867
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMPFILE) -l ntt_layer123_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
6968
# optimize second loop using split heuristic
70-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMPFILE) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT) $(RESERVE_X_ONLY_FLAG)
69+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMPFILE) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT) -c split_heuristic_factor=1.5 $(RESERVE_X_ONLY_FLAG)
7170

7271
# Copy remaining files without optimization for now
7372
intt.S: ../../aarch64_clean/src/intt.S
74-
cp $< $@
73+
# optimize first loop in one go and write to temp file
74+
$(eval TMPFILE := $(shell mktemp))
75+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMPFILE) -l intt_layer5678_start $(SLOTHY_FLAGS) -c reserved_regs="[x0,x18--x30,sp]"
76+
# optimize second loop using split heuristic
77+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMPFILE) -o $@ -l intt_layer1234_start $(SLOTHY_FLAGS_SPLIT) -c split_heuristic_factor=2.5 $(RESERVE_X_ONLY_FLAG)
7578

7679
mld_polyvecl_pointwise_acc_montgomery_l4.S: ../../aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
7780
cp $< $@

0 commit comments

Comments
 (0)