@@ -28,7 +28,6 @@ SLOTHY_FLAGS_SPLIT= -c inputs_are_outputs \
2828 -c variable_size \
2929 -c constraints.stalls_first_attempt=64 \
3030 -c split_heuristic=true \
31- -c split_heuristic_factor=1.5 \
3231 -c split_heuristic_repeat=2 \
3332 -c sw_pipelining.enabled=true \
3433 -c sw_pipelining.halving_heuristic=True \
@@ -67,11 +66,15 @@ ntt.S: ../../aarch64_clean/src/ntt.S
6766 $(eval TMPFILE := $(shell mktemp) )
6867 slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $(TMPFILE ) -l ntt_layer123_start $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
6968 # optimize second loop using split heuristic
70- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $(TMPFILE ) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT ) $(RESERVE_X_ONLY_FLAG )
69+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $(TMPFILE ) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT ) -c split_heuristic_factor=1.5 $(RESERVE_X_ONLY_FLAG )
7170
7271# Copy remaining files without optimization for now (intt.S directly below is optimized with slothy-cli; the copy-only rules follow it)
7372intt.S : ../../aarch64_clean/src/intt.S
74- cp $< $@
73+ # Pass 1: optimize first loop (intt_layer5678_start) in one go with SLOTHY_FLAGS and write the result to the mktemp file held in TMPFILE; x0, x18--x30 and sp are reserved explicitly for this pass
74+ $(eval TMPFILE := $(shell mktemp) )
75+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $(TMPFILE ) -l intt_layer5678_start $(SLOTHY_FLAGS ) -c reserved_regs=" [x0,x18--x30,sp]"
76+ # Pass 2: optimize second loop (intt_layer1234_start) using the split heuristic (SLOTHY_FLAGS_SPLIT, split_heuristic_factor=2.5), reading TMPFILE and writing the final target. NOTE(review): nothing in this recipe removes the temp file afterwards
77+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $(TMPFILE ) -o $@ -l intt_layer1234_start $(SLOTHY_FLAGS_SPLIT ) -c split_heuristic_factor=2.5 $(RESERVE_X_ONLY_FLAG )
7578
7679mld_polyvecl_pointwise_acc_montgomery_l4.S : ../../aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
7780 cp $< $@
0 commit comments