1010TARGET_ISA =Arm_AArch64
1111
1212# MicroArch target to optimize for
13- TARGET_MICROARCH =Arm_Cortex_A55
13+ TARGET_MICROARCH =Arm_Neoverse_N1_experimental
1414
1515SLOTHY_EXTRA_FLAGS ?=
1616
@@ -51,14 +51,15 @@ ntt.S: ../../aarch64_clean/src/ntt.S $(COMMON_H)
5151
5252intt.S : ../../aarch64_clean/src/intt.S $(COMMON_H )
5353 slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l intt_layer123_start -l intt_layer4567_start $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
54+
5455polyvec_basemul_acc_montgomery_cached_asm_k2.S : ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S $(COMMON_H )
55- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k2_loop_start $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
56+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k2_loop_start -ctimeout=60 $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
5657
5758polyvec_basemul_acc_montgomery_cached_asm_k3.S : ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S $(COMMON_H )
58- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k3_loop_start $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
59+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k3_loop_start -ctimeout=60 $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
5960
6061polyvec_basemul_acc_montgomery_cached_asm_k4.S : ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S $(COMMON_H )
61- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k4_loop_start $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
62+ slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k4_loop_start -ctimeout=60 $(SLOTHY_FLAGS ) $(RESERVE_X_ONLY_FLAG )
6263
6364
6465
@@ -71,10 +72,15 @@ poly_reduce_asm.S: ../../aarch64_clean/src/poly_reduce_asm.S $(COMMON_H)
7172 slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l poly_reduce_loop_start $(SLOTHY_FLAGS ) $(RESERVE_ALL_FLAG )
7273
7374poly_mulcache_compute_asm.S : ../../aarch64_clean/src/poly_mulcache_compute_asm.S $(COMMON_H )
74- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l poly_mulcache_compute_loop_start $(SLOTHY_FLAGS ) $(RESERVE_ALL_FLAG )
75+ # Unroll by a factor of 2 since the kernel is small
76+ slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_mulcache_compute_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) -c sw_pipelining.unroll=2
7577
7678poly_tobytes_asm.S : ../../aarch64_clean/src/poly_tobytes_asm.S $(COMMON_H )
77- slothy-cli $(TARGET_ISA ) $(TARGET_MICROARCH ) $< -o $@ -l poly_tobytes_loop_start $(SLOTHY_FLAGS ) $(RESERVE_ALL_FLAG ) -c sw_pipelining.unroll=4
79+ # SLOTHY tends to mark almost all instructions as 'early' here, which isn't
80+ # wrong but looks a bit odd in the optimized code. Set
81+ # sw_pipelining.minimize_overlapping to avoid this.
81+ #
82+ # Unroll by a factor of 2 since the kernel is small
83+ slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_tobytes_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) -c sw_pipelining.unroll=2 -c sw_pipelining.minimize_overlapping -ctimeout=60
7884
7985# At the moment, SLOTHY can't process rej_uniform_asm.S
8086rej_uniform_asm.S : ../../aarch64_clean/src/rej_uniform_asm.S $(COMMON_H )
0 commit comments