Skip to content

Commit 5aaca94

Browse files
authored
Merge pull request #1088 from pq-code-package/slothy_for_neoverse_n1
SLOTHY: Optimize AArch64 arithmetic backend for Neoverse-N1
2 parents 82d3215 + a16907a commit 5aaca94

39 files changed: +9481 additions, -8448 deletions (lines changed)

.github/workflows/slothy.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ jobs:
1717
uses: ./.github/workflows/ci_ec2_reusable.yml
1818
with:
1919
name: SLOTHY
20-
ec2_instance_type: c8g.4xlarge
20+
ec2_instance_type: c8g.8xlarge
2121
ec2_ami: ubuntu-latest (custom AMI)
2222
ec2_ami_id: ami-08ddb0acd99dc3d33 # aarch64, ubuntu-latest, 64g
2323
lint: false

dev/aarch64_clean/src/intt.S

Lines changed: 3 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -235,27 +235,6 @@ MLK_ASM_FN_SYMBOL(intt_asm)
235235
mov inp, in
236236
mov count, #8
237237

238-
intt_scale_start:
239-
240-
ldr q_data0, [inp, #(16*0)]
241-
ldr q_data1, [inp, #(16*1)]
242-
ldr q_data2, [inp, #(16*2)]
243-
ldr q_data3, [inp, #(16*3)]
244-
245-
mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
246-
// Bounds: Absolute value < q
247-
248-
str q_data0, [inp], #64
249-
str q_data1, [inp, #(-64 + 16*1)]
250-
str q_data2, [inp, #(-64 + 16*2)]
251-
str q_data3, [inp, #(-64 + 16*3)]
252-
253-
subs count, count, #1
254-
cbnz count, intt_scale_start
255-
256-
mov inp, in
257-
mov count, #8
258-
259238
.p2align 2
260239
intt_layer4567_start:
261240

@@ -266,6 +245,9 @@ intt_layer4567_start:
266245

267246
transpose4 data // manual ld4
268247

248+
mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
249+
// Bounds: Absolute value < q
250+
269251
load_next_roots_67
270252

271253
// Layer 7

dev/aarch64_opt/src/Makefile

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
TARGET_ISA=Arm_AArch64
1111

1212
# MicroArch target to optimize for
13-
TARGET_MICROARCH=Arm_Cortex_A55
13+
TARGET_MICROARCH=Arm_Neoverse_N1_experimental
1414

1515
SLOTHY_EXTRA_FLAGS ?=
1616

@@ -51,14 +51,15 @@ ntt.S: ../../aarch64_clean/src/ntt.S $(COMMON_H)
5151

5252
intt.S: ../../aarch64_clean/src/intt.S $(COMMON_H)
5353
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l intt_layer123_start -l intt_layer4567_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
54+
5455
polyvec_basemul_acc_montgomery_cached_asm_k2.S: ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S $(COMMON_H)
55-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k2_loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
56+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k2_loop_start -ctimeout=60 $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
5657

5758
polyvec_basemul_acc_montgomery_cached_asm_k3.S: ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S $(COMMON_H)
58-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k3_loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
59+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k3_loop_start -ctimeout=60 $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
5960

6061
polyvec_basemul_acc_montgomery_cached_asm_k4.S: ../../aarch64_clean/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S $(COMMON_H)
61-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k4_loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
62+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l polyvec_basemul_acc_montgomery_cached_k4_loop_start -ctimeout=60 $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
6263

6364

6465

@@ -71,10 +72,15 @@ poly_reduce_asm.S: ../../aarch64_clean/src/poly_reduce_asm.S $(COMMON_H)
7172
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_reduce_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)
7273

7374
poly_mulcache_compute_asm.S: ../../aarch64_clean/src/poly_mulcache_compute_asm.S $(COMMON_H)
74-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_mulcache_compute_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG)
75+
# Unroll twice since kernel is small
76+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_mulcache_compute_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) -c sw_pipelining.unroll=2
7577

7678
poly_tobytes_asm.S: ../../aarch64_clean/src/poly_tobytes_asm.S $(COMMON_H)
77-
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_tobytes_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) -c sw_pipelining.unroll=4
79+
# Somehow SLOTHY tends to mark almost all instructions as 'early', which isn't
80+
# wrong but looks a bit odd in the optimized code. Minimize overlapping to avoid this.
81+
#
82+
# Unroll twice since kernel is small
83+
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l poly_tobytes_loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) -c sw_pipelining.unroll=2 -c sw_pipelining.minimize_overlapping -ctimeout=60
7884

7985
# At the moment, SLOTHY can't process rej_uniform_asm.S
8086
rej_uniform_asm.S: ../../aarch64_clean/src/rej_uniform_asm.S $(COMMON_H)

dev/aarch64_opt/src/intt.S

Lines changed: 1087 additions & 1036 deletions
Large diffs are not rendered by default.

dev/aarch64_opt/src/ntt.S

Lines changed: 975 additions & 681 deletions
Large diffs are not rendered by default.

dev/aarch64_opt/src/poly_mulcache_compute_asm.S

Lines changed: 163 additions & 96 deletions
Large diffs are not rendered by default.

dev/aarch64_opt/src/poly_reduce_asm.S

Lines changed: 236 additions & 177 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)