// Register aliases used by the routine below.
// CONSTBASE: base register for the LXVW4X constant loads; advanced by 80 during setup.
// BLOCKS:    receives LEN shifted right by 6 (i.e. LEN / 64).
3333#define CONSTBASE R16
3434#define BLOCKS R17
3535
36+ // MASK: loaded with the address of consts<>+0xa0, the base from which the two VPERMXOR control vectors (V20, V21) are read
37+ #define MASK R18
38+
// Read-only constant table (only part of it is visible in this diff view).
// When stored little-endian (presumably ppc64le — TODO confirm target), the
// first 16 bytes spell the ASCII text "expand 32-byte k".
3639DATA consts<>+0x00 (SB)/8 , $0x3320646e61707865
3740DATA consts<>+0x08 (SB)/8 , $0x6b20657479622d32
3841DATA consts<>+0x10 (SB)/8 , $0x0000000000000001
@@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
5356DATA consts<>+0x88 (SB)/8 , $0x6b2065746b206574
5457DATA consts<>+0x90 (SB)/8 , $0x0000000100000000
5558DATA consts<>+0x98 (SB)/8 , $0x0000000300000002
56- GLOBL consts<>(SB), RODATA, $0xa0
// 0xa0-0xbf: two 16-byte VPERMXOR control vectors.
//   0xa0 is loaded into V20; it is used where the earlier code did VXOR
//        followed by VRLW by V29 (V29 is a splat of 8).
//   0xb0 is loaded into V21; it is used where the earlier code did VXOR
//        followed by VRLW by V27 (V27 is set outside this view — presumably
//        a splat of 16, TODO confirm).
59+ DATA consts<>+0xa0 (SB)/8 , $0x5566774411223300
60+ DATA consts<>+0xa8 (SB)/8 , $0xddeeffcc99aabb88
61+ DATA consts<>+0xb0 (SB)/8 , $0x6677445522330011
62+ DATA consts<>+0xb8 (SB)/8 , $0xeeffccddaabb8899
// Table size grows from 0xa0 to 0xc0 to cover the 0x20 bytes of masks above.
63+ GLOBL consts<>(SB), RODATA, $0xc0
5764
5865//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
5966TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64 -40
@@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
7077 MOVD $48 , R10
7178 MOVD $64 , R11
7279 SRD $6 , LEN, BLOCKS
80+ // for VPERMXOR
81+ MOVD $consts<>+0xa0 (SB), MASK
82+ MOVD $16 , R20
7383 // V16
7484 LXVW4X (CONSTBASE)(R0), VS48
7585 ADD $80 ,CONSTBASE
@@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
8797 // V28
8898 LXVW4X (CONSTBASE)(R11), VS60
8999
100+ // Load mask constants for VPERMXOR
101+ LXVW4X (MASK)(R0), V20
102+ LXVW4X (MASK)(R20), V21
103+
90104 // splat slot from V19 -> V26
91105 VSPLTW $0 , V19, V26
92106
@@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
97111
98112 MOVD $10 , R14
99113 MOVD R14, CTR
100-
114+ PCALIGN $16
101115loop_outer_vsx:
102116 // V0, V1, V2, V3
103117 LXVW4X (R0)(CONSTBASE), VS32
@@ -128,22 +142,17 @@ loop_outer_vsx:
128142 VSPLTISW $12 , V28
129143 VSPLTISW $8 , V29
130144 VSPLTISW $7 , V30
131-
145+ PCALIGN $16
132146loop_vsx:
133147 VADDUWM V0, V4, V0
134148 VADDUWM V1, V5, V1
135149 VADDUWM V2, V6, V2
136150 VADDUWM V3, V7, V3
137151
138- VXOR V12, V0, V12
139- VXOR V13, V1, V13
140- VXOR V14, V2, V14
141- VXOR V15, V3, V15
142-
143- VRLW V12, V27, V12
144- VRLW V13, V27, V13
145- VRLW V14, V27, V14
146- VRLW V15, V27, V15
152+ VPERMXOR V12, V0, V21, V12
153+ VPERMXOR V13, V1, V21, V13
154+ VPERMXOR V14, V2, V21, V14
155+ VPERMXOR V15, V3, V21, V15
147156
148157 VADDUWM V8, V12, V8
149158 VADDUWM V9, V13, V9
@@ -165,15 +174,10 @@ loop_vsx:
165174 VADDUWM V2, V6, V2
166175 VADDUWM V3, V7, V3
167176
168- VXOR V12, V0, V12
169- VXOR V13, V1, V13
170- VXOR V14, V2, V14
171- VXOR V15, V3, V15
172-
173- VRLW V12, V29, V12
174- VRLW V13, V29, V13
175- VRLW V14, V29, V14
176- VRLW V15, V29, V15
177+ VPERMXOR V12, V0, V20, V12
178+ VPERMXOR V13, V1, V20, V13
179+ VPERMXOR V14, V2, V20, V14
180+ VPERMXOR V15, V3, V20, V15
177181
178182 VADDUWM V8, V12, V8
179183 VADDUWM V9, V13, V9
@@ -195,15 +199,10 @@ loop_vsx:
195199 VADDUWM V2, V7, V2
196200 VADDUWM V3, V4, V3
197201
198- VXOR V15, V0, V15
199- VXOR V12, V1, V12
200- VXOR V13, V2, V13
201- VXOR V14, V3, V14
202-
203- VRLW V15, V27, V15
204- VRLW V12, V27, V12
205- VRLW V13, V27, V13
206- VRLW V14, V27, V14
202+ VPERMXOR V15, V0, V21, V15
203+ VPERMXOR V12, V1, V21, V12
204+ VPERMXOR V13, V2, V21, V13
205+ VPERMXOR V14, V3, V21, V14
207206
208207 VADDUWM V10, V15, V10
209208 VADDUWM V11, V12, V11
@@ -225,15 +224,10 @@ loop_vsx:
225224 VADDUWM V2, V7, V2
226225 VADDUWM V3, V4, V3
227226
228- VXOR V15, V0, V15
229- VXOR V12, V1, V12
230- VXOR V13, V2, V13
231- VXOR V14, V3, V14
232-
233- VRLW V15, V29, V15
234- VRLW V12, V29, V12
235- VRLW V13, V29, V13
236- VRLW V14, V29, V14
227+ VPERMXOR V15, V0, V20, V15
228+ VPERMXOR V12, V1, V20, V12
229+ VPERMXOR V13, V2, V20, V13
230+ VPERMXOR V14, V3, V20, V14
237231
238232 VADDUWM V10, V15, V10
239233 VADDUWM V11, V12, V11
@@ -249,48 +243,48 @@ loop_vsx:
249243 VRLW V6, V30, V6
250244 VRLW V7, V30, V7
251245 VRLW V4, V30, V4
252- BC 16 , LT, loop_vsx
246+ BDNZ loop_vsx
253247
254248 VADDUWM V12, V26, V12
255249
256- WORD $0x13600F8C // VMRGEW V0, V1, V27
257- WORD $0x13821F8C // VMRGEW V2, V3, V28
250+ VMRGEW V0, V1, V27
251+ VMRGEW V2, V3, V28
258252
259- WORD $0x10000E8C // VMRGOW V0, V1, V0
260- WORD $0x10421E8C // VMRGOW V2, V3, V2
253+ VMRGOW V0, V1, V0
254+ VMRGOW V2, V3, V2
261255
262- WORD $0x13A42F8C // VMRGEW V4, V5, V29
263- WORD $0x13C63F8C // VMRGEW V6, V7, V30
256+ VMRGEW V4, V5, V29
257+ VMRGEW V6, V7, V30
264258
265259 XXPERMDI VS32, VS34, $0 , VS33
266260 XXPERMDI VS32, VS34, $3 , VS35
267261 XXPERMDI VS59, VS60, $0 , VS32
268262 XXPERMDI VS59, VS60, $3 , VS34
269263
270- WORD $0x10842E8C // VMRGOW V4, V5, V4
271- WORD $0x10C63E8C // VMRGOW V6, V7, V6
264+ VMRGOW V4, V5, V4
265+ VMRGOW V6, V7, V6
272266
273- WORD $0x13684F8C // VMRGEW V8, V9, V27
274- WORD $0x138A5F8C // VMRGEW V10, V11, V28
267+ VMRGEW V8, V9, V27
268+ VMRGEW V10, V11, V28
275269
276270 XXPERMDI VS36, VS38, $0 , VS37
277271 XXPERMDI VS36, VS38, $3 , VS39
278272 XXPERMDI VS61, VS62, $0 , VS36
279273 XXPERMDI VS61, VS62, $3 , VS38
280274
281- WORD $0x11084E8C // VMRGOW V8, V9, V8
282- WORD $0x114A5E8C // VMRGOW V10, V11, V10
275+ VMRGOW V8, V9, V8
276+ VMRGOW V10, V11, V10
283277
284- WORD $0x13AC6F8C // VMRGEW V12, V13, V29
285- WORD $0x13CE7F8C // VMRGEW V14, V15, V30
278+ VMRGEW V12, V13, V29
279+ VMRGEW V14, V15, V30
286280
287281 XXPERMDI VS40, VS42, $0 , VS41
288282 XXPERMDI VS40, VS42, $3 , VS43
289283 XXPERMDI VS59, VS60, $0 , VS40
290284 XXPERMDI VS59, VS60, $3 , VS42
291285
292- WORD $0x118C6E8C // VMRGOW V12, V13, V12
293- WORD $0x11CE7E8C // VMRGOW V14, V15, V14
286+ VMRGOW V12, V13, V12
287+ VMRGOW V14, V15, V14
294288
295289 VSPLTISW $4 , V27
296290 VADDUWM V26, V27, V26
@@ -431,15 +425,15 @@ tail_vsx:
431425 ADD $-1 , R11, R12
432426 ADD $-1 , INP
433427 ADD $-1 , OUT
434-
428+ PCALIGN $16
435429looptail_vsx:
436430 // Copying the result to OUT
437431 // in bytes.
438432 MOVBZU 1 (R12), KEY
439433 MOVBZU 1 (INP), TMP
440434 XOR KEY, TMP, KEY
441435 MOVBU KEY, 1 (OUT )
442- BC 16 , LT, looptail_vsx
436+ BDNZ looptail_vsx
443437
444438 // Clear the stack values
445439 STXVW4X VS48, (R11)(R0)
0 commit comments