Skip to content

Commit 2ebaafc

Browse files
sophie-zhao authored and gopherbot committed
blake2s: add loong64 SIMD implementation
The performance gains on Loongson 3A6000 and 3A5000 are as follows:

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/blake2s
cpu: Loongson-3A6000-HV @ 2500.00MHz
        |  bench.old  |              bench.new              |
        |   sec/op    |   sec/op     vs base                |
Write64   277.8n ± 0%   113.5n ± 0%  -59.14% (p=0.000 n=10)
Write1K   4.246µ ± 0%   1.736µ ± 0%  -59.11% (p=0.000 n=10)
Sum64     289.9n ± 0%   137.7n ± 0%  -52.51% (p=0.000 n=10)
Sum1K     4.265µ ± 0%   1.758µ ± 0%  -58.78% (p=0.000 n=10)
geomean   1.099µ        467.3n       -57.48%

        |  bench.old   |              bench.new               |
        |     B/s      |     B/s       vs base                |
Write64   219.7Mi ± 0%   537.9Mi ± 0%  +144.86% (p=0.000 n=10)
Write1K   230.0Mi ± 0%   562.6Mi ± 0%  +144.62% (p=0.000 n=10)
Sum64     210.5Mi ± 0%   443.3Mi ± 0%  +110.59% (p=0.000 n=10)
Sum1K     229.0Mi ± 0%   555.5Mi ± 0%  +142.64% (p=0.000 n=10)
geomean   222.1Mi        522.5Mi       +135.21%

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/blake2s
cpu: Loongson-3A5000 @ 2500.00MHz
        |  bench.old  |              bench.new              |
        |   sec/op    |   sec/op     vs base                |
Write64   373.8n ± 0%   175.0n ± 0%  -53.18% (p=0.000 n=10)
Write1K   5.763µ ± 0%   2.595µ ± 0%  -54.97% (p=0.000 n=10)
Sum64     397.8n ± 0%   205.7n ± 0%  -48.29% (p=0.000 n=10)
Sum1K     5.787µ ± 0%   2.627µ ± 0%  -54.61% (p=0.000 n=10)
geomean   1.492µ        703.8n       -52.83%

        |  bench.old   |              bench.new               |
        |     B/s      |     B/s       vs base                |
Write64   163.3Mi ± 0%   348.9Mi ± 0%  +113.62% (p=0.000 n=10)
Write1K   169.5Mi ± 0%   376.3Mi ± 0%  +122.09% (p=0.000 n=10)
Sum64     153.4Mi ± 0%   296.7Mi ± 0%  +93.37% (p=0.000 n=10)
Sum1K     168.7Mi ± 0%   371.8Mi ± 0%  +120.33% (p=0.000 n=10)
geomean   163.6Mi        346.9Mi       +112.03%

Change-Id: Id91ffbefc538bce294875d72e6cde72fea43afbf
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/661215
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Auto-Submit: Carlos Amedee <carlos@golang.org>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 4bc0711 commit 2ebaafc

File tree

4 files changed

+230
-7
lines changed

4 files changed

+230
-7
lines changed

blake2s/blake2s_loong64.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build loong64 && gc && !purego
6+
7+
package blake2s
8+
9+
import "golang.org/x/sys/cpu"
10+
11+
// hashBlocksVX is a body-less declaration; the implementation is the
// TEXT ·hashBlocksVX routine in blake2s_loong64.s. It takes the same
// arguments as hashBlocks, which calls it only when cpu.Loong64.HasLSX
// is set.
//
//go:noescape
12+
func hashBlocksVX(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
13+
14+
// hashBlocks is the loong64 entry point for block compression. It forwards
// its arguments unchanged to the assembly routine hashBlocksVX when
// cpu.Loong64.HasLSX reports the vector extension, and to hashBlocksGeneric
// otherwise.
func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
15+
if cpu.Loong64.HasLSX {
16+
hashBlocksVX(h, c, flag, blocks)
17+
} else {
18+
hashBlocksGeneric(h, c, flag, blocks)
19+
}
20+
}

blake2s/blake2s_loong64.s

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build loong64 && gc && !purego
6+
7+
#include "textflag.h"
8+
9+
// iv0 is a 16-byte read-only table of four 32-bit words. hashBlocksVX loads
// it as a whole into V2 at the start of every block. The values are
// IV[0..3] of BLAKE2s as listed in RFC 7693.
DATA ·iv0<>+0(SB)/4, $0x6a09e667
10+
DATA ·iv0<>+4(SB)/4, $0xbb67ae85
11+
DATA ·iv0<>+8(SB)/4, $0x3c6ef372
12+
DATA ·iv0<>+12(SB)/4, $0xa54ff53a
13+
GLOBL ·iv0<>(SB), RODATA|NOPTR, $16
14+
15+
// iv1 is a 16-byte read-only table of four 32-bit words. hashBlocksVX reads
// the words one at a time, XORs the first three with c0, c1 and flag
// respectively, leaves the fourth as is, and assembles them into V3.
// The values are IV[4..7] of BLAKE2s as listed in RFC 7693.
DATA ·iv1<>+0(SB)/4, $0x510e527f
16+
DATA ·iv1<>+4(SB)/4, $0x9b05688c
17+
DATA ·iv1<>+8(SB)/4, $0x1f83d9ab
18+
DATA ·iv1<>+12(SB)/4, $0x5be0cd19
19+
GLOBL ·iv1<>(SB), RODATA|NOPTR, $16
20+
21+
// SHUFFLE_1 permutes the 32-bit lanes of V1, V2 and V3 in place; V0 is not
// touched. hashBlocksVX runs it between the two ROUND steps of each round.
// Read as four 2-bit fields, low bits first, the immediates are:
//   $57  = 0b00_11_10_01 -> 1, 2, 3, 0
//   $78  = 0b01_00_11_10 -> 2, 3, 0, 1
//   $147 = 0b10_01_00_11 -> 3, 0, 1, 2
// NOTE(review): assuming each field selects the source lane for the matching
// destination lane, this rotates V1/V2/V3 by 1/2/3 lanes so that the next
// ROUND mixes the diagonals of the 4x4 state - confirm against the
// VSHUF4I.W definition.
#define SHUFFLE_1 \
22+
VSHUF4IW $57, V1, V1; \
23+
VSHUF4IW $78, V2, V2; \
24+
VSHUF4IW $147, V3, V3; \
25+
26+
// SHUFFLE_2 permutes the 32-bit lanes of V1, V2 and V3 in place; V0 is not
// touched. It uses the same immediates as SHUFFLE_1 with the ones for V1 and
// V3 swapped ($147, $78, $57), and hashBlocksVX runs it after the second
// ROUND step of each round.
// NOTE(review): presumably this is the inverse lane rotation that restores
// the column layout - confirm against the VSHUF4I.W definition.
#define SHUFFLE_2 \
27+
VSHUF4IW $147, V1, V1; \
28+
VSHUF4IW $78, V2, V2; \
29+
VSHUF4IW $57, V3, V3; \
30+
31+
// LOAD_M fills the two message vectors consumed by a ROUND step from eight
// general-purpose registers: a, b, c, d go to word lanes 0..3 of V8 and
// e, f, g, h go to word lanes 0..3 of V9. The ROUND macros add V8 in their
// first half and V9 in their second half. Callers pass the registers holding
// the message words in the order required by the current round's schedule.
#define LOAD_M(a, b, c, d, e, f, g, h) \
32+
VMOVQ a, V8.W[0]; \
33+
VMOVQ b, V8.W[1]; \
34+
VMOVQ c, V8.W[2]; \
35+
VMOVQ d, V8.W[3]; \
36+
VMOVQ e, V9.W[0]; \
37+
VMOVQ f, V9.W[1]; \
38+
VMOVQ g, V9.W[2]; \
39+
VMOVQ h, V9.W[3]; \
40+
41+
// ROUND_0 applies one mixing step to all four 32-bit lanes at once, using
// V0..V3 as the state rows and V8/V9 (set up by LOAD_M) as message words.
// In sequence, per lane:
//   V0 += V8; V0 += V1; V3 ^= V0; V3 = rotr(V3, 16)
//   V2 += V3;           V1 ^= V2; V1 = rotr(V1, 12)
//   V0 += V9; V0 += V1; V3 ^= V0; V3 = rotr(V3, 8)
//   V2 += V3;           V1 ^= V2; V1 = rotr(V1, 7)
// The rotation amounts 16/12/8/7 are those of the BLAKE2s G function
// (RFC 7693). The macro modifies V0..V3 only and reads V8 and V9.
#define ROUND_0 \
42+
VADDW V0, V8, V0; \
43+
VADDW V0, V1, V0; \
44+
VXORV V3, V0, V3; \
45+
VROTRW $16, V3, V3; \
46+
VADDW V2, V3, V2; \
47+
VXORV V1, V2, V1; \
48+
VROTRW $12, V1, V1; \
49+
VADDW V0, V9, V0; \
50+
VADDW V0, V1, V0; \
51+
VXORV V3, V0, V3; \
52+
VROTRW $8, V3, V3; \
53+
VADDW V2, V3, V2; \
54+
VXORV V1, V2, V1; \
55+
VROTRW $7, V1, V1; \
56+
57+
// ROUND_8 is a plain alias for ROUND_0: both expand to the same instruction
// sequence. hashBlocksVX uses ROUND_0 for the first half of each round
// (before SHUFFLE_2's counterpart SHUFFLE_1) and ROUND_8 for the second half.
#define ROUND_8 ROUND_0
58+
59+
// func hashBlocksVX(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//
// hashBlocksVX compresses blocks into the state h, 64 bytes per loop
// iteration, and writes the advanced counter back to c when it is done.
//
// Register use:
//   R4  = h (pointer to 8 state words)    R5  = c (pointer to 2 counter words)
//   R6  = flag                             R7  = current block pointer
//   R8  = remaining length in bytes        R9  = c0, R10 = c1
//   R11..R19, R24..R30 = the 16 message words of the current block
//   V0..V3 = working state rows, V8/V9 = message vectors / final scratch
//
// The frame is $0-48: no locals, 48 bytes of arguments (h at 0, c at 8,
// flag at 16, the blocks slice header at 24).
//
// NOTE(review): the loop body runs before the length in R8 is tested and the
// exit test is "R8 != 0", so this appears to require len(blocks) to be a
// non-zero multiple of 64 - confirm that every caller guarantees it.
60+
TEXT ·hashBlocksVX(SB), NOSPLIT, $0-48
61+
MOVV h+0(FP), R4
62+
MOVV c+8(FP), R5
63+
MOVWU flag+16(FP), R6
64+
MOVV blocks_base+24(FP), R7
65+
MOVV blocks_len+32(FP), R8
66+
MOVW (R5), R9 // c0: first counter word
67+
MOVW 4(R5), R10 // c1: second counter word
68+
69+
loop:
// Advance the counter by one block (0x40 = 64 bytes) and propagate the
// carry into c1.
// NOTE(review): relies on SGTU $0x40, R9, R11 setting R11 to 1 exactly when
// the updated c0 is below 0x40, i.e. when the add wrapped - confirm the
// operand order.
70+
ADD $0x40, R9
71+
SGTU $0x40, R9, R11
72+
ADD R10, R11, R10
73+
// Build the lower half of the working state: iv0 goes to V2 unchanged,
// the words of iv1 are combined with c0, c1 and flag and go to V3.
74+
MOVV $·iv0<>(SB), R11
75+
MOVV $·iv1<>(SB), R12
76+
MOVWU 0(R12), R13 // v12
77+
MOVWU 4(R12), R14 // v13
78+
MOVWU 8(R12), R15 // v14
79+
MOVWU 12(R12), R16 // v15
80+
XOR R13, R9, R13
81+
XOR R14, R10, R14
82+
XOR R15, R6, R15
83+
// V0 = h[0..3], V1 = h[4..7], V2 = iv0, V3 = {v12, v13, v14, v15}.
84+
VMOVQ (R4), V0
85+
VMOVQ 16(R4), V1
86+
VMOVQ (R11), V2
87+
VMOVQ R16, V3.W[3]
88+
VMOVQ R13, V3.W[0]
89+
VMOVQ R14, V3.W[1]
90+
VMOVQ R15, V3.W[2]
91+
// Load the 16 message words of this block: word i (byte offset 4*i) lands in
// R11..R19 for i = 0..8 and in R24..R30 for i = 9..15. R11..R16 are reused
// here; their earlier contents have already been moved into V2/V3.
92+
MOVWU (R7), R11
93+
MOVWU 4(R7), R12
94+
MOVWU 8(R7), R13
95+
MOVWU 12(R7), R14
96+
MOVWU 16(R7), R15
97+
MOVWU 20(R7), R16
98+
MOVWU 24(R7), R17
99+
MOVWU 28(R7), R18
100+
MOVWU 32(R7), R19
101+
MOVWU 36(R7), R24
102+
MOVWU 40(R7), R25
103+
MOVWU 44(R7), R26
104+
MOVWU 48(R7), R27
105+
MOVWU 52(R7), R28
106+
MOVWU 56(R7), R29
107+
MOVWU 60(R7), R30
108+
// Ten rounds follow. Each round is: LOAD_M + ROUND_0, SHUFFLE_1,
// LOAD_M + ROUND_8, SHUFFLE_2. Only the register order passed to LOAD_M
// (the message schedule) differs between rounds.
// Round 1.
109+
LOAD_M(R11, R13, R15, R17, R12, R14, R16, R18)
110+
ROUND_0
111+
SHUFFLE_1
112+
LOAD_M(R19, R25, R27, R29, R24, R26, R28, R30)
113+
ROUND_8
114+
SHUFFLE_2
115+
// Round 2.
116+
LOAD_M(R29, R15, R24, R28, R25, R19, R30, R17)
117+
ROUND_0
118+
SHUFFLE_1
119+
LOAD_M(R12, R11, R26, R16, R27, R13, R18, R14)
120+
ROUND_8
121+
SHUFFLE_2
122+
// Round 3.
123+
LOAD_M(R26, R27, R16, R30, R19, R11, R13, R28)
124+
ROUND_0
125+
SHUFFLE_1
126+
LOAD_M(R25, R14, R18, R24, R29, R17, R12, R15)
127+
ROUND_8
128+
SHUFFLE_2
129+
// Round 4.
130+
LOAD_M(R18, R14, R28, R26, R24, R12, R27, R29)
131+
ROUND_0
132+
SHUFFLE_1
133+
LOAD_M(R13, R16, R15, R30, R17, R25, R11, R19)
134+
ROUND_8
135+
SHUFFLE_2
136+
// Round 5.
137+
LOAD_M(R24, R16, R13, R25, R11, R18, R15, R30)
138+
ROUND_0
139+
SHUFFLE_1
140+
LOAD_M(R29, R26, R17, R14, R12, R27, R19, R28)
141+
ROUND_8
142+
SHUFFLE_2
143+
// Round 6.
144+
LOAD_M(R13, R17, R11, R19, R27, R25, R26, R14)
145+
ROUND_0
146+
SHUFFLE_1
147+
LOAD_M(R15, R18, R30, R12, R28, R16, R29, R24)
148+
ROUND_8
149+
SHUFFLE_2
150+
// Round 7.
151+
LOAD_M(R27, R12, R29, R15, R16, R30, R28, R25)
152+
ROUND_0
153+
SHUFFLE_1
154+
LOAD_M(R11, R17, R24, R19, R18, R14, R13, R26)
155+
ROUND_8
156+
SHUFFLE_2
157+
// Round 8.
158+
LOAD_M(R28, R18, R27, R14, R26, R29, R12, R24)
159+
ROUND_0
160+
SHUFFLE_1
161+
LOAD_M(R16, R30, R19, R13, R11, R15, R17, R25)
162+
ROUND_8
163+
SHUFFLE_2
164+
// Round 9.
165+
LOAD_M(R17, R29, R26, R11, R30, R24, R14, R19)
166+
ROUND_0
167+
SHUFFLE_1
168+
LOAD_M(R27, R28, R12, R25, R13, R18, R15, R16)
169+
ROUND_8
170+
SHUFFLE_2
171+
// Round 10.
172+
LOAD_M(R25, R19, R18, R12, R13, R15, R17, R16)
173+
ROUND_0
174+
SHUFFLE_1
175+
LOAD_M(R30, R24, R14, R28, R26, R29, R27, R11)
176+
ROUND_8
177+
SHUFFLE_2
178+
// Fold the working state back into h:
// h[0..3] ^= V0 ^ V2 and h[4..7] ^= V1 ^ V3.
179+
VMOVQ (R4), V8
180+
VMOVQ 16(R4), V9
181+
VXORV V8, V0, V8
182+
VXORV V9, V1, V9
183+
VXORV V8, V2, V8
184+
VXORV V9, V3, V9
185+
VMOVQ V8, (R4)
186+
VMOVQ V9, 16(R4)
187+
// Move on to the next 64-byte block; repeat until the remaining length is 0.
188+
SUBV $64, R8
189+
ADDV $64, R7
190+
BNE R8, R0, loop
191+
// Write the advanced counter back to c.
192+
MOVW R9, (R5)
193+
MOVW R10, 4(R5)
194+
195+
RET
196+

blake2s/blake2s_ref.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,10 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !386) || !gc || purego
5+
//go:build (!amd64 && !386 && !loong64) || !gc || purego
66

77
package blake2s
88

9-
var (
10-
useSSE4 = false
11-
useSSSE3 = false
12-
useSSE2 = false
13-
)
14-
159
// hashBlocks is the fallback selected by this file's build constraint; it
// forwards its arguments unchanged to hashBlocksGeneric.
func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
1610
hashBlocksGeneric(h, c, flag, blocks)
1711
}

blake2s/blake2s_var.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !amd64 && !386
6+
7+
package blake2s
8+
9+
var (
10+
useSSE4 = false
11+
useSSSE3 = false
12+
useSSE2 = false
13+
)

0 commit comments

Comments
 (0)