Skip to content

Commit 6f04a92

Browse files
committed
internal/chacha8rand: provide vector implementation for riscv64
Provide a vector implementation of chacha8rand for riscv64, which improves performance.

goos: linux
goarch: riscv64
pkg: internal/chacha8rand
cpu: Spacemit(R) X60
      │ /root/chacha8.rand.old.log │     /root/chacha8.rand.new.log      │
      │           sec/op           │   sec/op     vs base                │
Block                  1.640µ ± 0%   1.294µ ± 0%  -21.10% (p=0.000 n=10)

      │ /root/chacha8.rand.old.log │     /root/chacha8.rand.new.log      │
      │            B/s             │     B/s       vs base               │
Block                 148.9Mi ± 0%   188.6Mi ± 0%  +26.72% (p=0.000 n=10)

Change-Id: I1e04c5c44e5ce0c78814a6a48c5ab65e4d758937
Reviewed-on: https://go-review.googlesource.com/c/go/+/710035
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 54e3adc commit 6f04a92

File tree

3 files changed

+115
-1
lines changed

3 files changed

+115
-1
lines changed

src/internal/chacha8rand/chacha8.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
// Offsets into internal/cpu records for use in assembly.
1717
const (
1818
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) // byte offset of the HasLSX field within cpu.Loong64
19+
offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV) // byte offset of the HasV field within cpu.RISCV64; read by the riscv64 assembly as const_offsetRISCV64HasV
1920
)
2021

2122
const (
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "asm_riscv64.h"
6+
#include "go_asm.h"
7+
#include "textflag.h"
8+
9+
// TODO(mzh): use Zvkb if possible
10+
11+
// QR applies one ChaCha quarter-round to the vector registers A, B, C, D.
// It is four add/xor/rotate steps. Each rotate-left by n (n = 16, 12, 8, 7)
// is built from a left shift by n into a scratch register, a right shift
// by 32-n in place, and an XOR of the two results.
// V28, V29, V30 and V31 are used as scratch registers and are overwritten.
#define QR(A, B, C, D) \
12+
VADDVV A, B, A \
13+
VXORVV D, A, D \
14+
VSLLVI $16, D, V28 \
15+
VSRLVI $16, D, D \
16+
VXORVV V28, D, D \
17+
VADDVV D, C, C \
18+
VXORVV C, B, B \
19+
VSLLVI $12, B, V29 \
20+
VSRLVI $20, B, B \
21+
VXORVV V29, B, B \
22+
VADDVV B, A, A \
23+
VXORVV A, D, D \
24+
VSLLVI $8, D, V30 \
25+
VSRLVI $24, D, D \
26+
VXORVV V30, D, D \
27+
VADDVV D, C, C \
28+
VXORVV C, B, B \
29+
VSLLVI $7, B, V31 \
30+
VSRLVI $25, B, B \
31+
VXORVV V31, B, B
32+
33+
// block runs four ChaCha8 block transformations in parallel, one per 32-bit
// element of each V register: V0-V15 hold the 16 state rows, V20-V27 keep a
// copy of the seed rows, and V28-V31 are scratch registers used by QR.
34+
// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
35+
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
36+
// seed in X10 (pointer to the 8 seed words)
37+
// blocks in X11 (pointer to the 16x4-word output buffer)
38+
// counter in X12 (counter value for the first of the four blocks)
39+
40+
#ifndef hasV
41+
MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X13 // load the cpu.RISCV64.HasV flag (only when the hasV macro is not defined)
42+
BNEZ X13, vector_chacha8 // flag set: take the vector path below
43+
JMP ·block_generic<ABIInternal>(SB) // flag clear: jump to block_generic with the arguments untouched
44+
#endif
45+
46+
vector_chacha8:
47+
// Use four 32-bit elements per register; this relies on VLEN >= 128.
48+
VSETIVLI $4, E32, M1, TA, MA, X0 // vl=4, SEW=32, LMUL=1, tail- and mask-agnostic
49+
// Load initial constants into top row.
50+
MOV $·chachaConst(SB), X14 // X14 = address of the constant words
51+
VLSSEG4E32V (X14), X0, V0 // V0, V1, V2, V3 = const row; stride register X0 (zero stride) replicates each word into all elements
52+
VLSSEG8E32V (X10), X0, V4 // V4 ... V11 = seed words, each replicated into all elements the same way
53+
VIDV V12 // V12 = element indices 0, 1, 2, 3
54+
VADDVX X12, V12, V12 // V12 = counter + element index, one counter value per block
55+
56+
// Clear all nonces (V13, V14, V15).
57+
VXORVV V13, V13, V13
58+
VXORVV V14, V14, V14
59+
VXORVV V15, V15, V15
60+
61+
// Copy the initial seed rows V4-V11 into V20-V27 for the final addition.
62+
VMV4RV V4, V20 // V20-V23 = V4-V7
63+
VMV4RV V8, V24 // V24-V27 = V8-V11
64+
65+
MOV $4, X15 // loop count: 4 iterations, each applying QR to the four columns and then to the four diagonals
66+
PCALIGN $16
67+
loop:
68+
QR(V0, V4, V8, V12)
69+
QR(V1, V5, V9, V13)
70+
QR(V2, V6, V10, V14)
71+
QR(V3, V7, V11, V15)
72+
73+
QR(V0, V5, V10, V15)
74+
QR(V1, V6, V11, V12)
75+
QR(V2, V7, V8, V13)
76+
QR(V3, V4, V9, V14)
77+
78+
SUB $1, X15
79+
BNEZ X15, loop
80+
81+
VADDVV V20, V4, V4 // add the saved seed rows back into V4-V11 only; V0-V3 and V12-V15 are stored as computed
82+
VADDVV V21, V5, V5
83+
VADDVV V22, V6, V6
84+
VADDVV V23, V7, V7
85+
VADDVV V24, V8, V8
86+
VADDVV V25, V9, V9
87+
VADDVV V26, V10, V10
88+
VADDVV V27, V11, V11
89+
90+
VSE32V V0, (X11); ADD $16, X11; // store each row (4 words = 16 bytes) and advance the output pointer to the next row
91+
VSE32V V1, (X11); ADD $16, X11;
92+
VSE32V V2, (X11); ADD $16, X11;
93+
VSE32V V3, (X11); ADD $16, X11;
94+
VSE32V V4, (X11); ADD $16, X11;
95+
VSE32V V5, (X11); ADD $16, X11;
96+
VSE32V V6, (X11); ADD $16, X11;
97+
VSE32V V7, (X11); ADD $16, X11;
98+
VSE32V V8, (X11); ADD $16, X11;
99+
VSE32V V9, (X11); ADD $16, X11;
100+
VSE32V V10, (X11); ADD $16, X11;
101+
VSE32V V11, (X11); ADD $16, X11;
102+
VSE32V V12, (X11); ADD $16, X11;
103+
VSE32V V13, (X11); ADD $16, X11;
104+
VSE32V V14, (X11); ADD $16, X11;
105+
VSE32V V15, (X11); ADD $16, X11; // NOTE(review): this final ADD is redundant; X11 is not read again before RET
106+
107+
RET
108+
109+
// chachaConst holds the four 32-bit ChaCha constant words; as little-endian
// bytes they spell the ASCII string "expand 32-byte k".
// NOTE(review): the symbol is declared with size 32, but only the first 16
// bytes are initialized here and block loads only four words from it;
// confirm whether $16 was intended.
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
110+
DATA ·chachaConst+0x00(SB)/4, $0x61707865
111+
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
112+
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
113+
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574

src/internal/chacha8rand/chacha8_stub.s

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !arm64 && !loong64
5+
//go:build !amd64 && !arm64 && !loong64 && !riscv64
66

77
#include "textflag.h"
88

0 commit comments

Comments
 (0)