Skip to content

Commit 5c9a26c

Browse files
vleonen authored and randall77 committed
cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop
Raspberry Pi 5 (Cortex-A76)

                     │  base.log    │               opt.log                │
                     │    sec/op    │    sec/op     vs base                │
MemmoveKnownSize112     3.549n ± 0%    3.652n ± 0%   +2.92% (p=0.000 n=10)
MemmoveKnownSize128     3.979n ± 0%    3.617n ± 0%   -9.09% (p=0.000 n=10)
MemmoveKnownSize192     7.566n ± 0%    5.074n ± 0%  -32.94% (p=0.000 n=10)
MemmoveKnownSize248     8.549n ± 0%    7.184n ± 1%  -15.97% (p=0.000 n=10)
MemmoveKnownSize256    10.010n ± 0%    6.827n ± 0%  -31.80% (p=0.000 n=10)
MemmoveKnownSize512     19.81n ± 0%    13.59n ± 0%  -31.40% (p=0.000 n=10)
MemmoveKnownSize1024    39.66n ± 0%    27.00n ± 0%  -31.93% (p=0.000 n=10)
geomean                 9.538n         7.392n       -22.50%

Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 61d1ff6 commit 5c9a26c

File tree

3 files changed

+79
-33
lines changed

3 files changed

+79
-33
lines changed

src/cmd/compile/internal/arm64/ssa.go

Lines changed: 67 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1189,19 +1189,27 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
11891189
if dstReg == srcReg {
11901190
break
11911191
}
1192-
tmpReg1 := int16(arm64.REG_R24)
1193-
tmpReg2 := int16(arm64.REG_R25)
1192+
tmpReg1 := int16(arm64.REG_R25)
1193+
tmpFReg1 := int16(arm64.REG_F16)
1194+
tmpFReg2 := int16(arm64.REG_F17)
11941195
n := v.AuxInt
11951196
if n < 16 {
11961197
v.Fatalf("Move too small %d", n)
11971198
}
11981199

11991200
// Generate copying instructions.
12001201
var off int64
1202+
for n >= 32 {
1203+
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
1204+
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
1205+
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
1206+
off += 32
1207+
n -= 32
1208+
}
12011209
for n >= 16 {
1202-
// LDP off(srcReg), (tmpReg1, tmpReg2)
1203-
// STP (tmpReg1, tmpReg2), off(dstReg)
1204-
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
1210+
// FMOVQ off(src), tmpFReg1
1211+
// FMOVQ tmpFReg1, off(dst)
1212+
move16(s, srcReg, dstReg, tmpFReg1, off, false)
12051213
off += 16
12061214
n -= 16
12071215
}
@@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
12231231
if dstReg == srcReg {
12241232
break
12251233
}
1226-
countReg := int16(arm64.REG_R23)
1227-
tmpReg1 := int16(arm64.REG_R24)
1228-
tmpReg2 := int16(arm64.REG_R25)
1234+
countReg := int16(arm64.REG_R24)
1235+
tmpReg1 := int16(arm64.REG_R25)
1236+
tmpFReg1 := int16(arm64.REG_F16)
1237+
tmpFReg2 := int16(arm64.REG_F17)
12291238
n := v.AuxInt
12301239
loopSize := int64(64)
12311240
if n < 3*loopSize {
@@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
12511260

12521261
// Move loopSize bytes starting at srcReg to dstReg.
12531262
// Increment srcReg and destReg by loopSize as a side effect.
1254-
for range loopSize / 16 {
1255-
// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
1256-
// STP.P (tmpReg1, tmpReg2), 16(dstReg)
1257-
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
1263+
for range loopSize / 32 {
1264+
// FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
1265+
// FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
1266+
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
12581267
}
12591268
// Decrement loop count.
12601269
// SUB $1, countReg
@@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
12761285

12771286
// Copy any fractional portion.
12781287
var off int64
1288+
for n >= 32 {
1289+
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
1290+
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
1291+
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
1292+
off += 32
1293+
n -= 32
1294+
}
12791295
for n >= 16 {
1280-
// LDP off(srcReg), (tmpReg1, tmpReg2)
1281-
// STP (tmpReg1, tmpReg2), off(dstReg)
1282-
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
1296+
// FMOVQ off(src), tmpFReg1
1297+
// FMOVQ tmpFReg1, off(dst)
1298+
move16(s, srcReg, dstReg, tmpFReg1, off, false)
12831299
off += 16
12841300
n -= 16
12851301
}
@@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
16991715
p.To.Offset = off
17001716
}
17011717

1702-
// move16 copies 16 bytes at src+off to dst+off.
1718+
// move32 copies 32 bytes at src+off to dst+off.
17031719
// Uses registers tmp1 and tmp2.
1704-
// If postInc is true, increment src and dst by 16.
1705-
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
1706-
// LDP off(src), (tmp1, tmp2)
1707-
ld := s.Prog(arm64.ALDP)
1720+
// If postInc is true, increment src and dst by 32.
1721+
func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
1722+
// FLDPQ off(src), (tmp1, tmp2)
1723+
ld := s.Prog(arm64.AFLDPQ)
17081724
ld.From.Type = obj.TYPE_MEM
17091725
ld.From.Reg = src
17101726
ld.From.Offset = off
17111727
ld.To.Type = obj.TYPE_REGREG
17121728
ld.To.Reg = tmp1
17131729
ld.To.Offset = int64(tmp2)
1714-
// STP (tmp1, tmp2), off(dst)
1715-
st := s.Prog(arm64.ASTP)
1730+
// FSTPQ (tmp1, tmp2), off(dst)
1731+
st := s.Prog(arm64.AFSTPQ)
17161732
st.From.Type = obj.TYPE_REGREG
17171733
st.From.Reg = tmp1
17181734
st.From.Offset = int64(tmp2)
17191735
st.To.Type = obj.TYPE_MEM
17201736
st.To.Reg = dst
17211737
st.To.Offset = off
1738+
if postInc {
1739+
if off != 0 {
1740+
panic("can't postinc with non-zero offset")
1741+
}
1742+
ld.Scond = arm64.C_XPOST
1743+
st.Scond = arm64.C_XPOST
1744+
ld.From.Offset = 32
1745+
st.To.Offset = 32
1746+
}
1747+
}
1748+
1749+
// move16 copies 16 bytes at src+off to dst+off.
1750+
// Uses register tmp1
1751+
// If postInc is true, increment src and dst by 16.
1752+
func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
1753+
// FMOVQ off(src), tmp1
1754+
ld := s.Prog(arm64.AFMOVQ)
1755+
ld.From.Type = obj.TYPE_MEM
1756+
ld.From.Reg = src
1757+
ld.From.Offset = off
1758+
ld.To.Type = obj.TYPE_REG
1759+
ld.To.Reg = tmp1
1760+
// FMOVQ tmp1, off(dst)
1761+
st := s.Prog(arm64.AFMOVQ)
1762+
st.From.Type = obj.TYPE_REG
1763+
st.From.Reg = tmp1
1764+
st.To.Type = obj.TYPE_MEM
1765+
st.To.Reg = dst
1766+
st.To.Offset = off
17221767
if postInc {
17231768
if off != 0 {
17241769
panic("can't postinc with non-zero offset")

src/cmd/compile/internal/ssa/_gen/ARM64Ops.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,9 @@ func init() {
144144
gpspsbg = gpspg | buildReg("SB")
145145
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
146146
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
147+
r25 = buildReg("R25")
147148
r24to25 = buildReg("R24 R25")
148-
r23to25 = buildReg("R23 R24 R25")
149+
f16to17 = buildReg("F16 F17")
149150
rz = buildReg("ZERO")
150151
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
151152
)
@@ -599,8 +600,8 @@ func init() {
599600
aux: "Int64",
600601
argLength: 3,
601602
reg: regInfo{
602-
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
603-
clobbers: r24to25, // TODO: figure out needIntTemp x2
603+
inputs: []regMask{gp &^ r25, gp &^ r25},
604+
clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
604605
},
605606
faultOnNilArg0: true,
606607
faultOnNilArg1: true,
@@ -617,8 +618,8 @@ func init() {
617618
aux: "Int64",
618619
argLength: 3,
619620
reg: regInfo{
620-
inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
621-
clobbers: r23to25, // TODO: figure out needIntTemp x3
621+
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
622+
clobbers: r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
622623
clobbersArg0: true,
623624
clobbersArg1: true,
624625
},

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)