Skip to content

Commit 6e165b4

Browse files
committed
cmd/compile: implement Avg64u, Hmul64, Hmul64u for wasm
This lets us remove useAvg and useHmul from the division rules.
The compiler is simpler and the generated code is faster.

goos: wasip1
goarch: wasm
pkg: internal/strconv
                               │   old.txt   │               new.txt               │
                               │   sec/op    │   sec/op     vs base                │
AppendFloat/Decimal             192.8n ± 1%   194.6n ± 0%    +0.91% (p=0.000 n=10)
AppendFloat/Float               328.6n ± 0%   279.6n ± 0%   -14.93% (p=0.000 n=10)
AppendFloat/Exp                 335.6n ± 1%   289.2n ± 1%   -13.80% (p=0.000 n=10)
AppendFloat/NegExp              336.0n ± 0%   289.1n ± 1%   -13.97% (p=0.000 n=10)
AppendFloat/LongExp             332.4n ± 0%   285.2n ± 1%   -14.20% (p=0.000 n=10)
AppendFloat/Big                 348.2n ± 0%   300.1n ± 0%   -13.83% (p=0.000 n=10)
AppendFloat/BinaryExp           137.4n ± 0%   138.2n ± 0%    +0.55% (p=0.001 n=10)
AppendFloat/32Integer           193.3n ± 1%   196.5n ± 0%    +1.66% (p=0.000 n=10)
AppendFloat/32ExactFraction     283.3n ± 0%   268.9n ± 1%    -5.08% (p=0.000 n=10)
AppendFloat/32Point             279.9n ± 0%   266.5n ± 0%    -4.80% (p=0.000 n=10)
AppendFloat/32Exp               300.1n ± 0%   288.3n ± 1%    -3.90% (p=0.000 n=10)
AppendFloat/32NegExp            288.2n ± 1%   277.9n ± 1%    -3.59% (p=0.000 n=10)
AppendFloat/32Shortest          261.7n ± 0%   250.2n ± 0%    -4.39% (p=0.000 n=10)
AppendFloat/32Fixed8Hard        173.3n ± 1%   158.9n ± 1%    -8.31% (p=0.000 n=10)
AppendFloat/32Fixed9Hard        180.0n ± 0%   167.9n ± 2%    -6.70% (p=0.000 n=10)
AppendFloat/64Fixed1            167.1n ± 0%   149.6n ± 1%   -10.50% (p=0.000 n=10)
AppendFloat/64Fixed2            162.4n ± 1%   146.5n ± 0%    -9.73% (p=0.000 n=10)
AppendFloat/64Fixed2.5          165.5n ± 0%   149.4n ± 1%    -9.70% (p=0.000 n=10)
AppendFloat/64Fixed3            166.4n ± 1%   150.2n ± 0%    -9.74% (p=0.000 n=10)
AppendFloat/64Fixed4            163.7n ± 0%   149.6n ± 1%    -8.62% (p=0.000 n=10)
AppendFloat/64Fixed5Hard        182.8n ± 1%   167.1n ± 1%    -8.61% (p=0.000 n=10)
AppendFloat/64Fixed12           222.2n ± 0%   208.8n ± 0%    -6.05% (p=0.000 n=10)
AppendFloat/64Fixed16           197.6n ± 1%   181.7n ± 0%    -8.02% (p=0.000 n=10)
AppendFloat/64Fixed12Hard       194.5n ± 0%   181.0n ± 0%    -6.99% (p=0.000 n=10)
AppendFloat/64Fixed17Hard       205.1n ± 1%   191.9n ± 0%    -6.44% (p=0.000 n=10)
AppendFloat/64Fixed18Hard       6.269µ ± 0%   6.643µ ± 0%    +5.97% (p=0.000 n=10)
AppendFloat/64FixedF1           211.7n ± 1%   197.0n ± 0%    -6.95% (p=0.000 n=10)
AppendFloat/64FixedF2           189.4n ± 0%   174.2n ± 0%    -8.08% (p=0.000 n=10)
AppendFloat/64FixedF3           169.0n ± 0%   154.9n ± 0%    -8.32% (p=0.000 n=10)
AppendFloat/Slowpath64          321.2n ± 0%   274.2n ± 1%   -14.63% (p=0.000 n=10)
AppendFloat/SlowpathDenormal64  307.4n ± 1%   261.2n ± 0%   -15.03% (p=0.000 n=10)
AppendInt                       3.367µ ± 1%   3.376µ ± 0%         ~ (p=0.517 n=10)
AppendUint                      675.5n ± 0%   676.9n ± 0%         ~ (p=0.196 n=10)
AppendIntSmall                  28.13n ± 1%   28.17n ± 0%    +0.14% (p=0.015 n=10)
AppendUintVarlen/digits=1       20.70n ± 0%   20.51n ± 1%    -0.89% (p=0.018 n=10)
AppendUintVarlen/digits=2       20.43n ± 0%   20.27n ± 0%    -0.81% (p=0.001 n=10)
AppendUintVarlen/digits=3       38.48n ± 0%   37.93n ± 0%    -1.43% (p=0.000 n=10)
AppendUintVarlen/digits=4       41.10n ± 0%   38.78n ± 1%    -5.62% (p=0.000 n=10)
AppendUintVarlen/digits=5       42.25n ± 1%   42.11n ± 0%    -0.32% (p=0.041 n=10)
AppendUintVarlen/digits=6       45.40n ± 1%   43.14n ± 0%    -4.98% (p=0.000 n=10)
AppendUintVarlen/digits=7       46.81n ± 1%   46.03n ± 0%    -1.66% (p=0.000 n=10)
AppendUintVarlen/digits=8       48.88n ± 1%   46.59n ± 1%    -4.68% (p=0.000 n=10)
AppendUintVarlen/digits=9       49.94n ± 2%   49.41n ± 1%    -1.06% (p=0.000 n=10)
AppendUintVarlen/digits=10      57.28n ± 1%   56.92n ± 1%    -0.62% (p=0.045 n=10)
AppendUintVarlen/digits=11      60.09n ± 1%   58.11n ± 2%    -3.30% (p=0.000 n=10)
AppendUintVarlen/digits=12      62.22n ± 0%   61.85n ± 0%    -0.59% (p=0.000 n=10)
AppendUintVarlen/digits=13      64.94n ± 0%   62.92n ± 0%    -3.10% (p=0.000 n=10)
AppendUintVarlen/digits=14      65.42n ± 1%   65.19n ± 1%    -0.34% (p=0.005 n=10)
AppendUintVarlen/digits=15      68.17n ± 0%   66.13n ± 0%    -2.99% (p=0.000 n=10)
AppendUintVarlen/digits=16      70.21n ± 1%   70.09n ± 1%         ~ (p=0.517 n=10)
AppendUintVarlen/digits=17      72.93n ± 0%   70.49n ± 0%    -3.34% (p=0.000 n=10)
AppendUintVarlen/digits=18      73.01n ± 0%   72.75n ± 0%    -0.35% (p=0.000 n=10)
AppendUintVarlen/digits=19      79.27n ± 1%   79.49n ± 1%         ~ (p=0.671 n=10)
AppendUintVarlen/digits=20      82.18n ± 0%   80.43n ± 1%    -2.14% (p=0.000 n=10)
geomean                         143.4n        136.0n         -5.20%

Change-Id: I8245814a0259ad13cf9225f57db8e9fe3d2e4267
Reviewed-on: https://go-review.googlesource.com/c/go/+/717407
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
1 parent 9f6590f commit 6e165b4

File tree

8 files changed

+322
-94
lines changed

8 files changed

+322
-94
lines changed

src/cmd/compile/internal/ssa/_gen/Wasm.rules

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
(Last ___) => v.Args[len(v.Args)-1]
6+
57
// Lowering arithmetic
68
(Add(64|32|16|8|Ptr) ...) => (I64Add ...)
79
(Add(64|32)F ...) => (F(64|32)Add ...)
@@ -44,6 +46,37 @@
4446

4547
(Not ...) => (I64Eqz ...)
4648

49+
(Avg64u x y) => (I64Add (I64ShrU (I64Sub x y) (I64Const [1])) y)
50+
51+
// High word of multiply without carry bits; see Hacker's Delight, 2nd ed., Figure 8-2, p. 174.
52+
(Hmul64 <t> x y) =>
53+
(Last <t>
54+
x0: (ZeroExt32to64 x)
55+
x1: (I64ShrS x (I64Const [32]))
56+
y0: (ZeroExt32to64 y)
57+
y1: (I64ShrS y (I64Const [32]))
58+
x0y0: (I64Mul x0 y0)
59+
tt: (I64Add (I64Mul x1 y0) (I64ShrU x0y0 (I64Const [32])))
60+
w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt))
61+
w2: (I64ShrS tt (I64Const [32]))
62+
(I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrS w1 (I64Const [32]))))
63+
64+
// Same as Hmul64, but with the signed shifts replaced by unsigned shifts.
65+
(Hmul64u <t> x y) =>
66+
(Last <t>
67+
x0: (ZeroExt32to64 x)
68+
x1: (I64ShrU x (I64Const [32]))
69+
y0: (ZeroExt32to64 y)
70+
y1: (I64ShrU y (I64Const [32]))
71+
w0: (I64Mul x0 y0)
72+
tt: (I64Add (I64Mul x1 y0) (I64ShrU w0 (I64Const [32])))
73+
w1: (I64Add (I64Mul x0 y1) (ZeroExt32to64 tt))
74+
w2: (I64ShrU tt (I64Const [32]))
75+
hi: (I64Add (I64Add (I64Mul x1 y1) w2) (I64ShrU w1 (I64Const [32]))))
76+
77+
(Select0 <t> (Mul64uhilo x y)) => (Hmul64u <t> x y)
78+
(Select1 <t> (Mul64uhilo x y)) => (I64Mul x y)
79+
4780
// Lowering pointer arithmetic
4881
(OffPtr ...) => (I64AddConst ...)
4982

src/cmd/compile/internal/ssa/_gen/divmod.rules

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -79,17 +79,9 @@
7979
// The magic number m for c is ⌈2^k/c⌉, so we can use
8080
// (m+1)/2 = ⌈2^k/(c/2)⌉ instead.
8181
//
82-
// 8. An unsigned divide on systems with an avg instruction.
82+
// 8. A general unsigned divide using an avg instruction.
8383
// We noted above that (x*((1<<N)+m))>>N>>s = ((x*m)>>N+x)>>s.
8484
// Let hi = (x*m)>>N, so we want (hi+x) >> s = avg(hi, x) >> (s-1).
85-
//
86-
// 9. Unsigned 64-bit divide by 16-bit constant on 32-bit systems.
87-
// Use long division with 16-bit digits.
88-
//
89-
// Note: All systems have Hmul and Avg except for wasm, and the
90-
// wasm JITs may well apply all these optimizations already anyway,
91-
// so it may be worth looking into avoiding this pass entirely on wasm
92-
// and dropping all the useAvg useHmul uncertainty.
9385

9486
// Case 1. Signed divides where 2N ≤ register size.
9587
(Div8 <t> x (Const8 [c])) && smagicOK8(c) =>
@@ -112,27 +104,27 @@
112104
(Rsh64x64 <t> (SignExt32to64 x) (Const64 <typ.UInt64> [63])))
113105

114106
// Case 2. Signed divides where m is even.
115-
(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 && config.useHmul =>
107+
(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 == 0 =>
116108
(Sub32 <t>
117109
(Rsh32x64 <t>
118110
(Hmul32 <t> x (Const32 <typ.UInt32> [int32(smagic32(c).m/2)]))
119111
(Const64 <typ.UInt64> [smagic32(c).s - 1]))
120112
(Rsh32x64 <t> x (Const64 <typ.UInt64> [31])))
121-
(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 && config.useHmul =>
113+
(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 == 0 =>
122114
(Sub64 <t>
123115
(Rsh64x64 <t>
124116
(Hmul64 <t> x (Const64 <typ.UInt64> [int64(smagic64(c).m/2)]))
125117
(Const64 <typ.UInt64> [smagic64(c).s - 1]))
126118
(Rsh64x64 <t> x (Const64 <typ.UInt64> [63])))
127119

128120
// Case 3. Signed divides where m is odd.
129-
(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 && config.useHmul =>
121+
(Div32 <t> x (Const32 [c])) && smagicOK32(c) && config.RegSize == 4 && smagic32(c).m&1 != 0 =>
130122
(Sub32 <t>
131123
(Rsh32x64 <t>
132124
(Add32 <t> x (Hmul32 <t> x (Const32 <typ.UInt32> [int32(smagic32(c).m)])))
133125
(Const64 <typ.UInt64> [smagic32(c).s]))
134126
(Rsh32x64 <t> x (Const64 <typ.UInt64> [31])))
135-
(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 && config.useHmul =>
127+
(Div64 <t> x (Const64 [c])) && smagicOK64(c) && smagic64(c).m&1 != 0 =>
136128
(Sub64 <t>
137129
(Rsh64x64 <t>
138130
(Add64 <t> x (Hmul64 <t> x (Const64 <typ.UInt64> [int64(smagic64(c).m)])))
@@ -149,11 +141,11 @@
149141
(Rsh64Ux64 <t>
150142
(Mul64 <typ.UInt64> (SignExt32to64 x) (Const64 <typ.UInt64> [int64(smagic32(c).m)]))
151143
(Const64 <typ.UInt64> [32 + smagic32(c).s]))
152-
(Div32u <t> x (Const32 [c])) && t.IsSigned() && smagicOK32(c) && config.RegSize == 4 && config.useHmul =>
144+
(Div32u <t> x (Const32 [c])) && t.IsSigned() && smagicOK32(c) && config.RegSize == 4 =>
153145
(Rsh32Ux64 <t>
154146
(Hmul32u <typ.UInt32> x (Const32 <typ.UInt32> [int32(smagic32(c).m)]))
155147
(Const64 <typ.UInt64> [smagic32(c).s]))
156-
(Div64u <t> x (Const64 [c])) && t.IsSigned() && smagicOK64(c) && config.useHmul =>
148+
(Div64u <t> x (Const64 [c])) && t.IsSigned() && smagicOK64(c) =>
157149
(Rsh64Ux64 <t>
158150
(Hmul64u <typ.UInt64> x (Const64 <typ.UInt64> [int64(smagic64(c).m)]))
159151
(Const64 <typ.UInt64> [smagic64(c).s]))
@@ -181,11 +173,11 @@
181173
(Rsh64Ux64 <typ.UInt64>
182174
(Mul64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [int64(1<<31 + umagic32(c).m/2)]))
183175
(Const64 <typ.UInt64> [32 + umagic32(c).s - 1])))
184-
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 && config.useHmul =>
176+
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && umagic32(c).m&1 == 0 && config.RegSize == 4 =>
185177
(Rsh32Ux64 <t>
186178
(Hmul32u <typ.UInt32> x (Const32 <typ.UInt32> [int32(1<<31 + umagic32(c).m/2)]))
187179
(Const64 <typ.UInt64> [umagic32(c).s - 1]))
188-
(Div64u <t> x (Const64 [c])) && umagicOK64(c) && umagic64(c).m&1 == 0 && config.useHmul =>
180+
(Div64u <t> x (Const64 [c])) && umagicOK64(c) && umagic64(c).m&1 == 0 =>
189181
(Rsh64Ux64 <t>
190182
(Hmul64u <typ.UInt64> x (Const64 <typ.UInt64> [int64(1<<63 + umagic64(c).m/2)]))
191183
(Const64 <typ.UInt64> [umagic64(c).s - 1]))
@@ -205,39 +197,39 @@
205197
(Rsh64Ux64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [1]))
206198
(Const64 <typ.UInt64> [int64(1<<31 + (umagic32(c).m+1)/2)]))
207199
(Const64 <typ.UInt64> [32 + umagic32(c).s - 2])))
208-
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 && config.useHmul =>
200+
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && c&1 == 0 =>
209201
(Rsh32Ux64 <t>
210202
(Hmul32u <typ.UInt32>
211203
(Rsh32Ux64 <typ.UInt32> x (Const64 <typ.UInt64> [1]))
212204
(Const32 <typ.UInt32> [int32(1<<31 + (umagic32(c).m+1)/2)]))
213205
(Const64 <typ.UInt64> [umagic32(c).s - 2]))
214-
(Div64u <t> x (Const64 [c])) && umagicOK64(c) && c&1 == 0 && config.useHmul =>
206+
(Div64u <t> x (Const64 [c])) && umagicOK64(c) && c&1 == 0 =>
215207
(Rsh64Ux64 <t>
216208
(Hmul64u <typ.UInt64>
217209
(Rsh64Ux64 <typ.UInt64> x (Const64 <typ.UInt64> [1]))
218210
(Const64 <typ.UInt64> [int64(1<<63 + (umagic64(c).m+1)/2)]))
219211
(Const64 <typ.UInt64> [umagic64(c).s - 2]))
220212

221-
// Case 8. Unsigned divide on systems with avg.
222-
(Div16u <t> x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 && config.useAvg =>
213+
// Case 8. Unsigned divide using avg.
214+
(Div16u <t> x (Const16 [c])) && umagicOK16(c) && config.RegSize == 4 =>
223215
(Trunc32to16 <t>
224216
(Rsh32Ux64 <typ.UInt32>
225217
(Avg32u
226218
(Lsh32x64 <typ.UInt32> (ZeroExt16to32 x) (Const64 <typ.UInt64> [16]))
227219
(Mul32 <typ.UInt32> (ZeroExt16to32 x) (Const32 <typ.UInt32> [int32(umagic16(c).m)])))
228220
(Const64 <typ.UInt64> [16 + umagic16(c).s - 1])))
229-
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 && config.useAvg =>
221+
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 8 =>
230222
(Trunc64to32 <t>
231223
(Rsh64Ux64 <typ.UInt64>
232224
(Avg64u
233225
(Lsh64x64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt64> [32]))
234226
(Mul64 <typ.UInt64> (ZeroExt32to64 x) (Const64 <typ.UInt32> [int64(umagic32(c).m)])))
235227
(Const64 <typ.UInt64> [32 + umagic32(c).s - 1])))
236-
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 && config.useAvg && config.useHmul =>
228+
(Div32u <t> x (Const32 [c])) && umagicOK32(c) && config.RegSize == 4 =>
237229
(Rsh32Ux64 <t>
238230
(Avg32u x (Hmul32u <typ.UInt32> x (Const32 <typ.UInt32> [int32(umagic32(c).m)])))
239231
(Const64 <typ.UInt64> [umagic32(c).s - 1]))
240-
(Div64u <t> x (Const64 [c])) && umagicOK64(c) && config.useAvg && config.useHmul =>
232+
(Div64u <t> x (Const64 [c])) && umagicOK64(c) =>
241233
(Rsh64Ux64 <t>
242234
(Avg64u x (Hmul64u <typ.UInt64> x (Const64 <typ.UInt64> [int64(umagic64(c).m)])))
243235
(Const64 <typ.UInt64> [umagic64(c).s - 1]))

src/cmd/compile/internal/ssa/config.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@ type Config struct {
4141
hasGReg bool // has hardware g register
4242
ctxt *obj.Link // Generic arch information
4343
optimize bool // Do optimization
44-
useAvg bool // Use optimizations that need Avg* operations
45-
useHmul bool // Use optimizations that need Hmul* operations
4644
SoftFloat bool //
4745
Race bool // race detector enabled
4846
BigEndian bool //
@@ -168,8 +166,6 @@ type Frontend interface {
168166
// NewConfig returns a new configuration object for the given architecture.
169167
func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat bool) *Config {
170168
c := &Config{arch: arch, Types: types}
171-
c.useAvg = true
172-
c.useHmul = true
173169
switch arch {
174170
case "amd64":
175171
c.PtrSize = 8
@@ -359,8 +355,6 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo
359355
c.FPReg = framepointerRegWasm
360356
c.LinkReg = linkRegWasm
361357
c.hasGReg = true
362-
c.useAvg = false
363-
c.useHmul = false
364358
c.unalignedOK = true
365359
c.haveCondSelect = true
366360
default:

0 commit comments

Comments
 (0)