Commit 2e8543c
authored
[X86] Improve variable 8-bit shifts on AVX512BW (#164136)
Previously, `clang -march=znver5 -O3` would emit the following for
`shl`, `lshr` and `ashr <64 x i8>`:
```asm
.LCPI0_2:
.byte 8
.byte 4
.byte 2
.byte 1
.byte 0
.byte 0
.byte 0
.byte 0
.LCPI0_3:
.byte 32
.byte 16
.byte 8
.byte 4
.byte 2
.byte 1
.byte 0
.byte 0
shl:
vpsllw zmm1, zmm1, 5
vpmovb2m k1, zmm1
vpaddb zmm1, zmm1, zmm1
vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_2]{1to8}, 0
vpmovb2m k1, zmm1
vpaddb zmm1, zmm1, zmm1
vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_3]{1to8}, 0
vpmovb2m k1, zmm1
vpaddb zmm0 {k1}, zmm0, zmm0
ret
.LCPI1_3:
.byte 0
.byte 0
.byte 0
.byte 0
.byte 128
.byte 64
.byte 32
.byte 16
.LCPI1_4:
.byte 0
.byte 0
.byte 128
.byte 64
.byte 32
.byte 16
.byte 8
.byte 4
.LCPI1_5:
.byte 0
.byte 128
.byte 64
.byte 32
.byte 16
.byte 8
.byte 4
.byte 2
lshr:
vpsllw zmm1, zmm1, 5
vpmovb2m k1, zmm1
vpaddb zmm1, zmm1, zmm1
vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_3]{1to8}, 0
vpmovb2m k1, zmm1
vpaddb zmm1, zmm1, zmm1
vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_4]{1to8}, 0
vpmovb2m k1, zmm1
vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_5]{1to8}, 0
ret
ashr:
vpsllw zmm1, zmm1, 5
vpunpckhbw zmm2, zmm0, zmm0
vpunpckhbw zmm4, zmm1, zmm1
vpsraw zmm3, zmm2, 4
vpunpcklbw zmm0, zmm0, zmm0
vpmovb2m k1, zmm4
vpaddw zmm4, zmm4, zmm4
vpunpcklbw zmm1, zmm1, zmm1
vmovdqu8 zmm2 {k1}, zmm3
vpmovb2m k1, zmm4
vpsraw zmm3, zmm2, 2
vpaddw zmm4, zmm4, zmm4
vmovdqu8 zmm2 {k1}, zmm3
vpsraw zmm3, zmm2, 1
vpmovb2m k1, zmm4
vmovdqu8 zmm2 {k1}, zmm3
vpmovb2m k1, zmm1
vpsraw zmm3, zmm0, 4
vpaddw zmm1, zmm1, zmm1
vpsrlw zmm2, zmm2, 8
vmovdqu8 zmm0 {k1}, zmm3
vpmovb2m k1, zmm1
vpsraw zmm3, zmm0, 2
vpaddw zmm1, zmm1, zmm1
vmovdqu8 zmm0 {k1}, zmm3
vpsraw zmm3, zmm0, 1
vpmovb2m k1, zmm1
vmovdqu8 zmm0 {k1}, zmm3
vpsrlw zmm0, zmm0, 8
vpackuswb zmm0, zmm0, zmm2
ret
```
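With this commit, the generated assembly becomes this (the even and odd
bytes of each 16-bit word are shifted separately with variable word shifts
and then blended under the mask `0xAAAAAAAAAAAAAAAA`, i.e. the `movabs`
constant -6148914691236517206, which selects the odd bytes):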
```asm
.LCPI0_2:
.byte 0
.byte 255
.byte 0
.byte 255
.LCPI0_3:
.byte 255
.byte 0
.byte 255
.byte 0
shl:
vpsrlw zmm2, zmm1, 8
vpandd zmm3, zmm0, dword ptr [rip + .LCPI0_2]{1to16}
vpandd zmm1, zmm1, dword ptr [rip + .LCPI0_3]{1to16}
movabs rax, -6148914691236517206
kmovq k1, rax
vpsllvw zmm2, zmm3, zmm2
vpsllvw zmm0, zmm0, zmm1
vmovdqu8 zmm0 {k1}, zmm2
ret
.LCPI1_0:
.byte 255
.byte 0
lshr:
vpbroadcastw zmm2, word ptr [rip + .LCPI1_0]
movabs rax, -6148914691236517206
kmovq k1, rax
vpandq zmm3, zmm1, zmm2
vpandq zmm2, zmm0, zmm2
vpsrlw zmm1, zmm1, 8
vpsrlvw zmm2, zmm2, zmm3
vpsrlvw zmm0, zmm0, zmm1
vmovdqu8 zmm2 {k1}, zmm0
vmovdqa64 zmm0, zmm2
ret
.LCPI2_1:
.byte 255
.byte 0
.byte 255
.byte 0
ashr:
vpsrlw zmm2, zmm1, 8
vpandd zmm1, zmm1, dword ptr [rip + .LCPI2_1]{1to16}
movabs rax, -6148914691236517206
vpsravw zmm2, zmm0, zmm2
vpsllw zmm0, zmm0, 8
kmovq k1, rax
vpsraw zmm0, zmm0, 8
vpsravw zmm0, zmm0, zmm1
vmovdqu8 zmm0 {k1}, zmm2
ret
```
While I don't have AVX512 hardware, llvm-mca suggests significant
speedups, and I've done some simple correctness tests on random inputs
using the Intel Software Development Emulator.

1 parent 831e79a commit 2e8543c
File tree
5 files changed
+109
-104
lines changed
- llvm
- lib/Target/X86
- test/CodeGen/X86
5 files changed
+109
-104
lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
30908 | 30908 | | |
30909 | 30909 | | |
30910 | 30910 | | |
| 30911 | + | |
| 30912 | + | |
| 30913 | + | |
| 30914 | + | |
| 30915 | + | |
| 30916 | + | |
| 30917 | + | |
| 30918 | + | |
| 30919 | + | |
| 30920 | + | |
| 30921 | + | |
| 30922 | + | |
| 30923 | + | |
| 30924 | + | |
| 30925 | + | |
| 30926 | + | |
| 30927 | + | |
| 30928 | + | |
| 30929 | + | |
| 30930 | + | |
| 30931 | + | |
| 30932 | + | |
| 30933 | + | |
| 30934 | + | |
| 30935 | + | |
| 30936 | + | |
| 30937 | + | |
| 30938 | + | |
| 30939 | + | |
| 30940 | + | |
| 30941 | + | |
| 30942 | + | |
| 30943 | + | |
| 30944 | + | |
| 30945 | + | |
| 30946 | + | |
| 30947 | + | |
| 30948 | + | |
| 30949 | + | |
| 30950 | + | |
| 30951 | + | |
| 30952 | + | |
| 30953 | + | |
| 30954 | + | |
| 30955 | + | |
| 30956 | + | |
| 30957 | + | |
| 30958 | + | |
| 30959 | + | |
| 30960 | + | |
| 30961 | + | |
| 30962 | + | |
| 30963 | + | |
| 30964 | + | |
| 30965 | + | |
| 30966 | + | |
| 30967 | + | |
30911 | 30968 | | |
30912 | 30969 | | |
30913 | 30970 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1684 | 1684 | | |
1685 | 1685 | | |
1686 | 1686 | | |
1687 | | - | |
1688 | | - | |
1689 | | - | |
1690 | | - | |
1691 | | - | |
1692 | | - | |
1693 | | - | |
1694 | | - | |
1695 | | - | |
| 1687 | + | |
| 1688 | + | |
| 1689 | + | |
| 1690 | + | |
| 1691 | + | |
| 1692 | + | |
| 1693 | + | |
| 1694 | + | |
1696 | 1695 | | |
1697 | 1696 | | |
1698 | 1697 | | |
| |||
1876 | 1875 | | |
1877 | 1876 | | |
1878 | 1877 | | |
1879 | | - | |
1880 | | - | |
1881 | | - | |
1882 | | - | |
1883 | | - | |
1884 | | - | |
1885 | | - | |
1886 | | - | |
1887 | | - | |
| 1878 | + | |
| 1879 | + | |
| 1880 | + | |
| 1881 | + | |
| 1882 | + | |
| 1883 | + | |
| 1884 | + | |
| 1885 | + | |
| 1886 | + | |
1888 | 1887 | | |
1889 | 1888 | | |
1890 | 1889 | | |
| |||
2232 | 2231 | | |
2233 | 2232 | | |
2234 | 2233 | | |
2235 | | - | |
2236 | | - | |
2237 | | - | |
2238 | | - | |
2239 | | - | |
2240 | | - | |
2241 | | - | |
2242 | | - | |
2243 | | - | |
2244 | | - | |
2245 | | - | |
2246 | | - | |
2247 | | - | |
2248 | | - | |
2249 | | - | |
2250 | | - | |
2251 | | - | |
2252 | | - | |
2253 | | - | |
2254 | | - | |
2255 | | - | |
2256 | | - | |
2257 | | - | |
2258 | | - | |
2259 | | - | |
2260 | | - | |
2261 | | - | |
2262 | | - | |
2263 | | - | |
2264 | | - | |
| 2234 | + | |
| 2235 | + | |
| 2236 | + | |
| 2237 | + | |
| 2238 | + | |
| 2239 | + | |
| 2240 | + | |
| 2241 | + | |
| 2242 | + | |
| 2243 | + | |
2265 | 2244 | | |
2266 | 2245 | | |
2267 | 2246 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
106 | 106 | | |
107 | 107 | | |
108 | 108 | | |
109 | | - | |
110 | | - | |
111 | | - | |
112 | | - | |
113 | | - | |
114 | | - | |
115 | | - | |
116 | | - | |
117 | | - | |
118 | | - | |
119 | | - | |
120 | | - | |
121 | | - | |
122 | | - | |
123 | | - | |
124 | | - | |
125 | | - | |
126 | | - | |
127 | | - | |
128 | | - | |
129 | | - | |
130 | | - | |
131 | | - | |
132 | | - | |
133 | | - | |
134 | | - | |
135 | | - | |
136 | | - | |
137 | | - | |
138 | | - | |
| 109 | + | |
| 110 | + | |
| 111 | + | |
| 112 | + | |
| 113 | + | |
| 114 | + | |
| 115 | + | |
| 116 | + | |
| 117 | + | |
| 118 | + | |
139 | 119 | | |
140 | 120 | | |
141 | 121 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
85 | 85 | | |
86 | 86 | | |
87 | 87 | | |
88 | | - | |
89 | | - | |
90 | | - | |
91 | | - | |
92 | | - | |
93 | | - | |
94 | | - | |
95 | | - | |
96 | | - | |
97 | | - | |
98 | | - | |
99 | | - | |
100 | | - | |
101 | | - | |
| 88 | + | |
| 89 | + | |
| 90 | + | |
| 91 | + | |
| 92 | + | |
| 93 | + | |
| 94 | + | |
| 95 | + | |
102 | 96 | | |
103 | 97 | | |
104 | 98 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
82 | 82 | | |
83 | 83 | | |
84 | 84 | | |
85 | | - | |
86 | | - | |
87 | | - | |
88 | | - | |
89 | | - | |
90 | | - | |
91 | | - | |
92 | | - | |
93 | | - | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
| 88 | + | |
| 89 | + | |
| 90 | + | |
| 91 | + | |
94 | 92 | | |
95 | | - | |
96 | | - | |
97 | | - | |
98 | 93 | | |
99 | 94 | | |
100 | 95 | | |
| |||
0 commit comments