 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
 
 target triple = "x86_64-unknown-unknown"
 
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %shuffle
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %tmp0 = bitcast <16 x i32> %a to <16 x float>
  %tmp1 = bitcast <16 x i32> %b to <16 x float>
  %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
 
 ; PR86076
 define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
-; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
-; ALL: # %bb.0:
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; FAST: # %bb.0:
+; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; FAST-NEXT: retq
  %v0 = insertelement <8 x float> poison, float %a0, i64 0
  %v1 = insertelement <8 x float> poison, float %a1, i64 0
  %sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 }
 
 define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <16 x i32> %shuffle
 }
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
 
 ; PR46249
 define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
-; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x i32> %1
 }
 
 define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
-; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x float> %1
 }
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
 }
 
 define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
-; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
-; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
+; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
+; FAST-NEXT: retq
  %1 = load <16 x float>, ptr %a1
  %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
  ret <16 x float> %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 
 ;FIXME: can do better with vpcompress
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; FAST-NEXT: retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
 }
 
 ;FIXME: can do better with vpcompress
 define <4 x i32> @test_v16i32_0_1_2_12(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vbroadcastss %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; SLOW-LABEL: test_v16i32_0_1_2_12:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
+; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SLOW-NEXT: vzeroupper
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: test_v16i32_0_1_2_12:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FAST-NEXT: vzeroupper
+; FAST-NEXT: retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
  ret <4 x i32> %res
 }
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
 }
 
 define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
-; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST: # %bb.0:
+; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <16 x float> %shuffle
 }