@@ -842,3 +842,110 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_
842842 %5 = shufflevector <64 x i8 > %3 , <64 x i8 > %4 , <64 x i32 > <i32 0 , i32 2 , i32 4 , i32 6 , i32 8 , i32 10 , i32 12 , i32 14 , i32 64 , i32 66 , i32 68 , i32 70 , i32 72 , i32 74 , i32 76 , i32 78 , i32 16 , i32 18 , i32 20 , i32 22 , i32 24 , i32 26 , i32 28 , i32 30 , i32 80 , i32 82 , i32 84 , i32 86 , i32 88 , i32 90 , i32 92 , i32 94 , i32 32 , i32 34 , i32 36 , i32 38 , i32 40 , i32 42 , i32 44 , i32 46 , i32 96 , i32 98 , i32 100 , i32 102 , i32 104 , i32 106 , i32 108 , i32 110 , i32 48 , i32 50 , i32 52 , i32 54 , i32 56 , i32 58 , i32 60 , i32 62 , i32 112 , i32 114 , i32 116 , i32 118 , i32 120 , i32 122 , i32 124 , i32 126 >
843843 ret <64 x i8 > %5
844844}
845+
; PR54562 (register form): two chained <64 x i8> shuffles — an expanding
; interleave (48 source bytes spread into four 12-byte groups, each padded
; with 4 undef lanes) followed by an adjacent-byte pair pattern over the
; expanded vector. With AVX512VBMI the whole thing folds to a single vpermb;
; other AVX512 targets need a vpermq/vpalignr/vpshufb/insert sequence.
; NOTE(review): the ; AVX512*-LABEL/-NEXT lines below are auto-generated
; FileCheck assertions (update_llc_test_checks.py style) — regenerate them
; with the script rather than editing by hand.
846+ define <64 x i8 > @PR54562_ref (<64 x i8 > %a0 ) {
847+ ; AVX512F-LABEL: PR54562_ref:
848+ ; AVX512F: # %bb.0:
849+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
850+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
851+ ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
852+ ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
853+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
854+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
855+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
856+ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
857+ ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
858+ ; AVX512F-NEXT: retq
859+ ;
860+ ; AVX512BW-LABEL: PR54562_ref:
861+ ; AVX512BW: # %bb.0:
862+ ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
863+ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
864+ ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
865+ ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
866+ ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
867+ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
868+ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
869+ ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
870+ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
871+ ; AVX512BW-NEXT: retq
872+ ;
873+ ; AVX512DQ-LABEL: PR54562_ref:
874+ ; AVX512DQ: # %bb.0:
875+ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
876+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
877+ ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
878+ ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
879+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
880+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
881+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
882+ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
883+ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
884+ ; AVX512DQ-NEXT: retq
885+ ;
886+ ; AVX512VBMI-LABEL: PR54562_ref:
887+ ; AVX512VBMI: # %bb.0:
888+ ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46]
889+ ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
890+ ; AVX512VBMI-NEXT: retq
; shuffle1 mask: 0..11,4x undef, 12..23,4x undef, 24..35,4x undef, 36..47,4x undef.
891+ %shuffle1 = shufflevector <64 x i8 > %a0 , <64 x i8 > poison, <64 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 undef , i32 undef , i32 undef , i32 undef , i32 12 , i32 13 , i32 14 , i32 15 , i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 undef , i32 undef , i32 undef , i32 undef , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 , i32 32 , i32 33 , i32 34 , i32 35 , i32 undef , i32 undef , i32 undef , i32 undef , i32 36 , i32 37 , i32 38 , i32 39 , i32 40 , i32 41 , i32 42 , i32 43 , i32 44 , i32 45 , i32 46 , i32 47 , i32 undef , i32 undef , i32 undef , i32 undef >
; shuffle2 mask: repeating adjacent-byte pairs (1,0,2,1,...) within each 16-lane chunk,
; skipping the undef-padded tail lanes of shuffle1.
892+ %shuffle2 = shufflevector <64 x i8 > %shuffle1 , <64 x i8 > poison, <64 x i32 > <i32 1 , i32 0 , i32 2 , i32 1 , i32 4 , i32 3 , i32 5 , i32 4 , i32 7 , i32 6 , i32 8 , i32 7 , i32 10 , i32 9 , i32 11 , i32 10 , i32 17 , i32 16 , i32 18 , i32 17 , i32 20 , i32 19 , i32 21 , i32 20 , i32 23 , i32 22 , i32 24 , i32 23 , i32 26 , i32 25 , i32 27 , i32 26 , i32 33 , i32 32 , i32 34 , i32 33 , i32 36 , i32 35 , i32 37 , i32 36 , i32 39 , i32 38 , i32 40 , i32 39 , i32 42 , i32 41 , i32 43 , i32 42 , i32 49 , i32 48 , i32 50 , i32 49 , i32 52 , i32 51 , i32 53 , i32 52 , i32 55 , i32 54 , i32 56 , i32 55 , i32 58 , i32 57 , i32 59 , i32 58 >
893+ ret <64 x i8 > %shuffle2
894+ }
895+
; PR54562 (memory form): same two-stage <64 x i8> shuffle as PR54562_ref, but
; with the source loaded from %src and the result stored to %dst (align 512),
; so the lowering can fold loads and split the stores per target. AVX512VBMI
; again collapses to a single vpermb on a memory operand.
; NOTE(review): the ; AVX512*-LABEL/-NEXT lines below are auto-generated
; FileCheck assertions (update_llc_test_checks.py style) — regenerate them
; with the script rather than editing by hand.
896+ define void @PR54562_mem (<64 x i8 >* %src , <64 x i8 >* %dst ) {
897+ ; AVX512F-LABEL: PR54562_mem:
898+ ; AVX512F: # %bb.0:
899+ ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0
900+ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
901+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
902+ ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
903+ ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
904+ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
905+ ; AVX512F-NEXT: vmovdqa %xmm0, 48(%rsi)
906+ ; AVX512F-NEXT: vmovdqa %xmm1, 32(%rsi)
907+ ; AVX512F-NEXT: vmovdqa %ymm2, (%rsi)
908+ ; AVX512F-NEXT: vzeroupper
909+ ; AVX512F-NEXT: retq
910+ ;
911+ ; AVX512BW-LABEL: PR54562_mem:
912+ ; AVX512BW: # %bb.0:
913+ ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,1,2]
914+ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
915+ ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
916+ ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
917+ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
918+ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
919+ ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
920+ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
921+ ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
922+ ; AVX512BW-NEXT: vzeroupper
923+ ; AVX512BW-NEXT: retq
924+ ;
925+ ; AVX512DQ-LABEL: PR54562_mem:
926+ ; AVX512DQ: # %bb.0:
927+ ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
928+ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
929+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
930+ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
931+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
932+ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
933+ ; AVX512DQ-NEXT: vmovdqa %xmm0, 48(%rsi)
934+ ; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rsi)
935+ ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi)
936+ ; AVX512DQ-NEXT: vzeroupper
937+ ; AVX512DQ-NEXT: retq
938+ ;
939+ ; AVX512VBMI-LABEL: PR54562_mem:
940+ ; AVX512VBMI: # %bb.0:
941+ ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,13,12,14,13,16,15,17,16,19,18,20,19,22,21,23,22,25,24,26,25,28,27,29,28,31,30,32,31,34,33,35,34,37,36,38,37,40,39,41,40,43,42,44,43,46,45,47,46]
942+ ; AVX512VBMI-NEXT: vpermb (%rdi), %zmm0, %zmm0
943+ ; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsi)
944+ ; AVX512VBMI-NEXT: vzeroupper
945+ ; AVX512VBMI-NEXT: retq
946+ %load = load <64 x i8 >, <64 x i8 >* %src , align 512
; Same masks as PR54562_ref: expand 48 bytes into four undef-padded 16-lane
; groups, then take repeating adjacent-byte pairs from each group.
947+ %shuffle1 = shufflevector <64 x i8 > %load , <64 x i8 > poison, <64 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 undef , i32 undef , i32 undef , i32 undef , i32 12 , i32 13 , i32 14 , i32 15 , i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 undef , i32 undef , i32 undef , i32 undef , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 , i32 32 , i32 33 , i32 34 , i32 35 , i32 undef , i32 undef , i32 undef , i32 undef , i32 36 , i32 37 , i32 38 , i32 39 , i32 40 , i32 41 , i32 42 , i32 43 , i32 44 , i32 45 , i32 46 , i32 47 , i32 undef , i32 undef , i32 undef , i32 undef >
948+ %shuffle2 = shufflevector <64 x i8 > %shuffle1 , <64 x i8 > poison, <64 x i32 > <i32 1 , i32 0 , i32 2 , i32 1 , i32 4 , i32 3 , i32 5 , i32 4 , i32 7 , i32 6 , i32 8 , i32 7 , i32 10 , i32 9 , i32 11 , i32 10 , i32 17 , i32 16 , i32 18 , i32 17 , i32 20 , i32 19 , i32 21 , i32 20 , i32 23 , i32 22 , i32 24 , i32 23 , i32 26 , i32 25 , i32 27 , i32 26 , i32 33 , i32 32 , i32 34 , i32 33 , i32 36 , i32 35 , i32 37 , i32 36 , i32 39 , i32 38 , i32 40 , i32 39 , i32 42 , i32 41 , i32 43 , i32 42 , i32 49 , i32 48 , i32 50 , i32 49 , i32 52 , i32 51 , i32 53 , i32 52 , i32 55 , i32 54 , i32 56 , i32 55 , i32 58 , i32 57 , i32 59 , i32 58 >
949+ store <64 x i8 > %shuffle2 , <64 x i8 >* %dst , align 512
950+ ret void
951+ }
0 commit comments