@@ -610,230 +610,56 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
610610 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
611611 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
612612
613- assert_eq ! ( a. layout( ) , b. layout( ) ) ;
614- let layout = a. layout ( ) ;
615-
616- let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
617- let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
618- assert_eq ! ( lane_ty, fx. tcx. types. i16 ) ;
619- assert_eq ! ( ret_lane_ty, fx. tcx. types. u8 ) ;
620- assert_eq ! ( lane_count * 2 , ret_lane_count) ;
621-
622- let zero = fx. bcx . ins ( ) . iconst ( types:: I16 , 0 ) ;
623- let max_u8 = fx. bcx . ins ( ) . iconst ( types:: I16 , 255 ) ;
624- let ret_lane_layout = fx. layout_of ( fx. tcx . types . u8 ) ;
625-
626- for idx in 0 ..lane_count {
627- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
628- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
629- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
630- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
631-
632- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
633- ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
634- }
613+ pack_instruction ( fx, a, b, ret, PackSize :: U8 , PackWidth :: Sse ) ;
614+ }
635615
636- for idx in 0 ..lane_count {
637- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
638- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
639- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
640- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
616+ "llvm.x86.sse2.packsswb.128" => {
617+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848
618+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
641619
642- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
643- ret. place_lane ( fx, lane_count + idx) . write_cvalue ( fx, res_lane) ;
644- }
620+ pack_instruction ( fx, a, b, ret, PackSize :: S8 , PackWidth :: Sse ) ;
645621 }
646622
647623 "llvm.x86.avx2.packuswb" => {
648624 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
649625 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
650626
651- assert_eq ! ( a. layout( ) , b. layout( ) ) ;
652- let layout = a. layout ( ) ;
653-
654- let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
655- let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
656- assert_eq ! ( lane_ty, fx. tcx. types. i16 ) ;
657- assert_eq ! ( ret_lane_ty, fx. tcx. types. u8 ) ;
658- assert_eq ! ( lane_count * 2 , ret_lane_count) ;
659-
660- let zero = fx. bcx . ins ( ) . iconst ( types:: I16 , 0 ) ;
661- let max_u8 = fx. bcx . ins ( ) . iconst ( types:: I16 , 255 ) ;
662- let ret_lane_layout = fx. layout_of ( fx. tcx . types . u8 ) ;
663-
664- for idx in 0 ..lane_count / 2 {
665- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
666- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
667- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
668- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
669-
670- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
671- ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
672- }
673-
674- for idx in 0 ..lane_count / 2 {
675- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
676- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
677- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
678- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
679-
680- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
681- ret. place_lane ( fx, lane_count / 2 + idx) . write_cvalue ( fx, res_lane) ;
682- }
683-
684- for idx in 0 ..lane_count / 2 {
685- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
686- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
687- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
688- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
689-
690- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
691- ret. place_lane ( fx, lane_count / 2 * 2 + idx) . write_cvalue ( fx, res_lane) ;
692- }
693-
694- for idx in 0 ..lane_count / 2 {
695- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
696- let sat = fx. bcx . ins ( ) . smax ( lane, zero) ;
697- let sat = fx. bcx . ins ( ) . umin ( sat, max_u8) ;
698- let res = fx. bcx . ins ( ) . ireduce ( types:: I8 , sat) ;
699-
700- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
701- ret. place_lane ( fx, lane_count / 2 * 3 + idx) . write_cvalue ( fx, res_lane) ;
702- }
627+ pack_instruction ( fx, a, b, ret, PackSize :: U8 , PackWidth :: Avx ) ;
703628 }
704629
705- "llvm.x86.sse2.packssdw.128 " => {
706- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 &ig_expand=4889
630+ "llvm.x86.avx2.packsswb " => {
631+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16 &ig_expand=4851
707632 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
708633
709- assert_eq ! ( a. layout( ) , b. layout( ) ) ;
710- let layout = a. layout ( ) ;
711-
712- let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
713- let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
714- assert_eq ! ( lane_ty, fx. tcx. types. i32 ) ;
715- assert_eq ! ( ret_lane_ty, fx. tcx. types. i16 ) ;
716- assert_eq ! ( lane_count * 2 , ret_lane_count) ;
717-
718- let min_i16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i32:: from ( i16:: MIN ) as u32 as i64 ) ;
719- let max_i16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i32:: from ( i16:: MAX ) as u32 as i64 ) ;
720- let ret_lane_layout = fx. layout_of ( fx. tcx . types . i16 ) ;
721-
722- for idx in 0 ..lane_count {
723- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
724- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
725- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
726- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
727-
728- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
729- ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
730- }
731-
732- for idx in 0 ..lane_count {
733- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
734- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
735- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
736- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
737-
738- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
739- ret. place_lane ( fx, lane_count + idx) . write_cvalue ( fx, res_lane) ;
740- }
634+ pack_instruction ( fx, a, b, ret, PackSize :: S8 , PackWidth :: Avx ) ;
741635 }
742636
743637 "llvm.x86.sse41.packusdw" => {
744638 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
745639 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
746640
747- assert_eq ! ( a. layout( ) , b. layout( ) ) ;
748- let layout = a. layout ( ) ;
749-
750- let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
751- let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
752- assert_eq ! ( lane_ty, fx. tcx. types. i32 ) ;
753- assert_eq ! ( ret_lane_ty, fx. tcx. types. u16 ) ;
754- assert_eq ! ( lane_count * 2 , ret_lane_count) ;
755-
756- let min_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MIN ) ) ;
757- let max_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MAX ) ) ;
758- let ret_lane_layout = fx. layout_of ( fx. tcx . types . u16 ) ;
641+ pack_instruction ( fx, a, b, ret, PackSize :: U16 , PackWidth :: Sse ) ;
642+ }
759643
760- for idx in 0 ..lane_count {
761- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
762- let sat = fx. bcx . ins ( ) . smax ( lane, min_u16) ;
763- let sat = fx. bcx . ins ( ) . smin ( sat, max_u16) ;
764- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
644+ "llvm.x86.sse2.packssdw.128" => {
645+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
646+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
765647
766- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
767- ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
768- }
648+ pack_instruction ( fx, a, b, ret, PackSize :: S16 , PackWidth :: Sse ) ;
649+ }
769650
770- for idx in 0 ..lane_count {
771- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
772- let sat = fx. bcx . ins ( ) . smax ( lane, min_u16) ;
773- let sat = fx. bcx . ins ( ) . smin ( sat, max_u16) ;
774- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
651+ "llvm.x86.avx2.packusdw" => {
652+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883
653+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
775654
776- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
777- ret. place_lane ( fx, lane_count + idx) . write_cvalue ( fx, res_lane) ;
778- }
655+ pack_instruction ( fx, a, b, ret, PackSize :: U16 , PackWidth :: Avx ) ;
779656 }
780657
781658 "llvm.x86.avx2.packssdw" => {
782659 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
783660 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
784661
785- assert_eq ! ( a. layout( ) , b. layout( ) ) ;
786- let layout = a. layout ( ) ;
787-
788- let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
789- let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
790- assert_eq ! ( lane_ty, fx. tcx. types. i32 ) ;
791- assert_eq ! ( ret_lane_ty, fx. tcx. types. i16 ) ;
792- assert_eq ! ( lane_count * 2 , ret_lane_count) ;
793-
794- let min_i16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i32:: from ( i16:: MIN ) as u32 as i64 ) ;
795- let max_i16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i32:: from ( i16:: MAX ) as u32 as i64 ) ;
796- let ret_lane_layout = fx. layout_of ( fx. tcx . types . i16 ) ;
797-
798- for idx in 0 ..lane_count / 2 {
799- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
800- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
801- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
802- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
803-
804- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
805- ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
806- }
807-
808- for idx in 0 ..lane_count / 2 {
809- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
810- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
811- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
812- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
813-
814- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
815- ret. place_lane ( fx, lane_count / 2 + idx) . write_cvalue ( fx, res_lane) ;
816- }
817-
818- for idx in 0 ..lane_count / 2 {
819- let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
820- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
821- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
822- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
823-
824- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
825- ret. place_lane ( fx, lane_count / 2 * 2 + idx) . write_cvalue ( fx, res_lane) ;
826- }
827-
828- for idx in 0 ..lane_count / 2 {
829- let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
830- let sat = fx. bcx . ins ( ) . smax ( lane, min_i16) ;
831- let sat = fx. bcx . ins ( ) . smin ( sat, max_i16) ;
832- let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
833-
834- let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
835- ret. place_lane ( fx, lane_count / 2 * 3 + idx) . write_cvalue ( fx, res_lane) ;
836- }
662+ pack_instruction ( fx, a, b, ret, PackSize :: S16 , PackWidth :: Avx ) ;
837663 }
838664
839665 "llvm.x86.fma.vfmaddsub.ps"
@@ -1407,3 +1233,115 @@ fn llvm_add_sub<'tcx>(
14071233
14081234 ( cb_out, c)
14091235}
1236+
1237+ enum PackSize {
1238+ U8 ,
1239+ U16 ,
1240+ S8 ,
1241+ S16 ,
1242+ }
1243+
1244+ impl PackSize {
1245+ fn ret_clif_type ( & self ) -> Type {
1246+ match self {
1247+ Self :: U8 | Self :: S8 => types:: I8 ,
1248+ Self :: U16 | Self :: S16 => types:: I16 ,
1249+ }
1250+ }
1251+ fn src_clif_type ( & self ) -> Type {
1252+ match self {
1253+ Self :: U8 | Self :: S8 => types:: I16 ,
1254+ Self :: U16 | Self :: S16 => types:: I32 ,
1255+ }
1256+ }
1257+ fn src_ty < ' tcx > ( & self , tcx : TyCtxt < ' tcx > ) -> Ty < ' tcx > {
1258+ match self {
1259+ Self :: U8 | Self :: S8 => tcx. types . i16 ,
1260+ Self :: U16 | Self :: S16 => tcx. types . i32 ,
1261+ }
1262+ }
1263+ fn ret_ty < ' tcx > ( & self , tcx : TyCtxt < ' tcx > ) -> Ty < ' tcx > {
1264+ match self {
1265+ Self :: U8 => tcx. types . u8 ,
1266+ Self :: S8 => tcx. types . i8 ,
1267+ Self :: U16 => tcx. types . u16 ,
1268+ Self :: S16 => tcx. types . i16 ,
1269+ }
1270+ }
1271+ fn max ( & self ) -> i64 {
1272+ match self {
1273+ Self :: U8 => u8:: MAX as u64 as i64 ,
1274+ Self :: S8 => i8:: MAX as u8 as u64 as i64 ,
1275+ Self :: U16 => u16:: MAX as u64 as i64 ,
1276+ Self :: S16 => i16:: MAX as u64 as u64 as i64 ,
1277+ }
1278+ }
1279+ fn min ( & self ) -> i64 {
1280+ match self {
1281+ Self :: U8 | Self :: U16 => 0 ,
1282+ Self :: S8 => i16:: from ( i8:: MIN ) as u16 as i64 ,
1283+ Self :: S16 => i32:: from ( i16:: MIN ) as u32 as i64 ,
1284+ }
1285+ }
1286+ }
1287+
1288+ enum PackWidth {
1289+ Sse = 1 ,
1290+ Avx = 2 ,
1291+ }
1292+ impl PackWidth {
1293+ fn divisor ( & self ) -> u64 {
1294+ match self {
1295+ Self :: Sse => 1 ,
1296+ Self :: Avx => 2 ,
1297+ }
1298+ }
1299+ }
1300+
1301+ /// Implement an x86 pack instruction with the intrinsic `_mm{,256}pack{us,s}_epi{16,32}`.
1302+ /// Validated for correctness against LLVM, see commit `c8f5d35508e062bd2d95e6c03429bfec831db6d3`.
1303+ fn pack_instruction < ' tcx > (
1304+ fx : & mut FunctionCx < ' _ , ' _ , ' tcx > ,
1305+ a : CValue < ' tcx > ,
1306+ b : CValue < ' tcx > ,
1307+ ret : CPlace < ' tcx > ,
1308+ ret_size : PackSize ,
1309+ width : PackWidth ,
1310+ ) {
1311+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
1312+ let layout = a. layout ( ) ;
1313+
1314+ let ( src_lane_count, src_lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
1315+ let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
1316+ assert_eq ! ( src_lane_ty, ret_size. src_ty( fx. tcx) ) ;
1317+ assert_eq ! ( ret_lane_ty, ret_size. ret_ty( fx. tcx) ) ;
1318+ assert_eq ! ( src_lane_count * 2 , ret_lane_count) ;
1319+
1320+ let min = fx. bcx . ins ( ) . iconst ( ret_size. src_clif_type ( ) , ret_size. min ( ) ) ;
1321+ let max = fx. bcx . ins ( ) . iconst ( ret_size. src_clif_type ( ) , ret_size. max ( ) ) ;
1322+ let ret_lane_layout = fx. layout_of ( ret_size. ret_ty ( fx. tcx ) ) ;
1323+
1324+ let mut round = |source : CValue < ' tcx > , source_offset : u64 , dest_offset : u64 | {
1325+ let step_amount = src_lane_count / width. divisor ( ) ;
1326+ let dest_offset = step_amount * dest_offset;
1327+ for idx in 0 ..step_amount {
1328+ let lane = source. value_lane ( fx, step_amount * source_offset + idx) . load_scalar ( fx) ;
1329+ let sat = fx. bcx . ins ( ) . smax ( lane, min) ;
1330+ let sat = match ret_size {
1331+ PackSize :: U8 | PackSize :: U16 => fx. bcx . ins ( ) . umin ( sat, max) ,
1332+ PackSize :: S8 | PackSize :: S16 => fx. bcx . ins ( ) . smin ( sat, max) ,
1333+ } ;
1334+ let res = fx. bcx . ins ( ) . ireduce ( ret_size. ret_clif_type ( ) , sat) ;
1335+ let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
1336+ ret. place_lane ( fx, dest_offset + idx) . write_cvalue ( fx, res_lane) ;
1337+ }
1338+ } ;
1339+
1340+ round ( a, 0 , 0 ) ;
1341+ round ( b, 0 , 1 ) ;
1342+
1343+ if let PackWidth :: Avx = width {
1344+ round ( a, 1 , 2 ) ;
1345+ round ( b, 1 , 3 ) ;
1346+ }
1347+ }
0 commit comments