@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
590590 }
591591 }
592592
593+ "llvm.x86.sse41.packusdw" => {
594+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
595+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
596+
597+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
598+ let layout = a. layout ( ) ;
599+
600+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
601+ let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
602+ assert_eq ! ( lane_ty, fx. tcx. types. i32 ) ;
603+ assert_eq ! ( ret_lane_ty, fx. tcx. types. u16 ) ;
604+ assert_eq ! ( lane_count * 2 , ret_lane_count) ;
605+
606+ let min_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MIN ) ) ;
607+ let max_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MAX ) ) ;
608+ let ret_lane_layout = fx. layout_of ( fx. tcx . types . u16 ) ;
609+
610+ for idx in 0 ..lane_count {
611+ let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
612+ let sat = fx. bcx . ins ( ) . umax ( lane, min_u16) ;
613+ let sat = fx. bcx . ins ( ) . umin ( sat, max_u16) ;
614+ let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
615+
616+ let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
617+ ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
618+ }
619+
620+ for idx in 0 ..lane_count {
621+ let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
622+ let sat = fx. bcx . ins ( ) . umax ( lane, min_u16) ;
623+ let sat = fx. bcx . ins ( ) . umin ( sat, max_u16) ;
624+ let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
625+
626+ let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
627+ ret. place_lane ( fx, lane_count + idx) . write_cvalue ( fx, res_lane) ;
628+ }
629+ }
630+
593631 "llvm.x86.avx2.packssdw" => {
594632 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
595633 intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
648686 }
649687 }
650688
689+ "llvm.x86.pclmulqdq" => {
690+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
691+ intrinsic_args ! ( fx, args => ( a, b, imm8) ; intrinsic) ;
692+
693+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
694+ let layout = a. layout ( ) ;
695+
696+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
697+ let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
698+ assert_eq ! ( lane_ty, fx. tcx. types. i64 ) ;
699+ assert_eq ! ( ret_lane_ty, fx. tcx. types. i64 ) ;
700+ assert_eq ! ( lane_count, 2 ) ;
701+ assert_eq ! ( ret_lane_count, 2 ) ;
702+
703+ let imm8 = imm8. load_scalar ( fx) ;
704+
705+ let control0 = fx. bcx . ins ( ) . band_imm ( imm8, 0b0000_0001 ) ;
706+ let a_lane0 = a. value_lane ( fx, 0 ) . load_scalar ( fx) ;
707+ let a_lane1 = a. value_lane ( fx, 1 ) . load_scalar ( fx) ;
708+ let temp1 = fx. bcx . ins ( ) . select ( control0, a_lane1, a_lane0) ;
709+
710+ let control4 = fx. bcx . ins ( ) . band_imm ( imm8, 0b0001_0000 ) ;
711+ let b_lane0 = b. value_lane ( fx, 0 ) . load_scalar ( fx) ;
712+ let b_lane1 = b. value_lane ( fx, 1 ) . load_scalar ( fx) ;
713+ let temp2 = fx. bcx . ins ( ) . select ( control4, b_lane1, b_lane0) ;
714+
715+ fn extract_bit ( fx : & mut FunctionCx < ' _ , ' _ , ' _ > , val : Value , bit : i64 ) -> Value {
716+ let tmp = fx. bcx . ins ( ) . ushr_imm ( val, bit) ;
717+ fx. bcx . ins ( ) . band_imm ( tmp, 1 )
718+ }
719+
720+ let mut res1 = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
721+ for i in 0 ..=63 {
722+ let x = extract_bit ( fx, temp1, 0 ) ;
723+ let y = extract_bit ( fx, temp2, i) ;
724+ let mut temp = fx. bcx . ins ( ) . band ( x, y) ;
725+ for j in 1 ..=i {
726+ let x = extract_bit ( fx, temp1, j) ;
727+ let y = extract_bit ( fx, temp2, i - j) ;
728+ let z = fx. bcx . ins ( ) . band ( x, y) ;
729+ temp = fx. bcx . ins ( ) . bxor ( temp, z) ;
730+ }
731+ let temp = fx. bcx . ins ( ) . ishl_imm ( temp, i) ;
732+ res1 = fx. bcx . ins ( ) . bor ( res1, temp) ;
733+ }
734+ ret. place_lane ( fx, 0 ) . to_ptr ( ) . store ( fx, res1, MemFlags :: trusted ( ) ) ;
735+
736+ let mut res2 = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
737+ for i in 64 ..=127 {
738+ let mut temp = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
739+ for j in i - 63 ..=63 {
740+ let x = extract_bit ( fx, temp1, j) ;
741+ let y = extract_bit ( fx, temp2, i - j) ;
742+ let z = fx. bcx . ins ( ) . band ( x, y) ;
743+ temp = fx. bcx . ins ( ) . bxor ( temp, z) ;
744+ }
745+ let temp = fx. bcx . ins ( ) . ishl_imm ( temp, i) ;
746+ res2 = fx. bcx . ins ( ) . bor ( res2, temp) ;
747+ }
748+ ret. place_lane ( fx, 1 ) . to_ptr ( ) . store ( fx, res2, MemFlags :: trusted ( ) ) ;
749+ }
750+
751+ "llvm.x86.avx.ptestz.256" => {
752+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
753+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
754+
755+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
756+ let layout = a. layout ( ) ;
757+
758+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
759+ assert_eq ! ( lane_ty, fx. tcx. types. i64 ) ;
760+ assert_eq ! ( ret. layout( ) . ty, fx. tcx. types. i32 ) ;
761+ assert_eq ! ( lane_count, 4 ) ;
762+
763+ let a_lane0 = a. value_lane ( fx, 0 ) . load_scalar ( fx) ;
764+ let a_lane1 = a. value_lane ( fx, 1 ) . load_scalar ( fx) ;
765+ let a_lane2 = a. value_lane ( fx, 2 ) . load_scalar ( fx) ;
766+ let a_lane3 = a. value_lane ( fx, 3 ) . load_scalar ( fx) ;
767+ let b_lane0 = b. value_lane ( fx, 0 ) . load_scalar ( fx) ;
768+ let b_lane1 = b. value_lane ( fx, 1 ) . load_scalar ( fx) ;
769+ let b_lane2 = b. value_lane ( fx, 2 ) . load_scalar ( fx) ;
770+ let b_lane3 = b. value_lane ( fx, 3 ) . load_scalar ( fx) ;
771+
772+ let zero0 = fx. bcx . ins ( ) . band ( a_lane0, b_lane0) ;
773+ let zero1 = fx. bcx . ins ( ) . band ( a_lane1, b_lane1) ;
774+ let zero2 = fx. bcx . ins ( ) . band ( a_lane2, b_lane2) ;
775+ let zero3 = fx. bcx . ins ( ) . band ( a_lane3, b_lane3) ;
776+
777+ let all_zero0 = fx. bcx . ins ( ) . bor ( zero0, zero1) ;
778+ let all_zero1 = fx. bcx . ins ( ) . bor ( zero2, zero3) ;
779+ let all_zero = fx. bcx . ins ( ) . bor ( all_zero0, all_zero1) ;
780+
781+ let res = fx. bcx . ins ( ) . icmp_imm ( IntCC :: Equal , all_zero, 0 ) ;
782+ let res = CValue :: by_val (
783+ fx. bcx . ins ( ) . uextend ( types:: I32 , res) ,
784+ fx. layout_of ( fx. tcx . types . i32 ) ,
785+ ) ;
786+ ret. write_cvalue ( fx, res) ;
787+ }
788+
651789 _ => {
652790 fx. tcx
653791 . sess
0 commit comments