@@ -2738,25 +2738,71 @@ impl Masm for MacroAssembler {
         dst: WritableReg,
         kind: V128ExtAddKind,
     ) -> Result<()> {
-        use V128ExtendKind::*;
-
         self.ensure_has_avx()?;
 
-        // The implementation for extadd is not optimized; for simplicity's sake, we simply perform
-        // an extension followed by an addition using already implemented primitives.
-        let (low_kind, high_kind) = match kind {
-            V128ExtAddKind::I8x16S => (LowI8x16S, HighI8x16S),
-            V128ExtAddKind::I8x16U => (LowI8x16U, HighI8x16U),
-            V128ExtAddKind::I16x8S => (LowI16x8S, HighI16x8S),
-            V128ExtAddKind::I16x8U => (LowI16x8U, HighI16x8U),
-        };
-
-        let tmp = regs::scratch_xmm();
+        match kind {
+            V128ExtAddKind::I8x16S => {
+                let scratch = regs::scratch_xmm();
+                // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
+                // sign extend `src` to 16 bits and add adjacent words.
+                // Need to supply constant as first operand since first operand
+                // is treated as unsigned and the second operand is signed.
+                let mask = self.asm.add_constant(&[1; 16]);
+                self.asm.xmm_mov_mr(
+                    &mask,
+                    writable!(scratch),
+                    OperandSize::S128,
+                    MemFlags::trusted(),
+                );
+                self.asm
+                    .xmm_vex_rr(AvxOpcode::Vpmaddubsw, scratch, src, dst);
+            }
+            V128ExtAddKind::I8x16U => {
+                // Same approach as the signed variant but treat `src` as
+                // unsigned instead of signed by passing it as the first
+                // operand.
+                let mask = self.asm.add_constant(&[1; 16]);
+                self.asm
+                    .xmm_vpmaddubs_rmr(src, &mask, dst, OperandSize::S16);
+            }
+            V128ExtAddKind::I16x8S => {
+                // Similar approach to the two variants above. The vector is 8
+                // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
+                // signed.
+                let mask = self
+                    .asm
+                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
+            }
+            V128ExtAddKind::I16x8U => {
+                // Similar approach as the signed variant.
+                // `vpmaddwd` operates on signed integers and the operand is
+                // unsigned, so the operand needs to be converted to a signed
+                // format and then that process needs to be reversed after
+                // `vpmaddwd`.
+                // Flip the sign bit for 8 16-bit lanes.
+                let xor_mask = self.asm.add_constant(&[
+                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+                    0x80, 0x00, 0x80,
+                ]);
+                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
 
-        self.v128_extend(src, writable!(tmp), low_kind)?;
-        self.v128_extend(src, dst, high_kind)?;
+                let madd_mask = self
+                    .asm
+                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
 
-        self.v128_add(src, dst.to_reg(), dst, kind.into())
+                // Reverse the XOR. The XOR effectively subtracts 32,768 from
+                // both pairs that are added together so 65,536 (0x10000)
+                // needs to be added to 4 lanes of 32-bit values.
+                let add_mask = self
+                    .asm
+                    .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
+                self.asm
+                    .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
+            }
+        }
+        Ok(())
     }
 
     fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
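For context on what the new lowering computes: `extadd_pairwise` sums adjacent lanes of the source vector into lanes of twice the width. The diff replaces the previous extend-then-add sequence with a single multiply-add against a vector of ones, since multiplying every lane by 1 and adding adjacent products is exactly the pairwise sum, with `vpmaddubsw`/`vpmaddwd` providing the widening. Below is a minimal scalar sketch of that equivalence for the signed `i8x16` case; it is plain Rust with no SIMD, and the function names are illustrative, not part of this commit or Winch's API.

```rust
/// Scalar model of `i16x8.extadd_pairwise_i8x16_s`: sum adjacent signed
/// 8-bit lanes into signed 16-bit lanes.
fn extadd_pairwise_i8x16_s(src: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for i in 0..8 {
        out[i] = src[2 * i] as i16 + src[2 * i + 1] as i16;
    }
    out
}

/// Scalar model of one `vpmaddubsw` lane pair: multiply an unsigned byte by a
/// signed byte, widen to 16 bits, and add the adjacent products (with signed
/// saturation, which a mask of all 1's can never trigger).
fn pmaddubsw_pair(unsigned: [u8; 2], signed: [i8; 2]) -> i16 {
    let p0 = unsigned[0] as i16 * signed[0] as i16;
    let p1 = unsigned[1] as i16 * signed[1] as i16;
    p0.saturating_add(p1)
}

fn main() {
    let src: [i8; 16] = [127, -128, 3, -4, 5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15, -16];
    // Mirror the lowering's operand order: the all-ones constant is the first
    // (unsigned) operand and `src` is the second (signed) operand.
    let mut via_pmadd = [0i16; 8];
    for i in 0..8 {
        via_pmadd[i] = pmaddubsw_pair([1, 1], [src[2 * i], src[2 * i + 1]]);
    }
    assert_eq!(via_pmadd, extadd_pairwise_i8x16_s(src));
    println!("pairwise sums: {:?}", via_pmadd);
}
```

The same reasoning applies to the `i16x8` signed case, where `vpmaddwd` treats both operands as signed, so no operand-order trick is needed.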
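The unsigned `i16x8` case is the only one that needs a correction step, because `vpmaddwd` multiplies signed 16-bit lanes. XORing a lane with 0x8000 maps an unsigned value `x` to the signed value `x - 32768`; the pairwise multiply-add by 1 then yields `(a - 32768) + (b - 32768) = a + b - 65536`, so adding 0x10000 to each 32-bit lane restores the unsigned pairwise sum. A scalar sketch of that bias correction, again with illustrative names only:

```rust
/// Scalar model of the unsigned i16x8 extadd lowering: flip the sign bit,
/// do a signed pairwise sum, then undo the bias in the widened lane.
fn extadd_pairwise_u_via_bias(a: u16, b: u16) -> u32 {
    // XOR with 0x8000 reinterprets an unsigned lane `x` as the signed value
    // `x - 32768` (same bit pattern, different interpretation).
    let a_biased = (a ^ 0x8000) as i16;
    let b_biased = (b ^ 0x8000) as i16;
    // `vpmaddwd` with a mask of 1's: widen to 32 bits and add the pair.
    let sum = a_biased as i32 + b_biased as i32; // == a + b - 65536
    // Add 0x10000 to the 32-bit lane to cancel the two 32,768 biases.
    (sum + 0x10000) as u32
}

fn main() {
    for &(a, b) in &[(0u16, 0u16), (1, 2), (65535, 65535), (32768, 12345)] {
        assert_eq!(extadd_pairwise_u_via_bias(a, b), a as u32 + b as u32);
    }
    println!("bias-corrected pairwise sums match the plain unsigned sums");
}
```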