@@ -2738,25 +2738,71 @@ impl Masm for MacroAssembler {
         dst: WritableReg,
         kind: V128ExtAddKind,
     ) -> Result<()> {
-        use V128ExtendKind::*;
-
         self.ensure_has_avx()?;
 
-        // The implementation for extadd is not optimized; for simplicity's sake, we simply perform
-        // an extension followed by an addition using already implemented primitives.
-        let (low_kind, high_kind) = match kind {
-            V128ExtAddKind::I8x16S => (LowI8x16S, HighI8x16S),
-            V128ExtAddKind::I8x16U => (LowI8x16U, HighI8x16U),
-            V128ExtAddKind::I16x8S => (LowI16x8S, HighI16x8S),
-            V128ExtAddKind::I16x8U => (LowI16x8U, HighI16x8U),
-        };
-
-        let tmp = regs::scratch_xmm();
+        match kind {
+            V128ExtAddKind::I8x16S => {
+                let scratch = regs::scratch_xmm();
+                // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
+                // sign extend `src` to 16 bits and add adjacent words.
+                // Need to supply constant as first operand since first operand
+                // is treated as unsigned and the second operand is signed.
+                let mask = self.asm.add_constant(&[1; 16]);
+                self.asm.xmm_mov_mr(
+                    &mask,
+                    writable!(scratch),
+                    OperandSize::S128,
+                    MemFlags::trusted(),
+                );
+                self.asm
+                    .xmm_vex_rr(AvxOpcode::Vpmaddubsw, scratch, src, dst);
+            }
+            V128ExtAddKind::I8x16U => {
+                // Same approach as the signed variant but treat `src` as
+                // unsigned instead of signed by passing it as the first
+                // operand.
+                let mask = self.asm.add_constant(&[1; 16]);
+                self.asm
+                    .xmm_vpmaddubs_rmr(src, &mask, dst, OperandSize::S16);
+            }
+            V128ExtAddKind::I16x8S => {
+                // Similar approach to the two variants above. The vector is 8
+                // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
+                // signed.
+                let mask = self
+                    .asm
+                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
+            }
+            V128ExtAddKind::I16x8U => {
+                // Similar approach as the signed variant.
+                // `vpmaddwd` operates on signed integers and the operand is
+                // unsigned, so the operand needs to be converted to a signed
+                // format and then that process needs to be reversed after
+                // `vpmaddwd`.
+                // Flip the sign bit for 8 16-bit lanes.
+                let xor_mask = self.asm.add_constant(&[
+                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+                    0x80, 0x00, 0x80,
+                ]);
+                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
 
-        self.v128_extend(src, writable!(tmp), low_kind)?;
-        self.v128_extend(src, dst, high_kind)?;
+                let madd_mask = self
+                    .asm
+                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
 
-        self.v128_add(src, dst.to_reg(), dst, kind.into())
+                // Reverse the XOR. The XOR effectively subtracts 32,768 from
+                // both pairs that are added together so 65,536 (0x10000)
+                // needs to be added to 4 lanes of 32-bit values.
+                let add_mask = self
+                    .asm
+                    .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
+                self.asm
+                    .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
+            }
+        }
+        Ok(())
     }
 
     fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
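For context on what the new lowering computes: `extadd_pairwise` sums adjacent lanes of the source vector into lanes of twice the width. The diff replaces the previous extend-then-add sequence with a single multiply-add against a vector of ones, since multiplying every lane by 1 and adding adjacent products is exactly the pairwise sum, with `vpmaddubsw`/`vpmaddwd` providing the widening. Below is a minimal scalar sketch of that equivalence for the signed `i8x16` case; it is plain Rust with no SIMD, and the function names are illustrative, not part of this commit or Winch's API.

```rust
/// Scalar model of `i16x8.extadd_pairwise_i8x16_s`: sum adjacent signed
/// 8-bit lanes into signed 16-bit lanes.
fn extadd_pairwise_i8x16_s(src: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for i in 0..8 {
        out[i] = src[2 * i] as i16 + src[2 * i + 1] as i16;
    }
    out
}

/// Scalar model of one `vpmaddubsw` lane pair: multiply an unsigned byte by a
/// signed byte, widen to 16 bits, and add the adjacent products (with signed
/// saturation, which a mask of all 1's can never trigger).
fn pmaddubsw_pair(unsigned: [u8; 2], signed: [i8; 2]) -> i16 {
    let p0 = unsigned[0] as i16 * signed[0] as i16;
    let p1 = unsigned[1] as i16 * signed[1] as i16;
    p0.saturating_add(p1)
}

fn main() {
    let src: [i8; 16] = [127, -128, 3, -4, 5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15, -16];
    // Mirror the lowering's operand order: the all-ones constant is the first
    // (unsigned) operand and `src` is the second (signed) operand.
    let mut via_pmadd = [0i16; 8];
    for i in 0..8 {
        via_pmadd[i] = pmaddubsw_pair([1, 1], [src[2 * i], src[2 * i + 1]]);
    }
    assert_eq!(via_pmadd, extadd_pairwise_i8x16_s(src));
    println!("pairwise sums: {:?}", via_pmadd);
}
```

The same reasoning applies to the `i16x8` signed case, where `vpmaddwd` treats both operands as signed, so no operand-order trick is needed.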
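The unsigned `i16x8` case is the only one that needs a correction step, because `vpmaddwd` multiplies signed 16-bit lanes. XORing a lane with 0x8000 maps an unsigned value `x` to the signed value `x - 32768`; the pairwise multiply-add by 1 then yields `(a - 32768) + (b - 32768) = a + b - 65536`, so adding 0x10000 to each 32-bit lane restores the unsigned pairwise sum. A scalar sketch of that bias correction, again with illustrative names only:

```rust
/// Scalar model of the unsigned i16x8 extadd lowering: flip the sign bit,
/// do a signed pairwise sum, then undo the bias in the widened lane.
fn extadd_pairwise_u_via_bias(a: u16, b: u16) -> u32 {
    // XOR with 0x8000 reinterprets an unsigned lane `x` as the signed value
    // `x - 32768` (same bit pattern, different interpretation).
    let a_biased = (a ^ 0x8000) as i16;
    let b_biased = (b ^ 0x8000) as i16;
    // `vpmaddwd` with a mask of 1's: widen to 32 bits and add the pair.
    let sum = a_biased as i32 + b_biased as i32; // == a + b - 65536
    // Add 0x10000 to the 32-bit lane to cancel the two 32,768 biases.
    (sum + 0x10000) as u32
}

fn main() {
    for &(a, b) in &[(0u16, 0u16), (1, 2), (65535, 65535), (32768, 12345)] {
        assert_eq!(extadd_pairwise_u_via_bias(a, b), a as u32 + b as u32);
    }
    println!("bias-corrected pairwise sums match the plain unsigned sums");
}
```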