Skip to content

Commit c56ff13

Browse files
authored
Winch: Fix extadd implementations (#10337)
* Winch: Fix extadd implementations
* Add new Wast test to excluded tests
1 parent e929b63 commit c56ff13

File tree

9 files changed

+248
-54
lines changed

9 files changed

+248
-54
lines changed

crates/wast-util/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ impl WastTest {
438438
"misc_testsuite/simd/issue6725-no-egraph-panic.wast",
439439
"misc_testsuite/simd/replace-lane-preserve.wast",
440440
"misc_testsuite/simd/spillslot-size-fuzzbug.wast",
441+
"misc_testsuite/winch/issue-10331.wast",
441442
"spec_testsuite/simd_align.wast",
442443
"spec_testsuite/simd_boolean.wast",
443444
"spec_testsuite/simd_conversions.wast",

tests/disas/winch/x64/i16x8/extadd/extadd_s.wat

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,25 @@
1414
;; movq 0x10(%r11), %r11
1515
;; addq $0x20, %r11
1616
;; cmpq %rsp, %r11
17-
;; ja 0x51
17+
;; ja 0x4b
1818
;; 1c: movq %rdi, %r14
1919
;; subq $0x20, %rsp
2020
;; movq %rdi, 0x18(%rsp)
2121
;; movq %rsi, 0x10(%rsp)
2222
;; movdqu %xmm0, (%rsp)
2323
;; movdqu (%rsp), %xmm0
24-
;; vpmovsxbw %xmm0, %xmm15
25-
;; vpalignr $8, %xmm0, %xmm0, %xmm0
26-
;; vpmovsxbw %xmm0, %xmm0
27-
;; vpaddw %xmm0, %xmm0, %xmm0
24+
;; movdqu 0x10(%rip), %xmm15
25+
;; vpmaddubsw %xmm0, %xmm15, %xmm0
2826
;; addq $0x20, %rsp
2927
;; popq %rbp
3028
;; retq
31-
;; 51: ud2
29+
;; 4b: ud2
30+
;; 4d: addb %al, (%rax)
31+
;; 4f: addb %al, (%rcx)
32+
;; 51: addl %eax, (%rcx)
33+
;; 53: addl %eax, (%rcx)
34+
;; 55: addl %eax, (%rcx)
35+
;; 57: addl %eax, (%rcx)
36+
;; 59: addl %eax, (%rcx)
37+
;; 5b: addl %eax, (%rcx)
38+
;; 5d: addl %eax, (%rcx)

tests/disas/winch/x64/i16x8/extadd/extadd_u.wat

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,27 @@
1414
;; movq 0x10(%r11), %r11
1515
;; addq $0x20, %r11
1616
;; cmpq %rsp, %r11
17-
;; ja 0x50
17+
;; ja 0x46
1818
;; 1c: movq %rdi, %r14
1919
;; subq $0x20, %rsp
2020
;; movq %rdi, 0x18(%rsp)
2121
;; movq %rsi, 0x10(%rsp)
2222
;; movdqu %xmm0, (%rsp)
2323
;; movdqu (%rsp), %xmm0
24-
;; vpmovzxbw %xmm0, %xmm15
25-
;; vpxor %xmm15, %xmm15, %xmm15
26-
;; vpunpckhbw %xmm15, %xmm0, %xmm0
27-
;; vpaddw %xmm0, %xmm0, %xmm0
24+
;; vpmaddubsw 0x10(%rip), %xmm0, %xmm0
2825
;; addq $0x20, %rsp
2926
;; popq %rbp
3027
;; retq
31-
;; 50: ud2
28+
;; 46: ud2
29+
;; 48: addb %al, (%rax)
30+
;; 4a: addb %al, (%rax)
31+
;; 4c: addb %al, (%rax)
32+
;; 4e: addb %al, (%rax)
33+
;; 50: addl %eax, (%rcx)
34+
;; 52: addl %eax, (%rcx)
35+
;; 54: addl %eax, (%rcx)
36+
;; 56: addl %eax, (%rcx)
37+
;; 58: addl %eax, (%rcx)
38+
;; 5a: addl %eax, (%rcx)
39+
;; 5c: addl %eax, (%rcx)
40+
;; 5e: addl %eax, (%rcx)

tests/disas/winch/x64/i32x4/extadd/extadd_s.wat

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,27 @@
1414
;; movq 0x10(%r11), %r11
1515
;; addq $0x20, %r11
1616
;; cmpq %rsp, %r11
17-
;; ja 0x51
17+
;; ja 0x45
1818
;; 1c: movq %rdi, %r14
1919
;; subq $0x20, %rsp
2020
;; movq %rdi, 0x18(%rsp)
2121
;; movq %rsi, 0x10(%rsp)
2222
;; movdqu %xmm0, (%rsp)
2323
;; movdqu (%rsp), %xmm0
24-
;; vpmovsxwd %xmm0, %xmm15
25-
;; vpalignr $8, %xmm0, %xmm0, %xmm0
26-
;; vpmovsxwd %xmm0, %xmm0
27-
;; vpaddd %xmm0, %xmm0, %xmm0
24+
;; vpmaddwd 0x11(%rip), %xmm0, %xmm0
2825
;; addq $0x20, %rsp
2926
;; popq %rbp
3027
;; retq
31-
;; 51: ud2
28+
;; 45: ud2
29+
;; 47: addb %al, (%rax)
30+
;; 49: addb %al, (%rax)
31+
;; 4b: addb %al, (%rax)
32+
;; 4d: addb %al, (%rax)
33+
;; 4f: addb %al, (%rcx)
34+
;; 51: addb %al, (%rcx)
35+
;; 53: addb %al, (%rcx)
36+
;; 55: addb %al, (%rcx)
37+
;; 57: addb %al, (%rcx)
38+
;; 59: addb %al, (%rcx)
39+
;; 5b: addb %al, (%rcx)
40+
;; 5d: addb %al, (%rcx)

tests/disas/winch/x64/i32x4/extadd/extadd_u.wat

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,41 @@
1414
;; movq 0x10(%r11), %r11
1515
;; addq $0x20, %r11
1616
;; cmpq %rsp, %r11
17-
;; ja 0x50
17+
;; ja 0x55
1818
;; 1c: movq %rdi, %r14
1919
;; subq $0x20, %rsp
2020
;; movq %rdi, 0x18(%rsp)
2121
;; movq %rsi, 0x10(%rsp)
2222
;; movdqu %xmm0, (%rsp)
2323
;; movdqu (%rsp), %xmm0
24-
;; vpmovzxwd %xmm0, %xmm15
25-
;; vpxor %xmm15, %xmm15, %xmm15
26-
;; vpunpckhwd %xmm15, %xmm0, %xmm0
27-
;; vpaddd %xmm0, %xmm0, %xmm0
24+
;; vpxor 0x21(%rip), %xmm0, %xmm0
25+
;; vpmaddwd 0x29(%rip), %xmm0, %xmm0
26+
;; vpaddd 0x31(%rip), %xmm0, %xmm0
2827
;; addq $0x20, %rsp
2928
;; popq %rbp
3029
;; retq
31-
;; 50: ud2
30+
;; 55: ud2
31+
;; 57: addb %al, (%rax)
32+
;; 59: addb %al, (%rax)
33+
;; 5b: addb %al, (%rax)
34+
;; 5d: addb %al, (%rax)
35+
;; 5f: addb %al, (%rax)
36+
;; 61: addb $0x80, (%rax)
37+
;; 64: addb %al, -0x7fff8000(%rax)
38+
;; 6a: addb %al, -0x7fff8000(%rax)
39+
;; 70: addl %eax, (%rax)
40+
;; 72: addl %eax, (%rax)
41+
;; 74: addl %eax, (%rax)
42+
;; 76: addl %eax, (%rax)
43+
;; 78: addl %eax, (%rax)
44+
;; 7a: addl %eax, (%rax)
45+
;; 7c: addl %eax, (%rax)
46+
;; 7e: addl %eax, (%rax)
47+
;; 80: addb %al, (%rax)
48+
;; 82: addl %eax, (%rax)
49+
;; 84: addb %al, (%rax)
50+
;; 86: addl %eax, (%rax)
51+
;; 88: addb %al, (%rax)
52+
;; 8a: addl %eax, (%rax)
53+
;; 8c: addb %al, (%rax)
54+
;; 8e: addl %eax, (%rax)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
;;! simd = true
2+
3+
;; See https://github.com/bytecodealliance/wasmtime/issues/10331
4+
5+
(module
6+
(func (export "test") (result v128)
7+
v128.const i8x16 0 128 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8+
call 1
9+
)
10+
(func (param v128) (result v128)
11+
local.get 0
12+
i16x8.extadd_pairwise_i8x16_s
13+
)
14+
)
15+
16+
(assert_return (invoke "test") (v128.const i16x8 65408 0 0 0 0 0 0 0))

winch/codegen/src/isa/x64/asm.rs

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1818,17 +1818,43 @@ impl Assembler {
18181818
})
18191819
}
18201820

1821-
/// Adds vectors of integers in `src1` and `src2` and puts the results in
1822-
/// `dst`.
1823-
pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
1824-
let op = match size {
1821+
/// Converts an operand size to the appropriate opcode for `vpadd`.
1822+
fn xmm_vpadd_opcode(size: OperandSize) -> AvxOpcode {
1823+
match size {
18251824
OperandSize::S8 => AvxOpcode::Vpaddb,
18261825
OperandSize::S32 => AvxOpcode::Vpaddd,
18271826
_ => unimplemented!(),
1828-
};
1827+
}
1828+
}
1829+
1830+
pub fn xmm_vpadd_rmr(
1831+
&mut self,
1832+
src1: Reg,
1833+
src2: &Address,
1834+
dst: WritableReg,
1835+
size: OperandSize,
1836+
) {
1837+
let address = Self::to_synthetic_amode(
1838+
src2,
1839+
&mut self.pool,
1840+
&mut self.constants,
1841+
&mut self.buffer,
1842+
MemFlags::trusted(),
1843+
);
18291844

18301845
self.emit(Inst::XmmRmiRVex {
1831-
op,
1846+
op: Self::xmm_vpadd_opcode(size),
1847+
src1: src1.into(),
1848+
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
1849+
dst: dst.to_reg().into(),
1850+
});
1851+
}
1852+
1853+
/// Adds vectors of integers in `src1` and `src2` and puts the results in
1854+
/// `dst`.
1855+
pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
1856+
self.emit(Inst::XmmRmiRVex {
1857+
op: Self::xmm_vpadd_opcode(size),
18321858
src1: src1.into(),
18331859
src2: src2.into(),
18341860
dst: dst.to_reg().into(),
@@ -2914,6 +2940,72 @@ impl Assembler {
29142940
dst: dst.to_reg().into(),
29152941
});
29162942
}
2943+
2944+
/// Multiply and add packed signed and unsigned bytes.
2945+
pub fn xmm_vpmaddubs_rmr(
2946+
&mut self,
2947+
src: Reg,
2948+
address: &Address,
2949+
dst: WritableReg,
2950+
size: OperandSize,
2951+
) {
2952+
let address = Self::to_synthetic_amode(
2953+
address,
2954+
&mut self.pool,
2955+
&mut self.constants,
2956+
&mut self.buffer,
2957+
MemFlags::trusted(),
2958+
);
2959+
2960+
let op = match size {
2961+
OperandSize::S16 => AvxOpcode::Vpmaddubsw,
2962+
_ => unimplemented!(),
2963+
};
2964+
2965+
self.emit(Inst::XmmRmiRVex {
2966+
op,
2967+
src1: src.into(),
2968+
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
2969+
dst: dst.to_reg().into(),
2970+
});
2971+
}
2972+
2973+
/// Multiple and add packed integers.
2974+
pub fn xmm_vpmaddwd_rmr(&mut self, src: Reg, address: &Address, dst: WritableReg) {
2975+
let address = Self::to_synthetic_amode(
2976+
address,
2977+
&mut self.pool,
2978+
&mut self.constants,
2979+
&mut self.buffer,
2980+
MemFlags::trusted(),
2981+
);
2982+
2983+
self.emit(Inst::XmmRmiRVex {
2984+
op: AvxOpcode::Vpmaddwd,
2985+
src1: src.into(),
2986+
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
2987+
dst: dst.to_reg().into(),
2988+
})
2989+
}
2990+
2991+
/// Perform a logical XOR on the vector in `src` and the value at `address` and put the
2992+
/// results in `dst`.
2993+
pub fn xmm_vpxor_rmr(&mut self, src: Reg, address: &Address, dst: WritableReg) {
2994+
let address = Self::to_synthetic_amode(
2995+
address,
2996+
&mut self.pool,
2997+
&mut self.constants,
2998+
&mut self.buffer,
2999+
MemFlags::trusted(),
3000+
);
3001+
3002+
self.emit(Inst::XmmRmiRVex {
3003+
op: AvxOpcode::Vpxor,
3004+
src1: src.into(),
3005+
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
3006+
dst: dst.to_reg().into(),
3007+
})
3008+
}
29173009
}
29183010

29193011
/// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted,

winch/codegen/src/isa/x64/masm.rs

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2738,25 +2738,71 @@ impl Masm for MacroAssembler {
27382738
dst: WritableReg,
27392739
kind: V128ExtAddKind,
27402740
) -> Result<()> {
2741-
use V128ExtendKind::*;
2742-
27432741
self.ensure_has_avx()?;
27442742

2745-
// The implementation for extadd is not optimized; for simplicity's sake, we simply perform
2746-
// an extension followed by an addition using already implemented primitives.
2747-
let (low_kind, high_kind) = match kind {
2748-
V128ExtAddKind::I8x16S => (LowI8x16S, HighI8x16S),
2749-
V128ExtAddKind::I8x16U => (LowI8x16U, HighI8x16U),
2750-
V128ExtAddKind::I16x8S => (LowI16x8S, HighI16x8S),
2751-
V128ExtAddKind::I16x8U => (LowI16x8U, HighI16x8U),
2752-
};
2753-
2754-
let tmp = regs::scratch_xmm();
2743+
match kind {
2744+
V128ExtAddKind::I8x16S => {
2745+
let scratch = regs::scratch_xmm();
2746+
// Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
2747+
// sign extend `src` to 16 bits and add adjacent words.
2748+
// Need to supply constant as first operand since first operand
2749+
// is treated as unsigned and the second operand is signed.
2750+
let mask = self.asm.add_constant(&[1; 16]);
2751+
self.asm.xmm_mov_mr(
2752+
&mask,
2753+
writable!(scratch),
2754+
OperandSize::S128,
2755+
MemFlags::trusted(),
2756+
);
2757+
self.asm
2758+
.xmm_vex_rr(AvxOpcode::Vpmaddubsw, scratch, src, dst);
2759+
}
2760+
V128ExtAddKind::I8x16U => {
2761+
// Same approach as the signed variant but treat `src` as
2762+
// unsigned instead of signed by passing it as the first
2763+
// operand.
2764+
let mask = self.asm.add_constant(&[1; 16]);
2765+
self.asm
2766+
.xmm_vpmaddubs_rmr(src, &mask, dst, OperandSize::S16);
2767+
}
2768+
V128ExtAddKind::I16x8S => {
2769+
// Similar approach to the two variants above. The vector is 8
2770+
// lanes of 16-bit 1's and `vpmaddwd` treats both operands as
2771+
// signed.
2772+
let mask = self
2773+
.asm
2774+
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
2775+
self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
2776+
}
2777+
V128ExtAddKind::I16x8U => {
2778+
// Similar approach as the signed variant.
2779+
// `vpmaddwd` operates on signed integers and the operand is
2780+
// unsigned so the operand needs to be converted to a signed
2781+
// format and then that process needs to be reversed after
2782+
// `vpmaddwd`.
2783+
// Flip the sign bit for 8 16-bit lanes.
2784+
let xor_mask = self.asm.add_constant(&[
2785+
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
2786+
0x80, 0x00, 0x80,
2787+
]);
2788+
self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
27552789

2756-
self.v128_extend(src, writable!(tmp), low_kind)?;
2757-
self.v128_extend(src, dst, high_kind)?;
2790+
let madd_mask = self
2791+
.asm
2792+
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
2793+
self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
27582794

2759-
self.v128_add(src, dst.to_reg(), dst, kind.into())
2795+
// Reverse the XOR. The XOR effectively subtracts 32,768 from
2796+
// both pairs that are added together so 65,536 (0x10000)
2797+
// needs to be added to 4 lanes of 32-bit values.
2798+
let add_mask = self
2799+
.asm
2800+
.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
2801+
self.asm
2802+
.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
2803+
}
2804+
}
2805+
Ok(())
27602806
}
27612807

27622808
fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {

winch/codegen/src/masm.rs

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -915,15 +915,6 @@ pub(crate) enum V128ExtAddKind {
915915
I16x8U,
916916
}
917917

918-
impl From<V128ExtAddKind> for V128AddKind {
919-
fn from(value: V128ExtAddKind) -> Self {
920-
match value {
921-
V128ExtAddKind::I8x16S | V128ExtAddKind::I8x16U => Self::I16x8,
922-
V128ExtAddKind::I16x8S | V128ExtAddKind::I16x8U => Self::I32x4,
923-
}
924-
}
925-
}
926-
927918
/// Kinds of vector extended multiplication supported by WebAssembly.
928919
#[derive(Debug, Clone, Copy)]
929920
pub(crate) enum V128ExtMulKind {

0 commit comments

Comments
 (0)