diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index dcdd9f82cde8e..bda9d4e624505 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1705,6 +1705,13 @@ class TargetTransformInfo { /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; + enum struct ReductionShuffle { SplitHalf, Pairwise }; + + /// \returns The shuffle sequence pattern used to expand the given reduction + /// intrinsic. + ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; + /// \returns the size cost of rematerializing a GlobalValue address relative /// to a stack reload. unsigned getGISelRematGlobalCost() const; @@ -2156,6 +2163,8 @@ class TargetTransformInfo::Concept { virtual bool preferEpilogueVectorization() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; + virtual ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; virtual unsigned getMinTripCountTailFoldingThreshold() const = 0; virtual bool enableScalableVectorization() const = 0; @@ -2898,6 +2907,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldExpandReduction(II); } + ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override { + return Impl.getPreferredExpandedReductionShuffle(II); + } + unsigned getGISelRematGlobalCost() const override { return Impl.getGISelRematGlobalCost(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 01624de190d51..c1eb6151440be 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -936,6 +936,11 @@ class TargetTransformInfoImplBase { bool 
shouldExpandReduction(const IntrinsicInst *II) const { return true; } + TTI::ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const { + return TTI::ReductionShuffle::SplitHalf; + } + unsigned getGISelRematGlobalCost() const { return 1; } unsigned getMinTripCountTailFoldingThreshold() const { return 0; } diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 1a878126aa082..b01a447f3c28b 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -15,6 +15,7 @@ #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/VectorBuilder.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -385,6 +386,7 @@ Value *getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, /// Generates a vector reduction using shufflevectors to reduce the value. /// Fast-math-flags are propagated using the IRBuilder's setting. Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op, + TargetTransformInfo::ReductionShuffle RS, RecurKind MinMaxKind = RecurKind::None); /// Create a target reduction of the given vector. 
The reduction operation diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c175d1737e54b..be4069bb3eabf 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1317,6 +1317,12 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { return TTIImpl->shouldExpandReduction(II); } +TargetTransformInfo::ReductionShuffle +TargetTransformInfo::getPreferredExpandedReductionShuffle( + const IntrinsicInst *II) const { + return TTIImpl->getPreferredExpandedReductionShuffle(II); +} + unsigned TargetTransformInfo::getGISelRematGlobalCost() const { return TTIImpl->getGISelRematGlobalCost(); } diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index 0b1504e51b1bb..d6778ec666cbe 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -59,6 +59,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { isa<FPMathOperator>(II) ? 
II->getFastMathFlags() : FastMathFlags{}; Intrinsic::ID ID = II->getIntrinsicID(); RecurKind RK = getMinMaxReductionRecurKind(ID); + TargetTransformInfo::ReductionShuffle RS = + TTI->getPreferredExpandedReductionShuffle(II); Value *Rdx = nullptr; IRBuilder<> Builder(II); @@ -79,7 +81,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { if (!isPowerOf2_32( cast<FixedVectorType>(Vec->getType())->getNumElements())) continue; - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx, "bin.rdx"); } @@ -112,7 +114,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { break; } unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } case Intrinsic::vector_reduce_add: @@ -127,7 +129,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { cast<FixedVectorType>(Vec->getType())->getNumElements())) continue; unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } case Intrinsic::vector_reduce_fmax: @@ -140,7 +142,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { !FMF.noNaNs()) continue; unsigned RdxOpcode = getArithmeticReductionInstruction(ID); - Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK); + Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK); break; } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 9a434d9b1db54..b109594811d97 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -94,6 +94,18 @@ 
WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return Cost; } +TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( + const IntrinsicInst *II) const { + + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::vector_reduce_fadd: + return TTI::ReductionShuffle::Pairwise; + } + return TTI::ReductionShuffle::SplitHalf; +} + bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { // Allow inlining only when the Callee has a subset of the Caller's diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index e10f0928ed531..269922cc3ea84 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -70,6 +70,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1); + TTI::ReductionShuffle + getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; /// @} bool areInlineCompatible(const Function *Caller, diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index ff93035ce0652..4609376a748f9 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1077,7 +1077,9 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. 
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, - unsigned Op, RecurKind RdxKind) { + unsigned Op, + TargetTransformInfo::ReductionShuffle RS, + RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each @@ -1091,18 +1093,10 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, // will never be relevant here. Note that it would be generally unsound to // propagate these from an intrinsic call to the expansion anyways as we/ // change the order of operations. - Value *TmpVec = Src; - SmallVector<int, 32> ShuffleMask(VF); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = i / 2 + j; - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1); - + auto BuildShuffledOp = [&Builder, &Op, + &RdxKind](SmallVectorImpl<int> &ShuffleMask, + Value *&TmpVec) -> void { Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf"); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) { TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); @@ -1111,6 +1105,30 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, "Invalid min/max"); TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf); } + }; + + Value *TmpVec = Src; + if (TargetTransformInfo::ReductionShuffle::Pairwise == RS) { + SmallVector<int, 32> ShuffleMask(VF); + for (unsigned stride = 1; stride < VF; stride <<= 1) { + // Initialise the mask with undef. 
+ std::fill(ShuffleMask.begin(), ShuffleMask.end(), -1); + for (unsigned j = 0; j < VF; j += stride << 1) { + ShuffleMask[j] = j + stride; + } + BuildShuffledOp(ShuffleMask, TmpVec); + } + } else { + SmallVector<int, 32> ShuffleMask(VF); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = i / 2 + j; + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1); + BuildShuffledOp(ShuffleMask, TmpVec); + } } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll new file mode 100644 index 0000000000000..4b1a1a8a0c5b6 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll @@ -0,0 +1,1074 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128 + +define i64 @pairwise_add_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_add_v2i64: +; SIMD128: .functype pairwise_add_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: i64x2.add $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_add_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_add_v4i32: +; SIMD128: .functype pairwise_add_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; 
SIMD128-NEXT: i32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_add_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_add_v8i16: +; SIMD128: .functype pairwise_add_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.add $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_add_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_add_v16i8: +; SIMD128: .functype pairwise_add_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push7=, $pop8, $pop2 +; 
SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.add $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_mul_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_mul_v2i64: +; SIMD128: .functype pairwise_mul_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: i64x2.mul $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_mul_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_mul_v4i32: +; SIMD128: .functype pairwise_mul_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.mul $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.mul $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_mul_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_mul_v8i16: +; SIMD128: .functype pairwise_mul_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; 
SIMD128-NEXT: i16x8.mul $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.mul $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_mul_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_mul_v16i8: +; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0 +; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: local.tee $push31=, $1=, $pop32 +; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0 +; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25 +; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4 +; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4 +; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 +; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24 +; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2 +; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2 +; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 +; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6 +; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6 +; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15 +; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17 +; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21 +; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1 +; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1 +; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 +; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5 +; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5 +; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 +; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9 +; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3 +; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3 +; 
SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7 +; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7 +; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 +; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2 +; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6 +; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14 +; SIMD128-NEXT: return $pop30 + %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_and_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_and_v2i64: +; SIMD128: .functype pairwise_and_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.and $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_and_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_and_v4i32: +; SIMD128: .functype pairwise_and_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.and $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.and $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_and_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_and_v8i16: +; SIMD128: .functype pairwise_and_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; 
SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.and $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_and_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_and_v16i8: +; SIMD128: .functype pairwise_and_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.and $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_or_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_or_v2i64: +; SIMD128: .functype pairwise_or_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.or $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 
@llvm.vector.reduce.or.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_or_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_or_v4i32: +; SIMD128: .functype pairwise_or_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.or $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.or $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_or_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_or_v8i16: +; SIMD128: .functype pairwise_or_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.or $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_or_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_or_v16i8: +; SIMD128: .functype pairwise_or_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle 
$push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.or $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_xor_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_xor_v2i64: +; SIMD128: .functype pairwise_xor_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: v128.xor $push1=, $0, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_xor_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_xor_v4i32: +; SIMD128: .functype pairwise_xor_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.xor $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: v128.xor $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_xor_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_xor_v8i16: +; SIMD128: .functype pairwise_xor_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # 
%bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: v128.xor $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_xor_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_xor_v16i8: +; SIMD128: .functype pairwise_xor_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: v128.xor $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_smax_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_smax_v2i64: +; SIMD128: .functype pairwise_smax_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 
10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: i64x2.gt_s $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_smax_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_smax_v4i32: +; SIMD128: .functype pairwise_smax_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_s $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_s $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_smax_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_smax_v8i16: +; SIMD128: .functype pairwise_smax_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_s $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_smax_v16i8(<16 x i8> %arg) { +; 
SIMD128-LABEL: pairwise_smax_v16i8: +; SIMD128: .functype pairwise_smax_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_s $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_smin_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_smin_v2i64: +; SIMD128: .functype pairwise_smin_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: i64x2.lt_s $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_smin_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_smin_v4i32: +; SIMD128: .functype pairwise_smin_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_s $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; 
SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_s $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_smin_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_smin_v8i16: +; SIMD128: .functype pairwise_smin_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_s $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_smin_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_smin_v16i8: +; SIMD128: .functype pairwise_smin_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle 
$push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_s $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_umax_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_umax_v2i64: +; SIMD128: .functype pairwise_umax_v2i64 (v128) -> (i64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.gt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 + %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_umax_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_umax_v4i32: +; SIMD128: .functype pairwise_umax_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_u $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.max_u $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_umax_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_umax_v8i16: +; SIMD128: .functype 
pairwise_umax_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, $pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.max_u $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_umax_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_umax_v16i8: +; SIMD128: .functype pairwise_umax_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.max_u $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define i64 @pairwise_umin_v2i64(<2 x i64> %arg) { +; SIMD128-LABEL: pairwise_umin_v2i64: +; SIMD128: .functype pairwise_umin_v2i64 (v128) -> (i64) 
+; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.lt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 + %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg) + ret i64 %res +} + +define i32 @pairwise_umin_v4i32(<4 x i32> %arg) { +; SIMD128-LABEL: pairwise_umin_v4i32: +; SIMD128: .functype pairwise_umin_v4i32 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_u $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: i32x4.min_u $push2=, $pop4, $pop1 +; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arg) + ret i32 %res +} + +define i16 @pairwise_umin_v8i16(<8 x i16> %arg) { +; SIMD128-LABEL: pairwise_umin_v8i16: +; SIMD128: .functype pairwise_umin_v8i16 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push8=, $0, $pop0 +; SIMD128-NEXT: local.tee $push7=, $0=, $pop8 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push6=, $pop7, $pop1 +; SIMD128-NEXT: local.tee $push5=, $0=, 
$pop6 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +; SIMD128-NEXT: i16x8.min_u $push3=, $pop5, $pop2 +; SIMD128-NEXT: i16x8.extract_lane_u $push4=, $pop3, 0 +; SIMD128-NEXT: return $pop4 + %res = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arg) + ret i16 %res +} + +define i8 @pairwise_umin_v16i8(<16 x i8> %arg) { +; SIMD128-LABEL: pairwise_umin_v16i8: +; SIMD128: .functype pairwise_umin_v16i8 (v128) -> (i32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push11=, $0, $pop0 +; SIMD128-NEXT: local.tee $push10=, $0=, $pop11 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push9=, $pop10, $pop1 +; SIMD128-NEXT: local.tee $push8=, $0=, $pop9 +; SIMD128-NEXT: i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push7=, $pop8, $pop2 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; SIMD128-NEXT: i8x16.min_u $push4=, $pop6, $pop3 +; SIMD128-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0 +; SIMD128-NEXT: return $pop5 + %res = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arg) + ret i8 %res +} + +define double @pairwise_add_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_add_v2f64: +; SIMD128: .functype pairwise_add_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.add $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define double @pairwise_add_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_add_v2f64_fast: +; SIMD128: .functype 
pairwise_add_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: f64x2.add $push1=, $0, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define float @pairwise_add_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32: +; SIMD128: .functype pairwise_add_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.add $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.add $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.add $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_add_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32_fast: +; SIMD128: .functype pairwise_add_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_add_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_add_v4f32_reassoc: +; SIMD128: .functype pairwise_add_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; 
SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push5=, $0, $pop0 +; SIMD128-NEXT: local.tee $push4=, $0=, $pop5 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.add $push2=, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define double @pairwise_mul_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_mul_v2f64: +; SIMD128: .functype pairwise_mul_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 0 +; SIMD128-NEXT: f64.const $push1=, -0x0p0 +; SIMD128-NEXT: f64.mul $push2=, $pop0, $pop1 +; SIMD128-NEXT: f64x2.extract_lane $push3=, $0, 1 +; SIMD128-NEXT: f64.mul $push4=, $pop2, $pop3 +; SIMD128-NEXT: return $pop4 + %res = tail call double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define double @pairwise_mul_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_mul_v2f64_fast: +; SIMD128: .functype pairwise_mul_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64.const $push0=, 0x0p0 +; SIMD128-NEXT: return $pop0 + %res = tail call fast double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg) + ret double%res +} + +define float @pairwise_mul_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32: +; SIMD128: .functype pairwise_mul_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 0 +; SIMD128-NEXT: f32.const $push1=, -0x0p0 +; SIMD128-NEXT: f32.mul $push2=, $pop0, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 1 +; SIMD128-NEXT: f32.mul $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 2 +; SIMD128-NEXT: f32.mul $push6=, $pop4, $pop5 +; 
SIMD128-NEXT: f32x4.extract_lane $push7=, $0, 3 +; SIMD128-NEXT: f32.mul $push8=, $pop6, $pop7 +; SIMD128-NEXT: return $pop8 + %res = tail call float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_mul_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32_fast: +; SIMD128: .functype pairwise_mul_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32.const $push0=, 0x0p0 +; SIMD128-NEXT: return $pop0 + %res = tail call fast float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define float @pairwise_mul_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_mul_v4f32_reassoc: +; SIMD128: .functype pairwise_mul_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.mul $push7=, $0, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: f32x4.mul $push2=, $pop6, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: f32.const $push4=, -0x0p0 +; SIMD128-NEXT: f32.mul $push5=, $pop3, $pop4 +; SIMD128-NEXT: return $pop5 + %res = tail call reassoc float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg) + ret float %res +} + +define double @pairwise_max_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_max_v2f64: +; SIMD128: .functype pairwise_max_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: call $push2=, fmax, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_max_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_max_v2f64_fast: +; 
SIMD128: .functype pairwise_max_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: f64x2.gt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_max_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32: +; SIMD128: .functype pairwise_max_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fmaxf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fmaxf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fmaxf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_max_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32_fast: +; SIMD128: .functype pairwise_max_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push8=, $1=, $pop9 +; SIMD128-NEXT: f32x4.gt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop8, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push4=, $1=, $pop5 +; SIMD128-NEXT: f32x4.gt $push1=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push2=, $pop6, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res 
= tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_max_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_max_v4f32_reassoc: +; SIMD128: .functype pairwise_max_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fmaxf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fmaxf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fmaxf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_min_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_min_v2f64: +; SIMD128: .functype pairwise_min_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: call $push2=, fmin, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_min_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_min_v2f64_fast: +; SIMD128: .functype pairwise_min_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push3=, $1=, $pop4 +; SIMD128-NEXT: f64x2.lt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push1=, $0, $pop3, $pop0 +; SIMD128-NEXT: f64x2.extract_lane $push2=, $pop1, 0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_min_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32: +; SIMD128: .functype 
pairwise_min_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fminf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fminf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, fminf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_min_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32_fast: +; SIMD128: .functype pairwise_min_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push8=, $1=, $pop9 +; SIMD128-NEXT: f32x4.lt $push0=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop8, $pop0 +; SIMD128-NEXT: local.tee $push6=, $0=, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +; SIMD128-NEXT: local.tee $push4=, $1=, $pop5 +; SIMD128-NEXT: f32x4.lt $push1=, $0, $1 +; SIMD128-NEXT: v128.bitselect $push2=, $pop6, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $pop2, 0 +; SIMD128-NEXT: return $pop3 + %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_min_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_min_v4f32_reassoc: +; SIMD128: .functype pairwise_min_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push2=, $0, 1 +; SIMD128-NEXT: call $push4=, fminf, $pop3, $pop2 +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 2 +; SIMD128-NEXT: call $push5=, fminf, $pop4, $pop1 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 3 +; SIMD128-NEXT: call $push6=, 
fminf, $pop5, $pop0 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_maximum_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_maximum_v2f64: +; SIMD128: .functype pairwise_maximum_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_maximum_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_maximum_v2f64_fast: +; SIMD128: .functype pairwise_maximum_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_maximum_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32: +; SIMD128: .functype pairwise_maximum_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_maximum_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32_fast: +; SIMD128: .functype pairwise_maximum_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: 
f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_maximum_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_maximum_v4f32_reassoc: +; SIMD128: .functype pairwise_maximum_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.max $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.max $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.max $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg) + ret float %res +} + +define double @pairwise_minimum_v2f64(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_minimum_v2f64: +; SIMD128: .functype pairwise_minimum_v2f64 (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: return $pop2 + %res = tail call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg) + ret double%res +} + +define double @pairwise_minimum_v2f64_fast(<2 x double> %arg) { +; SIMD128-LABEL: pairwise_minimum_v2f64_fast: +; SIMD128: .functype pairwise_minimum_v2f64_fast (v128) -> (f64) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f64x2.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f64.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: 
return $pop2 + %res = tail call fast double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg) + ret double%res +} + +define float @pairwise_minimum_v4f32(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32: +; SIMD128: .functype pairwise_minimum_v4f32 (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_minimum_v4f32_fast(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32_fast: +; SIMD128: .functype pairwise_minimum_v4f32_fast (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min $push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call fast float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +} + +define float @pairwise_minimum_v4f32_reassoc(<4 x float> %arg) { +; SIMD128-LABEL: pairwise_minimum_v4f32_reassoc: +; SIMD128: .functype pairwise_minimum_v4f32_reassoc (v128) -> (f32) +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: f32x4.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: f32x4.extract_lane $push0=, $0, 1 +; SIMD128-NEXT: f32.min $push2=, $pop1, $pop0 +; SIMD128-NEXT: f32x4.extract_lane $push3=, $0, 2 +; SIMD128-NEXT: f32.min $push4=, $pop2, $pop3 +; SIMD128-NEXT: f32x4.extract_lane $push5=, $0, 3 +; SIMD128-NEXT: f32.min 
$push6=, $pop4, $pop5 +; SIMD128-NEXT: return $pop6 + %res = tail call reassoc float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg) + ret float %res +}