diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index dcdd9f82cde8e..bda9d4e624505 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1705,6 +1705,13 @@ class TargetTransformInfo {
   /// into a shuffle sequence.
   bool shouldExpandReduction(const IntrinsicInst *II) const;
 
+  enum struct ReductionShuffle { SplitHalf, Pairwise };
+
+  /// \returns The shuffle sequence pattern used to expand the given reduction
+  /// intrinsic.
+  ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;
+
   /// \returns the size cost of rematerializing a GlobalValue address relative
   /// to a stack reload.
   unsigned getGISelRematGlobalCost() const;
@@ -2156,6 +2163,8 @@ class TargetTransformInfo::Concept {
   virtual bool preferEpilogueVectorization() const = 0;
 
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
+  virtual ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const = 0;
   virtual unsigned getGISelRematGlobalCost() const = 0;
   virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
   virtual bool enableScalableVectorization() const = 0;
@@ -2898,6 +2907,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.shouldExpandReduction(II);
   }
 
+  ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override {
+    return Impl.getPreferredExpandedReductionShuffle(II);
+  }
+
   unsigned getGISelRematGlobalCost() const override {
     return Impl.getGISelRematGlobalCost();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 01624de190d51..c1eb6151440be 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -936,6 +936,11 @@ class TargetTransformInfoImplBase {
 
   bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
 
+  TTI::ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const {
+    return TTI::ReductionShuffle::SplitHalf;
+  }
+
   unsigned getGISelRematGlobalCost() const { return 1; }
 
   unsigned getMinTripCountTailFoldingThreshold() const { return 0; }
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 1a878126aa082..b01a447f3c28b 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -15,6 +15,7 @@
 
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/VectorBuilder.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
@@ -385,6 +386,7 @@ Value *getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
 /// Generates a vector reduction using shufflevectors to reduce the value.
 /// Fast-math-flags are propagated using the IRBuilder's setting.
 Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
+                           TargetTransformInfo::ReductionShuffle RS,
                            RecurKind MinMaxKind = RecurKind::None);
 
 /// Create a target reduction of the given vector. The reduction operation
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c175d1737e54b..be4069bb3eabf 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1317,6 +1317,12 @@ bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
 
+TargetTransformInfo::ReductionShuffle
+TargetTransformInfo::getPreferredExpandedReductionShuffle(
+    const IntrinsicInst *II) const {
+  return TTIImpl->getPreferredExpandedReductionShuffle(II);
+}
+
 unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
   return TTIImpl->getGISelRematGlobalCost();
 }
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index 0b1504e51b1bb..d6778ec666cbe 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -59,6 +59,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
     Intrinsic::ID ID = II->getIntrinsicID();
     RecurKind RK = getMinMaxReductionRecurKind(ID);
+    TargetTransformInfo::ReductionShuffle RS =
+        TTI->getPreferredExpandedReductionShuffle(II);
 
     Value *Rdx = nullptr;
     IRBuilder<> Builder(II);
@@ -79,7 +81,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         if (!isPowerOf2_32(
                 cast<FixedVectorType>(Vec->getType())->getNumElements()))
           continue;
-        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
         Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
                                   "bin.rdx");
       }
@@ -112,7 +114,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
         break;
       }
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     case Intrinsic::vector_reduce_add:
@@ -127,7 +129,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
               cast<FixedVectorType>(Vec->getType())->getNumElements()))
         continue;
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     case Intrinsic::vector_reduce_fmax:
@@ -140,7 +142,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
           !FMF.noNaNs())
         continue;
       unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
-      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RK);
+      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
       break;
     }
     }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 9a434d9b1db54..b109594811d97 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -94,6 +94,18 @@ WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return Cost;
 }
 
+TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
+    const IntrinsicInst *II) const {
+  // vector_reduce_fadd is the only reduction intrinsic that prefers the
+  // pairwise shuffle sequence; every other intrinsic uses split-half.
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_reduce_fadd:
+    return TTI::ReductionShuffle::Pairwise;
+  default:
+    return TTI::ReductionShuffle::SplitHalf;
+  }
+}
+
 bool WebAssemblyTTIImpl::areInlineCompatible(const Function *Caller,
                                              const Function *Callee) const {
   // Allow inlining only when the Callee has a subset of the Caller's
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index e10f0928ed531..269922cc3ea84 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -70,6 +70,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 
+  TTI::ReductionShuffle
+  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const;
   /// @}
 
   bool areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index ff93035ce0652..4609376a748f9 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1077,7 +1077,9 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
 
 // Helper to generate a log2 shuffle reduction.
 Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
-                                 unsigned Op, RecurKind RdxKind) {
+                                 unsigned Op,
+                                 TargetTransformInfo::ReductionShuffle RS,
+                                 RecurKind RdxKind) {
   unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
@@ -1091,18 +1093,10 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
   // will never be relevant here.  Note that it would be generally unsound to
   // propagate these from an intrinsic call to the expansion anyways as we/
   // change the order of operations.
-  Value *TmpVec = Src;
-  SmallVector<int, 32> ShuffleMask(VF);
-  for (unsigned i = VF; i != 1; i >>= 1) {
-    // Move the upper half of the vector to the lower half.
-    for (unsigned j = 0; j != i / 2; ++j)
-      ShuffleMask[j] = i / 2 + j;
-
-    // Fill the rest of the mask with undef.
-    std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
-
+  auto BuildShuffledOp = [&Builder, &Op,
+                          &RdxKind](SmallVectorImpl<int> &ShuffleMask,
+                                    Value *&TmpVec) -> void {
     Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
-
     if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
       TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                    "bin.rdx");
@@ -1111,6 +1105,30 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
              "Invalid min/max");
       TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf);
     }
+  };
+
+  Value *TmpVec = Src;
+  if (TargetTransformInfo::ReductionShuffle::Pairwise == RS) {
+    SmallVector<int, 32> ShuffleMask(VF);
+    for (unsigned stride = 1; stride < VF; stride <<= 1) {
+      // Initialise the mask with undef.
+      std::fill(ShuffleMask.begin(), ShuffleMask.end(), -1);
+      for (unsigned j = 0; j < VF; j += stride << 1) {
+        ShuffleMask[j] = j + stride;
+      }
+      BuildShuffledOp(ShuffleMask, TmpVec);
+    }
+  } else {
+    SmallVector<int, 32> ShuffleMask(VF);
+    for (unsigned i = VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = i / 2 + j;
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
+      BuildShuffledOp(ShuffleMask, TmpVec);
+    }
   }
   // The result is in the first element of the vector.
   return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
new file mode 100644
index 0000000000000..4b1a1a8a0c5b6
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -0,0 +1,1074 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128
+
+define i64 @pairwise_add_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_add_v2i64:
+; SIMD128:         .functype pairwise_add_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    i64x2.add $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_add_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_add_v4i32:
+; SIMD128:         .functype pairwise_add_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_add_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_add_v8i16:
+; SIMD128:         .functype pairwise_add_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.add $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_add_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_add_v16i8:
+; SIMD128:         .functype pairwise_add_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.add $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_mul_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2i64:
+; SIMD128:         .functype pairwise_mul_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    i64x2.mul $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_mul_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4i32:
+; SIMD128:         .functype pairwise_mul_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.mul $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.mul $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_mul_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_mul_v8i16:
+; SIMD128:         .functype pairwise_mul_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.mul $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_mul_v16i8:
+; SIMD128:         .functype pairwise_mul_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.extract_lane_u $push26=, $0, 0
+; SIMD128-NEXT:    i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    local.tee $push31=, $1=, $pop32
+; SIMD128-NEXT:    i8x16.extract_lane_u $push25=, $pop31, 0
+; SIMD128-NEXT:    i32.mul $push27=, $pop26, $pop25
+; SIMD128-NEXT:    i8x16.extract_lane_u $push23=, $0, 4
+; SIMD128-NEXT:    i8x16.extract_lane_u $push22=, $1, 4
+; SIMD128-NEXT:    i32.mul $push24=, $pop23, $pop22
+; SIMD128-NEXT:    i32.mul $push28=, $pop27, $pop24
+; SIMD128-NEXT:    i8x16.extract_lane_u $push19=, $0, 2
+; SIMD128-NEXT:    i8x16.extract_lane_u $push18=, $1, 2
+; SIMD128-NEXT:    i32.mul $push20=, $pop19, $pop18
+; SIMD128-NEXT:    i8x16.extract_lane_u $push16=, $0, 6
+; SIMD128-NEXT:    i8x16.extract_lane_u $push15=, $1, 6
+; SIMD128-NEXT:    i32.mul $push17=, $pop16, $pop15
+; SIMD128-NEXT:    i32.mul $push21=, $pop20, $pop17
+; SIMD128-NEXT:    i32.mul $push29=, $pop28, $pop21
+; SIMD128-NEXT:    i8x16.extract_lane_u $push11=, $0, 1
+; SIMD128-NEXT:    i8x16.extract_lane_u $push10=, $1, 1
+; SIMD128-NEXT:    i32.mul $push12=, $pop11, $pop10
+; SIMD128-NEXT:    i8x16.extract_lane_u $push8=, $0, 5
+; SIMD128-NEXT:    i8x16.extract_lane_u $push7=, $1, 5
+; SIMD128-NEXT:    i32.mul $push9=, $pop8, $pop7
+; SIMD128-NEXT:    i32.mul $push13=, $pop12, $pop9
+; SIMD128-NEXT:    i8x16.extract_lane_u $push4=, $0, 3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push3=, $1, 3
+; SIMD128-NEXT:    i32.mul $push5=, $pop4, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push1=, $0, 7
+; SIMD128-NEXT:    i8x16.extract_lane_u $push0=, $1, 7
+; SIMD128-NEXT:    i32.mul $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i32.mul $push6=, $pop5, $pop2
+; SIMD128-NEXT:    i32.mul $push14=, $pop13, $pop6
+; SIMD128-NEXT:    i32.mul $push30=, $pop29, $pop14
+; SIMD128-NEXT:    return $pop30
+  %res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_and_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_and_v2i64:
+; SIMD128:         .functype pairwise_and_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.and $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_and_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_and_v4i32:
+; SIMD128:         .functype pairwise_and_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.and $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.and $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_and_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_and_v8i16:
+; SIMD128:         .functype pairwise_and_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.and $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_and_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_and_v16i8:
+; SIMD128:         .functype pairwise_and_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.and $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_or_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_or_v2i64:
+; SIMD128:         .functype pairwise_or_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.or $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_or_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_or_v4i32:
+; SIMD128:         .functype pairwise_or_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.or $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.or $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_or_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_or_v8i16:
+; SIMD128:         .functype pairwise_or_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.or $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_or_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_or_v16i8:
+; SIMD128:         .functype pairwise_or_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.or $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_xor_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_xor_v2i64:
+; SIMD128:         .functype pairwise_xor_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    v128.xor $push1=, $0, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_xor_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_xor_v4i32:
+; SIMD128:         .functype pairwise_xor_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.xor $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    v128.xor $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_xor_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_xor_v8i16:
+; SIMD128:         .functype pairwise_xor_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    v128.xor $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_xor_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_xor_v16i8:
+; SIMD128:         .functype pairwise_xor_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    v128.xor $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_smax_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_smax_v2i64:
+; SIMD128:         .functype pairwise_smax_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    i64x2.gt_s $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_smax_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_smax_v4i32:
+; SIMD128:         .functype pairwise_smax_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_s $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_s $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_smax_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_smax_v8i16:
+; SIMD128:         .functype pairwise_smax_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_s $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_smax_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_smax_v16i8:
+; SIMD128:         .functype pairwise_smax_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_s $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_smin_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_smin_v2i64:
+; SIMD128:         .functype pairwise_smin_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    i64x2.lt_s $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_smin_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_smin_v4i32:
+; SIMD128:         .functype pairwise_smin_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_s $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_s $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_smin_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_smin_v8i16:
+; SIMD128:         .functype pairwise_smin_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_s $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_smin_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_smin_v16i8:
+; SIMD128:         .functype pairwise_smin_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_s $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_umax_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_umax_v2i64:
+; SIMD128:         .functype pairwise_umax_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT:    i64.const $push4=, -1
+; SIMD128-NEXT:    i64.const $push3=, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 0
+; SIMD128-NEXT:    i64.gt_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64.select $push5=, $pop4, $pop3, $pop2
+; SIMD128-NEXT:    i64x2.replace_lane $push6=, $0, 0, $pop5
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop9, $pop6
+; SIMD128-NEXT:    i64x2.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT:    return $pop8
+  %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_umax_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_umax_v4i32:
+; SIMD128:         .functype pairwise_umax_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_u $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.max_u $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_umax_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_umax_v8i16:
+; SIMD128:         .functype pairwise_umax_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.max_u $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_umax_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_umax_v16i8:
+; SIMD128:         .functype pairwise_umax_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.max_u $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define i64 @pairwise_umin_v2i64(<2 x i64> %arg) {
+; SIMD128-LABEL: pairwise_umin_v2i64:
+; SIMD128:         .functype pairwise_umin_v2i64 (v128) -> (i64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT:    i64.const $push4=, -1
+; SIMD128-NEXT:    i64.const $push3=, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    i64x2.extract_lane $push0=, $1, 0
+; SIMD128-NEXT:    i64.lt_u $push2=, $pop1, $pop0
+; SIMD128-NEXT:    i64.select $push5=, $pop4, $pop3, $pop2
+; SIMD128-NEXT:    i64x2.replace_lane $push6=, $0, 0, $pop5
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop9, $pop6
+; SIMD128-NEXT:    i64x2.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT:    return $pop8
+  %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg)
+  ret i64 %res
+}
+
+define i32 @pairwise_umin_v4i32(<4 x i32> %arg) {
+; SIMD128-LABEL: pairwise_umin_v4i32:
+; SIMD128:         .functype pairwise_umin_v4i32 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_u $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    i32x4.min_u $push2=, $pop4, $pop1
+; SIMD128-NEXT:    i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arg)
+  ret i32 %res
+}
+
+define i16 @pairwise_umin_v8i16(<8 x i16> %arg) {
+; SIMD128-LABEL: pairwise_umin_v8i16:
+; SIMD128:         .functype pairwise_umin_v8i16 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push8=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push7=, $0=, $pop8
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push6=, $pop7, $pop1
+; SIMD128-NEXT:    local.tee $push5=, $0=, $pop6
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i16x8.min_u $push3=, $pop5, $pop2
+; SIMD128-NEXT:    i16x8.extract_lane_u $push4=, $pop3, 0
+; SIMD128-NEXT:    return $pop4
+  %res = tail call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arg)
+  ret i16 %res
+}
+
+define i8 @pairwise_umin_v16i8(<16 x i8> %arg) {
+; SIMD128-LABEL: pairwise_umin_v16i8:
+; SIMD128:         .functype pairwise_umin_v16i8 (v128) -> (i32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push11=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push10=, $0=, $pop11
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push9=, $pop10, $pop1
+; SIMD128-NEXT:    local.tee $push8=, $0=, $pop9
+; SIMD128-NEXT:    i8x16.shuffle $push2=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push7=, $pop8, $pop2
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push3=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i8x16.min_u $push4=, $pop6, $pop3
+; SIMD128-NEXT:    i8x16.extract_lane_u $push5=, $pop4, 0
+; SIMD128-NEXT:    return $pop5
+  %res = tail call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arg)
+  ret i8 %res
+}
+
+define double @pairwise_add_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_add_v2f64:
+; SIMD128:         .functype pairwise_add_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.add $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_add_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_add_v2f64_fast:
+; SIMD128:         .functype pairwise_add_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    f64x2.add $push1=, $0, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_add_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32:
+; SIMD128:         .functype pairwise_add_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.add $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.add $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.add $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_add_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32_fast:
+; SIMD128:         .functype pairwise_add_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_add_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_add_v4f32_reassoc:
+; SIMD128:         .functype pairwise_add_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push5=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push4=, $0=, $pop5
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_mul_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2f64:
+; SIMD128:         .functype pairwise_mul_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    f64.const $push1=, -0x0p0
+; SIMD128-NEXT:    f64.mul $push2=, $pop0, $pop1
+; SIMD128-NEXT:    f64x2.extract_lane $push3=, $0, 1
+; SIMD128-NEXT:    f64.mul $push4=, $pop2, $pop3
+; SIMD128-NEXT:    return $pop4
+  %res = tail call double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_mul_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_mul_v2f64_fast:
+; SIMD128:         .functype pairwise_mul_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64.const $push0=, 0x0p0
+; SIMD128-NEXT:    return $pop0
+  %res = tail call fast double @llvm.vector.reduce.fmul.v2f64(double -0.0, <2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_mul_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32:
+; SIMD128:         .functype pairwise_mul_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 0
+; SIMD128-NEXT:    f32.const $push1=, -0x0p0
+; SIMD128-NEXT:    f32.mul $push2=, $pop0, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 1
+; SIMD128-NEXT:    f32.mul $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 2
+; SIMD128-NEXT:    f32.mul $push6=, $pop4, $pop5
+; SIMD128-NEXT:    f32x4.extract_lane $push7=, $0, 3
+; SIMD128-NEXT:    f32.mul $push8=, $pop6, $pop7
+; SIMD128-NEXT:    return $pop8
+  %res = tail call float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_mul_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32_fast:
+; SIMD128:         .functype pairwise_mul_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32.const $push0=, 0x0p0
+; SIMD128-NEXT:    return $pop0
+  %res = tail call fast float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_mul_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_mul_v4f32_reassoc:
+; SIMD128:         .functype pairwise_mul_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.mul $push7=, $0, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push1=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    f32x4.mul $push2=, $pop6, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    f32.const $push4=, -0x0p0
+; SIMD128-NEXT:    f32.mul $push5=, $pop3, $pop4
+; SIMD128-NEXT:    return $pop5
+  %res = tail call reassoc float @llvm.vector.reduce.fmul.v4f32(float -0.0, <4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_max_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_max_v2f64:
+; SIMD128:         .functype pairwise_max_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    call $push2=, fmax, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_max_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_max_v2f64_fast:
+; SIMD128:         .functype pairwise_max_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    f64x2.gt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_max_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32:
+; SIMD128:         .functype pairwise_max_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fmaxf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fmaxf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fmaxf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_max_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32_fast:
+; SIMD128:         .functype pairwise_max_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
+; SIMD128-NEXT:    f32x4.gt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
+; SIMD128-NEXT:    f32x4.gt $push1=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_max_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_max_v4f32_reassoc:
+; SIMD128:         .functype pairwise_max_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fmaxf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fmaxf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fmaxf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_min_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_min_v2f64:
+; SIMD128:         .functype pairwise_min_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    call $push2=, fmin, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_min_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_min_v2f64_fast:
+; SIMD128:         .functype pairwise_min_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push4=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT:    local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT:    f64x2.lt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push1=, $0, $pop3, $pop0
+; SIMD128-NEXT:    f64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_min_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32:
+; SIMD128:         .functype pairwise_min_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fminf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fminf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fminf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_min_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32_fast:
+; SIMD128:         .functype pairwise_min_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push9=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push8=, $1=, $pop9
+; SIMD128-NEXT:    f32x4.lt $push0=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push7=, $0, $pop8, $pop0
+; SIMD128-NEXT:    local.tee $push6=, $0=, $pop7
+; SIMD128-NEXT:    i8x16.shuffle $push5=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT:    local.tee $push4=, $1=, $pop5
+; SIMD128-NEXT:    f32x4.lt $push1=, $0, $1
+; SIMD128-NEXT:    v128.bitselect $push2=, $pop6, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT:    return $pop3
+  %res = tail call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_min_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_min_v4f32_reassoc:
+; SIMD128:         .functype pairwise_min_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push2=, $0, 1
+; SIMD128-NEXT:    call $push4=, fminf, $pop3, $pop2
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 2
+; SIMD128-NEXT:    call $push5=, fminf, $pop4, $pop1
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 3
+; SIMD128-NEXT:    call $push6=, fminf, $pop5, $pop0
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_maximum_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v2f64:
+; SIMD128:         .functype pairwise_maximum_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_maximum_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v2f64_fast:
+; SIMD128:         .functype pairwise_maximum_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_maximum_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32:
+; SIMD128:         .functype pairwise_maximum_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_maximum_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32_fast:
+; SIMD128:         .functype pairwise_maximum_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call fast float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_maximum_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_maximum_v4f32_reassoc:
+; SIMD128:         .functype pairwise_maximum_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.max $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.max $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.max $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define double @pairwise_minimum_v2f64(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v2f64:
+; SIMD128:         .functype pairwise_minimum_v2f64 (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define double @pairwise_minimum_v2f64_fast(<2 x double> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v2f64_fast:
+; SIMD128:         .functype pairwise_minimum_v2f64_fast (v128) -> (f64)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f64x2.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f64x2.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f64.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+  %res = tail call fast double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %arg)
+  ret double %res
+}
+
+define float @pairwise_minimum_v4f32(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32:
+; SIMD128:         .functype pairwise_minimum_v4f32 (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_minimum_v4f32_fast(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32_fast:
+; SIMD128:         .functype pairwise_minimum_v4f32_fast (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call fast float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}
+
+define float @pairwise_minimum_v4f32_reassoc(<4 x float> %arg) {
+; SIMD128-LABEL: pairwise_minimum_v4f32_reassoc:
+; SIMD128:         .functype pairwise_minimum_v4f32_reassoc (v128) -> (f32)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    f32x4.extract_lane $push1=, $0, 0
+; SIMD128-NEXT:    f32x4.extract_lane $push0=, $0, 1
+; SIMD128-NEXT:    f32.min $push2=, $pop1, $pop0
+; SIMD128-NEXT:    f32x4.extract_lane $push3=, $0, 2
+; SIMD128-NEXT:    f32.min $push4=, $pop2, $pop3
+; SIMD128-NEXT:    f32x4.extract_lane $push5=, $0, 3
+; SIMD128-NEXT:    f32.min $push6=, $pop4, $pop5
+; SIMD128-NEXT:    return $pop6
+  %res = tail call reassoc float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %arg)
+  ret float %res
+}