From a9baf1ecfd343e2d8d8fa277c8b093a1869726bb Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer
Date: Sat, 21 Sep 2013 01:06:00 +0000
Subject: [PATCH] Reapply "SLPVectorizer: Handle more horizontal reductions (disabled)"

Reapply r191108 with a fix for a memory corruption error I introduced. Of
course, we can't reference the scalars that we replace by vectorizing and then
call their eraseFromParent method. I only 'needed' the scalars to get the
DebugLoc. Just store the DebugLoc before actually vectorizing instead. As a
nice side effect, this also simplifies the interface between BoUpSLP and the
HorizontalReduction class to returning a value pointer (the vectorized tree
root).

radar://14607682

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191123 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    | 372 +++++++++++++++-
 .../SLPVectorizer/X86/horizontal.ll           | 415 ++++++++++++++++++
 2 files changed, 779 insertions(+), 8 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/horizontal.ll

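Note that the new transform is gated off by default (the flag added below is
cl::init(false)), so this patch changes nothing unless horizontal-reduction
vectorization is explicitly requested. A typical invocation to exercise it,
mirroring the RUN line of the new test (the input file name here is just a
placeholder):

    opt -slp-vectorizer -slp-vectorize-hor -S input.ll -mtriple=x86_64-apple-macosx -mcpu=corei7-avx

The -mtriple/-mcpu pair only matters in that the target cost model has to
report the reduction as profitable.
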
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cd3f723cd3e..053e08e4975 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -49,6 +49,11 @@ static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+                   cl::desc("Attempt to vectorize horizontal reductions"));
+
 namespace {
 
 static const unsigned MinVecRegSize = 128;
@@ -238,17 +243,20 @@ public:
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
-  void vectorizeTree();
+  /// Returns the vectorized root.
+  Value *vectorizeTree();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
 
-  /// Construct a vectorizable tree that starts at \p Roots.
-  void buildTree(ArrayRef<Value *> Roots);
+  /// Construct a vectorizable tree that starts at \p Roots and is possibly
+  /// used by a reduction of \p RdxOps.
+  void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
 
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    RdxOps = 0;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -401,6 +409,9 @@ private:
   /// Numbers instructions in different blocks.
   DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
 
+  /// Reduction operators.
+  ValueSet *RdxOps;
+
   // Analysis and block reference.
   Function *F;
   ScalarEvolution *SE;
@@ -413,8 +424,9 @@ private:
   IRBuilder<> Builder;
 };
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
   deleteTree();
+  RdxOps = Rdx;
   if (!getSameType(Roots))
     return;
   buildTree_rec(Roots, 0);
@@ -445,8 +457,12 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
         assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
         continue;
       }
+      Instruction *UserInst = dyn_cast<Instruction>(*User);
+      if (!UserInst)
+        continue;
 
-      if (!isa<Instruction>(*User))
+      // Ignore uses that are part of the reduction.
+      if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
         continue;
 
       DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
@@ -578,6 +594,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         continue;
       }
 
+      // This user is part of the reduction.
+      if (RdxOps && RdxOps->count(User))
+        continue;
+
       // Make sure that we can schedule this unknown user.
       BlockNumbering &BN = BlocksNumbers[BB];
       int UserIndex = BN.getIndex(User);
@@ -1372,7 +1392,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   return 0;
 }
 
-void BoUpSLP::vectorizeTree() {
+Value *BoUpSLP::vectorizeTree() {
   Builder.SetInsertPoint(F->getEntryBlock().begin());
   vectorizeTree(&VectorizableTree[0]);
 
@@ -1449,7 +1469,10 @@ void BoUpSLP::vectorizeTree() {
         DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
         assert(!MustGather.count(*User) &&
                "Replacing gathered value with undef");
-        assert(ScalarToTreeEntry.count(*User) &&
+
+        assert((ScalarToTreeEntry.count(*User) ||
+                // It is legal to replace the reduction users by undef.
+                (RdxOps && RdxOps->count(*User))) &&
                "Replacing out-of-tree value with undef");
       }
       Value *Undef = UndefValue::get(Ty);
@@ -1464,6 +1487,8 @@ void BoUpSLP::vectorizeTree() {
     BlocksNumbers[it].forget();
   }
   Builder.ClearInsertionPoint();
+
+  return VectorizableTree[0].VectorizedValue;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -1887,6 +1912,308 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   return 0;
 }
 
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+///        vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+///        reduction. A pairwise reduction will generate a mask of
+///        <0,2,...> or <1,3,..> while a splitting reduction will generate
+///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+                                   bool IsPairwise, bool IsLeft,
+                                   IRBuilder<> &Builder) {
+  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+  SmallVector<Constant *, 32> ShuffleMask(
+      VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+  if (IsPairwise)
+    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+  else
+    // Move the upper half of the vector to the lower half.
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+  return ConstantVector::get(ShuffleMask);
+}
+
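For reference, the masks this helper produces are easy to tabulate. The
following standalone sketch (not part of the patch; no LLVM dependencies, with
-1 standing in for undef) mirrors the index arithmetic above:

    // Mirrors createRdxShuffleMask: pairwise masks pick even (left) or odd
    // (right) lanes; splitting masks move the upper NumEltsToRdx lanes down.
    #include <cstdio>
    #include <vector>

    static std::vector<int> rdxMask(unsigned VecLen, unsigned NumEltsToRdx,
                                    bool IsPairwise, bool IsLeft) {
      std::vector<int> Mask(VecLen, -1); // -1 stands in for 'undef'
      for (unsigned i = 0; i != NumEltsToRdx; ++i)
        Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
      return Mask;
    }

    int main() {
      // VecLen = 4, NumEltsToRdx = 2:
      //   pairwise/left  -> 0 2 -1 -1   i.e. <0,2,undef,undef>
      //   pairwise/right -> 1 3 -1 -1   i.e. <1,3,undef,undef>
      //   splitting      -> 2 3 -1 -1   i.e. <2,3,undef,undef>
      for (int M : rdxMask(4, 2, true, true))   printf("%d ", M);
      printf("\n");
      for (int M : rdxMask(4, 2, true, false))  printf("%d ", M);
      printf("\n");
      for (int M : rdxMask(4, 2, false, false)) printf("%d ", M);
      printf("\n");
    }
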
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add
+/// and fadd) that has operations that can be put into a vector as its leaves.
+/// For example, this tree:
+///
+///  mul mul mul mul
+///   \  /    \  /
+///    +       +
+///     \     /
+///        +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+///    ...
+///    \  /
+///     +
+///      \
+///   phi +=
+///
+///  Or:
+///    ...
+///    \  /
+///     +
+///      \
+///   *p =
+///
+class HorizontalReduction {
+  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 32> ReducedVals;
+
+  BinaryOperator *ReductionRoot;
+  PHINode *ReductionPHI;
+
+  /// The opcode of the reduction.
+  unsigned ReductionOpcode;
+  /// The opcode of the values we perform a reduction on.
+  unsigned ReducedValueOpcode;
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+  /// Should we model this reduction as a pairwise reduction tree or a tree
+  /// that splits the vector in halves and adds those halves.
+  bool IsPairwiseReduction;
+
+public:
+  HorizontalReduction()
+    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+  /// \brief Try to find a reduction tree.
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 DataLayout *DL) {
+    assert((!Phi ||
+            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+           "This phi needs to use the binary operator");
+
+    // We could have an initial reduction that is not an add.
+    //  r *= v1 + v2 + v3 + v4
+    // In such a case start looking for a tree rooted in the first '+'.
+    if (Phi) {
+      if (B->getOperand(0) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+      } else if (B->getOperand(1) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+      }
+    }
+
+    if (!B)
+      return false;
+
+    Type *Ty = B->getType();
+    if (Ty->isVectorTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if (ReduxWidth < 4)
+      return false;
+
+    // We currently only support adds.
+    if (ReductionOpcode != Instruction::Add &&
+        ReductionOpcode != Instruction::FAdd)
+      return false;
+
+    // Post order traverse the reduction tree starting at B. We only handle
+    // true trees containing only binary operators.
+    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    Stack.push_back(std::make_pair(B, 0));
+    while (!Stack.empty()) {
+      BinaryOperator *TreeN = Stack.back().first;
+      unsigned EdgeToVist = Stack.back().second++;
+      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+      // Only handle trees in the current basic block.
+      if (TreeN->getParent() != B->getParent())
+        return false;
+
+      // Each tree node needs to have one user except for the ultimate
+      // reduction.
+      if (!TreeN->hasOneUse() && TreeN != B)
+        return false;
+
+      // Postorder visit.
+      if (EdgeToVist == 2 || IsReducedValue) {
+        if (IsReducedValue) {
+          // Make sure that the opcodes of the operations that we are going to
+          // reduce match.
+          if (!ReducedValueOpcode)
+            ReducedValueOpcode = TreeN->getOpcode();
+          else if (ReducedValueOpcode != TreeN->getOpcode())
+            return false;
+          ReducedVals.push_back(TreeN);
+        } else {
+          // We need to be able to reassociate the adds.
+          if (!TreeN->isAssociative())
+            return false;
+          ReductionOps.insert(TreeN);
+        }
+        // Retract.
+        Stack.pop_back();
+        continue;
+      }
+
+      // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVist);
+      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+      if (Next)
+        Stack.push_back(std::make_pair(Next, 0));
+      else if (NextV != Phi)
+        return false;
+    }
+    return true;
+  }
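To make the matched shape concrete, here is a hand-written IR sketch (not taken
from the patch or its tests; value and block names are illustrative) of one
iteration of a four-wide fadd reduction feeding a loop-carried phi. The fmuls
end up in ReducedVals, the fadd chain in ReductionOps; because one operand of
the fadd feeding the phi is the phi itself, the code above re-roots the match
at %r2 and clears ReductionPHI:

    %sum      = phi float [ 0.000000e+00, %ph ], [ %sum.next, %loop ]
    %m0       = fmul fast float %a0, 7.000000e+00
    %m1       = fmul fast float %a1, 7.000000e+00
    %m2       = fmul fast float %a2, 7.000000e+00
    %m3       = fmul fast float %a3, 7.000000e+00
    %r0       = fadd fast float %m0, %m1
    %r1       = fadd fast float %r0, %m2
    %r2       = fadd fast float %r1, %m3
    %sum.next = fadd fast float %sum, %r2
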
+
+  /// \brief Attempt to vectorize the tree found by
+  /// matchAssociativeReduction.
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    if (ReducedVals.empty())
+      return false;
+
+    unsigned NumReducedVals = ReducedVals.size();
+    if (NumReducedVals < ReduxWidth)
+      return false;
+
+    Value *VectorizedTree = 0;
+    IRBuilder<> Builder(ReductionRoot);
+    FastMathFlags Unsafe;
+    Unsafe.setUnsafeAlgebra();
+    Builder.SetFastMathFlags(Unsafe);
+    unsigned i = 0;
+
+    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+      V.buildTree(ValsToReduce, &ReductionOps);
+
+      // Estimate cost.
+      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+      if (Cost >= -SLPCostThreshold)
+        break;
+
+      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+                   << ". (HorRdx)\n");
+
+      // Vectorize a tree.
+      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+      Value *VectorizedRoot = V.vectorizeTree();
+
+      // Emit a reduction.
+      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+      if (VectorizedTree) {
+        Builder.SetCurrentDebugLocation(Loc);
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedSubTree, "bin.rdx");
+      } else
+        VectorizedTree = ReducedSubTree;
+    }
+
+    if (VectorizedTree) {
+      // Finish the reduction.
+      for (; i < NumReducedVals; ++i) {
+        Builder.SetCurrentDebugLocation(
+            cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedVals[i]);
+      }
+      // Update users.
+      if (ReductionPHI) {
+        assert(ReductionRoot != NULL && "Need a reduction operation");
+        ReductionRoot->setOperand(0, VectorizedTree);
+        ReductionRoot->setOperand(1, ReductionPHI);
+      } else
+        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+    }
+    return VectorizedTree != 0;
+  }
+
+private:
+
+  /// \brief Calculate the cost of a reduction.
+  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+    Type *ScalarTy = FirstReducedVal->getType();
+    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+    int ScalarReduxCost =
+        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                 << " for reduction that starts with " << *FirstReducedVal
+                 << " (It is a "
+                 << (IsPairwiseReduction ? "pairwise" : "splitting")
+                 << " reduction)\n");
+
+    return VecReduxCost - ScalarReduxCost;
+  }
+
+  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+                            Value *R, const Twine &Name = "") {
+    if (Opcode == Instruction::FAdd)
+      return Builder.CreateFAdd(L, R, Name);
+    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+  }
+
+  /// \brief Emit a horizontal reduction of the vectorized value.
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    SmallVector<Constant *, 32> ShuffleMask(ReduxWidth, 0);
+    Value *TmpVec = ValToReduce;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+            "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
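As a reading aid (again not part of the patch), with ReduxWidth = 4 and the
splitting strategy chosen by the cost model, emitReduction lowers a vectorized
tree root %vec into roughly the following sequence (value names are
illustrative):

    %rdx.shuf   = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx    = fadd fast <4 x float> %vec, %rdx.shuf
    %rdx.shuf.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx.1  = fadd fast <4 x float> %bin.rdx, %rdx.shuf.1
    %res        = extractelement <4 x float> %bin.rdx.1, i32 0

The pairwise form instead extracts the even and the odd lanes (<0,2,...> and
<1,3,...>) at each step and adds those two shuffles together.
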
 /// \brief Recognize construction of vectors like
 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
@@ -1981,7 +2308,18 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!BI)
         continue;
 
-      Value *Inst = BI->getOperand(0);
+      // Try to match and vectorize a horizontal reduction.
+      HorizontalReduction HorRdx;
+      if (ShouldVectorizeHor &&
+          HorRdx.matchAssociativeReduction(P, BI, DL) &&
+          HorRdx.tryToReduce(R, TTI)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+      Value *Inst = BI->getOperand(0);
 
       if (Inst == P)
         Inst = BI->getOperand(1);
@@ -1991,10 +2329,28 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
+        continue;
       }
+      continue;
     }
 
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it))
+      if (BinaryOperator *BinOp =
+              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+        HorizontalReduction HorRdx;
+        if (ShouldVectorizeHor &&
+            ((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+              HorRdx.tryToReduce(R, TTI)) ||
+             tryToVectorize(BinOp, R))) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
new file mode 100644
index 00000000000..9517066ed2e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -0,0 +1,415 @@
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += 7*A[i*4  ] +
+;            7*A[i*4+1] +
+;            7*A[i*4+2] +
+;            7*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @add_red(float* %A, i32 %n) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+  %mul = shl nsw i64 %i.033, 2
+  %arrayidx = getelementptr inbounds float* %A, i64 %mul
+  %1 = load float* %arrayidx, align 4
+  %mul2 = fmul float %1, 7.000000e+00
+  %add28 = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float* %A, i64 %add28
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul float %2, 7.000000e+00
+  %add6 = fadd fast float %mul2, %mul5
+  %add829 = or i64 %mul, 2
+  %arrayidx9 = getelementptr inbounds float* %A, i64 %add829
+  %3 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %3, 7.000000e+00
+  %add11 = fadd fast float %add6, %mul10
+  %add1330 = or i64 %mul, 3
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
+  %4 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %4, 7.000000e+00
+  %add16 = fadd fast float %add11, %mul15
+  %add17 = fadd fast float %sum.032, %add16
+  %inc = add nsw i64 %i.033, 1
+  %exitcond =
icmp eq i64 %inc, %0 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add17 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum *= B[0]*A[i*4 ] + +; B[1]*A[i*4+1] + +; B[2]*A[i*4+2] + +; B[3]*A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: mul_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp38 = icmp sgt i32 %n, 0 + br i1 %cmp38, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx9, align 4 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx15, align 4 + %4 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ] + %mul = shl nsw i64 %i.040, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %5 = load float* %arrayidx2, align 4 + %mul3 = fmul float %0, %5 + %add35 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add35 + %6 = load float* %arrayidx6, align 4 + %mul7 = fmul float %1, %6 + %add8 = fadd fast float %mul3, %mul7 + %add1136 = or i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136 + %7 = load float* %arrayidx12, align 4 + %mul13 = fmul float %2, %7 + %add14 = fadd fast float %add8, %mul13 + %add1737 = or i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737 + %8 = load float* %arrayidx18, align 4 + %mul19 = fmul float %3, %8 + %add20 = fadd fast float %add14, %mul19 + %mul21 = fmul float %sum.039, %add20 + %inc = add nsw i64 %i.040, 1 + %exitcond = icmp eq i64 %inc, %4 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %mul21 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum += B[0]*A[i*6 ] + +; B[1]*A[i*6+1] + +; B[2]*A[i*6+2] + +; B[3]*A[i*6+3] + +; B[4]*A[i*6+4] + +; B[5]*A[i*6+5] + +; B[6]*A[i*6+6] + +; B[7]*A[i*6+7] + +; B[8]*A[i*6+8]; +; } +; return sum; +; } + +; CHECK-LABEL: long_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp81 = icmp sgt i32 %n, 0 + br i1 %cmp81, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx9, align 4 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx15, align 4 + %arrayidx21 = getelementptr inbounds float* %B, i64 4 + %4 = load float* %arrayidx21, align 4 + %arrayidx27 = getelementptr inbounds float* %B, i64 5 + %5 = load float* %arrayidx27, align 4 + 
%arrayidx33 = getelementptr inbounds float* %B, i64 6 + %6 = load float* %arrayidx33, align 4 + %arrayidx39 = getelementptr inbounds float* %B, i64 7 + %7 = load float* %arrayidx39, align 4 + %arrayidx45 = getelementptr inbounds float* %B, i64 8 + %8 = load float* %arrayidx45, align 4 + %9 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ] + %mul = mul nsw i64 %i.083, 6 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %10 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %10 + %add80 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add80 + %11 = load float* %arrayidx6, align 4 + %mul7 = fmul fast float %1, %11 + %add8 = fadd fast float %mul3, %mul7 + %add11 = add nsw i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add11 + %12 = load float* %arrayidx12, align 4 + %mul13 = fmul fast float %2, %12 + %add14 = fadd fast float %add8, %mul13 + %add17 = add nsw i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add17 + %13 = load float* %arrayidx18, align 4 + %mul19 = fmul fast float %3, %13 + %add20 = fadd fast float %add14, %mul19 + %add23 = add nsw i64 %mul, 4 + %arrayidx24 = getelementptr inbounds float* %A, i64 %add23 + %14 = load float* %arrayidx24, align 4 + %mul25 = fmul fast float %4, %14 + %add26 = fadd fast float %add20, %mul25 + %add29 = add nsw i64 %mul, 5 + %arrayidx30 = getelementptr inbounds float* %A, i64 %add29 + %15 = load float* %arrayidx30, align 4 + %mul31 = fmul fast float %5, %15 + %add32 = fadd fast float %add26, %mul31 + %add35 = add nsw i64 %mul, 6 + %arrayidx36 = getelementptr inbounds float* %A, i64 %add35 + %16 = load float* %arrayidx36, align 4 + %mul37 = fmul fast float %6, %16 + %add38 = fadd fast float %add32, %mul37 + %add41 = add nsw i64 %mul, 7 + %arrayidx42 = getelementptr inbounds float* %A, i64 %add41 + %17 = load float* %arrayidx42, align 4 + %mul43 = fmul fast float %7, %17 + %add44 = fadd fast float %add38, %mul43 + %add47 = add nsw i64 %mul, 8 + %arrayidx48 = getelementptr inbounds float* %A, i64 %add47 + %18 = load float* %arrayidx48, align 4 + %mul49 = fmul fast float %8, %18 + %add50 = fadd fast float %add44, %mul49 + %add51 = fadd fast float %sum.082, %add50 + %inc = add nsw i64 %i.083, 1 + %exitcond = icmp eq i64 %inc, %9 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add51 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum += B[0]*A[i*4 ]; +; sum += B[1]*A[i*4+1]; +; sum += B[2]*A[i*4+2]; +; sum += B[3]*A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: chain_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp41 = icmp sgt i32 %n, 0 + br i1 %cmp41, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx10 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx10, align 4 + %arrayidx16 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx16, align 4 + %4 = sext 
i32 %n to i64 + br label %for.body + +for.body: + %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ] + %mul = shl nsw i64 %i.043, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %5 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %5 + %add = fadd fast float %sum.042, %mul3 + %add638 = or i64 %mul, 1 + %arrayidx7 = getelementptr inbounds float* %A, i64 %add638 + %6 = load float* %arrayidx7, align 4 + %mul8 = fmul fast float %1, %6 + %add9 = fadd fast float %add, %mul8 + %add1239 = or i64 %mul, 2 + %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239 + %7 = load float* %arrayidx13, align 4 + %mul14 = fmul fast float %2, %7 + %add15 = fadd fast float %add9, %mul14 + %add1840 = or i64 %mul, 3 + %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840 + %8 = load float* %arrayidx19, align 4 + %mul20 = fmul fast float %3, %8 + %add21 = fadd fast float %add15, %mul20 + %inc = add nsw i64 %i.043, 1 + %exitcond = icmp eq i64 %inc, %4 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add21 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, float * restrict C, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; C[i] = B[0] *A[i*4 ] + +; B[1] *A[i*4+1] + +; B[2] *A[i*4+2] + +; B[3] *A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: store_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) { +entry: + %cmp37 = icmp sgt i32 %n, 0 + br i1 %cmp37, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %0 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] + %1 = load float* %B, align 4 + %mul = shl nsw i64 %i.039, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %2 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %1, %2 + %3 = load float* %arrayidx4, align 4 + %add34 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add34 + %4 = load float* %arrayidx6, align 4 + %mul7 = fmul fast float %3, %4 + %add8 = fadd fast float %mul3, %mul7 + %5 = load float* %arrayidx9, align 4 + %add1135 = or i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135 + %6 = load float* %arrayidx12, align 4 + %mul13 = fmul fast float %5, %6 + %add14 = fadd fast float %add8, %mul13 + %7 = load float* %arrayidx15, align 4 + %add1736 = or i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736 + %8 = load float* %arrayidx18, align 4 + %mul19 = fmul fast float %7, %8 + %add20 = fadd fast float %add14, %mul19 + store float %add20, float* %C.addr.038, align 4 + %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1 + %inc = add nsw i64 %i.039, 1 + %exitcond = icmp eq i64 %inc, %0 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 0 +} + + +; void foo(double * restrict A, double * restrict B, double * restrict C, +; int n) { +; for (intptr_t i=0; i < 
n; ++i) { +; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1]; +; } +; } + +; CHECK-LABEL: store_red_double +; CHECK: fmul <2 x double> +; CHECK: extractelement <2 x double> +; CHECK: extractelement <2 x double> + +define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) { +entry: + %cmp17 = icmp sgt i32 %n, 0 + br i1 %cmp17, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load double* %B, align 8 + %arrayidx4 = getelementptr inbounds double* %B, i64 1 + %1 = load double* %arrayidx4, align 8 + %2 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %mul = shl nsw i64 %i.018, 2 + %arrayidx2 = getelementptr inbounds double* %A, i64 %mul + %3 = load double* %arrayidx2, align 8 + %mul3 = fmul fast double %0, %3 + %add16 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds double* %A, i64 %add16 + %4 = load double* %arrayidx6, align 8 + %mul7 = fmul fast double %1, %4 + %add8 = fadd fast double %mul3, %mul7 + %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018 + store double %add8, double* %arrayidx9, align 8 + %inc = add nsw i64 %i.018, 1 + %exitcond = icmp eq i64 %inc, %2 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} -- 2.34.1
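For orientation, the first CHECK block above (add_red) is looking for a loop
body of roughly this shape once the reduction fires (a hand-written sketch, not
actual -S output; value names are illustrative):

    %ptr   = bitcast float* %arrayidx to <4 x float>*
    %vload = load <4 x float>* %ptr, align 4
    %vmul  = fmul <4 x float> %vload, <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>

followed by the shufflevector/fadd/extractelement ladder sketched after
emitReduction above and a final scalar fadd into the %sum accumulator. The
CHECK lines only pin down the fmul <4 x float> and one shufflevector
<4 x float>, so they remain valid whichever reduction strategy the cost model
picks.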