LoopVectorizer: Preserve fast-math flags

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index c05288bd07030a9c90975ba36672accfa59a2b78..77633a562a6759549ea9c6fb77b40aa5ba0f7926 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -56,6 +56,7 @@
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/LoopIterator.h"
  #include "llvm/Analysis/LoopPass.h"
@@ -74,14 +75,15 @@
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
  #include "llvm/IR/Verifier.h"
  #include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/ValueHandle.h"
  #include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetLibraryInfo.h"
  #include "llvm/Transforms/Scalar.h"
@@ -139,11 +141,60 @@ static const unsigned RuntimeMemoryCheckThreshold = 8;
  /// Maximum simd width.
  static const unsigned MaxVectorWidth = 64;
  
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of vector registers."));
+
  /// Maximum vectorization unroll count.
  static const unsigned MaxUnrollFactor = 16;
  
-/// The cost of a loop that is considered 'small' by the unroller.
-static const unsigned SmallLoopCost = 20;
+static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
+    "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max unroll factor for scalar "
+             "loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
+    "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max unroll factor for "
+             "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+    "force-target-instruction-cost", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's expected cost for "
+             "an instruction to a single constant value. Mostly "
+             "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+    "small-loop-cost", cl::init(20), cl::Hidden,
+    cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    cl::desc("Enable the use of the block frequency analysis to access PGO "
+             "heuristics minimizing code growth in cold regions and being more "
+             "aggressive in hot regions."));
+
+// Runtime unroll loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
+    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
+    cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+    cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+    cl::desc("Count the induction variable only once when unrolling"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+    cl::desc("Enable if predication of stores during vectorization."));
  
  namespace {
  
@@ -168,7 +219,7 @@ class LoopVectorizationCostModel;
  class InnerLoopVectorizer {
  public:
    InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, DataLayout *DL,
+                      DominatorTree *DT, const DataLayout *DL,
                        const TargetLibraryInfo *TLI, unsigned VecWidth,
                        unsigned UnrollFactor)
        : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
@@ -248,8 +299,11 @@ protected:
    void updateAnalysis();
  
    /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  virtual void scalarizeInstruction(Instruction *Instr);
+  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// scalarized instruction behind an if block predicated on the control
+  /// dependence of the instruction.
+  virtual void scalarizeInstruction(Instruction *Instr,
+                                    bool IfPredicateStore=false);
  
    /// Vectorize Load and Store instructions,
    virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -325,7 +379,7 @@ protected:
    /// Dominator Tree.
    DominatorTree *DT;
    /// Data Layout.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Target Library Info.
    const TargetLibraryInfo *TLI;
  
@@ -352,7 +406,7 @@ protected:
    ///The ExitBlock of the scalar loop.
    BasicBlock *LoopExitBlock;
    ///The vector loop body.
-  BasicBlock *LoopVectorBody;
+  SmallVector<BasicBlock *, 4> LoopVectorBody;
    ///The scalar loop body.
    BasicBlock *LoopScalarBody;
    /// A list of all bypass blocks. The first block is the entry of the loop.
@@ -374,16 +428,17 @@ protected:
  class InnerLoopUnroller : public InnerLoopVectorizer {
  public:
    InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, DataLayout *DL,
+                    DominatorTree *DT, const DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
      InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
  
  private:
-  virtual void scalarizeInstruction(Instruction *Instr);
-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
-  virtual Value *getBroadcastInstrs(Value *V);
-  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
-  virtual Value *reverseVector(Value *Vec);
+  void scalarizeInstruction(Instruction *Instr,
+                            bool IfPredicateStore = false) override;
+  void vectorizeMemoryInstruction(Instruction *Instr) override;
+  Value *getBroadcastInstrs(Value *V) override;
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+  Value *reverseVector(Value *Vec) override;
  };
  
  /// \brief Look for a meaningful debug location on the instruction or it's
@@ -429,10 +484,14 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  /// induction variable and the different reduction variables.
  class LoopVectorizationLegality {
  public:
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+  unsigned NumLoads;
+  unsigned NumStores;
+  unsigned NumPredStores;
+
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                              DominatorTree *DT, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
-        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+        DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
          MaxSafeDepDistBytes(-1U) {}
  
    /// This enum represents the kinds of reductions that we support.
@@ -667,7 +726,7 @@ private:
    /// Scev analysis.
    ScalarEvolution *SE;
    /// DataLayout analysis.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Dominators.
    DominatorTree *DT;
    /// Target Library Info.
@@ -717,7 +776,7 @@ public:
    LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                               LoopVectorizationLegality *Legal,
                               const TargetTransformInfo &TTI,
-                             DataLayout *DL, const TargetLibraryInfo *TLI)
+                             const DataLayout *DL, const TargetLibraryInfo *TLI)
        : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
  
    /// Information about vectorization costs
@@ -790,7 +849,7 @@ private:
    /// Vector target information.
    const TargetTransformInfo &TTI;
    /// Target data layout information.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Target Library Info.
    const TargetLibraryInfo *TLI;
  };
@@ -930,39 +989,53 @@ private:
    }
  };
  
+static void addInnerLoop(Loop *L, SmallVectorImpl<Loop *> &V) {
+  if (L->empty())
+    return V.push_back(L);
+
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    addInnerLoop(*I, V);
+}
+
  /// The LoopVectorize Pass.
-struct LoopVectorize : public LoopPass {
+struct LoopVectorize : public FunctionPass {
    /// Pass identification, replacement for typeid
    static char ID;
  
    explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
-    : LoopPass(ID),
+    : FunctionPass(ID),
        DisableUnrolling(NoUnrolling),
        AlwaysVectorize(AlwaysVectorize) {
      initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
    }
  
    ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
    LoopInfo *LI;
    TargetTransformInfo *TTI;
    DominatorTree *DT;
+  BlockFrequencyInfo *BFI;
    TargetLibraryInfo *TLI;
    bool DisableUnrolling;
    bool AlwaysVectorize;
  
-  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-    // We only vectorize innermost loops.
-    if (!L->empty())
-      return false;
+  BlockFrequency ColdEntryFreq;
  
+  bool runOnFunction(Function &F) override {
      SE = &getAnalysis<ScalarEvolution>();
-    DL = getAnalysisIfAvailable<DataLayout>();
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    DL = DLP ? &DLP->getDataLayout() : 0;
      LI = &getAnalysis<LoopInfo>();
      TTI = &getAnalysis<TargetTransformInfo>();
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    BFI = &getAnalysis<BlockFrequencyInfo>();
      TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
  
+    // Compute some weights outside of the loop over the loops. Compute this
+    // using a BranchProbability to re-use its scaling math.
+    const BranchProbability ColdProb(1, 5); // 20%
+    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
+
      // If the target claims to have no vector registers don't attempt
      // vectorization.
      if (!TTI->getNumberOfRegisters(true))
@@ -973,6 +1046,32 @@ struct LoopVectorize : public LoopPass {
        return false;
      }
  
+    // Build up a worklist of inner-loops to vectorize. This is necessary as
+    // the act of vectorizing or partially unrolling a loop creates new loops
+    // and can invalidate iterators across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+      addInnerLoop(*I, Worklist);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    while (!Worklist.empty())
+      Changed |= processLoop(Worklist.pop_back_val());
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  bool processLoop(Loop *L) {
+    // We only handle inner loops, so if there are children just recurse.
+    if (!L->empty()) {
+      bool Changed = false;
+      for (Loop::iterator I = L->begin(), E = L->begin(); I != E; ++I)
+        Changed |= processLoop(*I);
+      return Changed;
+    }
+
      DEBUG(dbgs() << "LV: Checking a loop in \"" <<
            L->getHeader()->getParent()->getName() << "\"\n");
  
@@ -1006,14 +1105,25 @@ struct LoopVectorize : public LoopPass {
      // Check the function attributes to find out if this function should be
      // optimized for size.
      Function *F = L->getHeader()->getParent();
-    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
-    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
-    unsigned FnIndex = AttributeSet::FunctionIndex;
-    bool OptForSize = Hints.Force != 1 &&
-                      F->getAttributes().hasAttribute(FnIndex, SzAttr);
-    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
-
-    if (NoFloat) {
+    bool OptForSize =
+        Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
+
+    // Compute the weighted frequency of this loop being executed and see if it
+    // is less than 20% of the function entry baseline frequency. Note that we
+    // always have a canonical loop here because we think we *can* vectoriez.
+    // FIXME: This is hidden behind a flag due to pervasive problems with
+    // exactly what block frequency models.
+    if (LoopVectorizeWithBlockFrequency) {
+      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+      if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+        OptForSize = true;
+    }
+
+    // Check the function attributes to see if implicit floats are allowed.a
+    // FIXME: This check doesn't seem possibly correct -- what if the loop is
+    // an integer loop and the vector instructions selected are purely integer
+    // vector instructions?
+    if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
        DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
              "attribute is used.\n");
        return false;
@@ -1051,10 +1161,10 @@ struct LoopVectorize : public LoopPass {
      return true;
    }
  
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    LoopPass::getAnalysisUsage(AU);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequiredID(LoopSimplifyID);
      AU.addRequiredID(LCSSAID);
+    AU.addRequired<BlockFrequencyInfo>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<LoopInfo>();
      AU.addRequired<ScalarEvolution>();
@@ -1133,7 +1243,9 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
  Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
    // We need to place the broadcast of invariant variables outside the loop.
    Instruction *Instr = dyn_cast<Instruction>(V);
-  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool NewInstr =
+      (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+                          Instr->getParent()) != LoopVectorBody.end());
    bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
  
    // Place the code for broadcasting invariant variables in the new preheader.
@@ -1173,7 +1285,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
  /// \brief Find the operand of the GEP that should be checked for consecutive
  /// stores. This ignores trailing indices that have no effect on the final
  /// pointer.
-static unsigned getGEPInductionOperand(DataLayout *DL,
+static unsigned getGEPInductionOperand(const DataLayout *DL,
                                         const GetElementPtrInst *Gep) {
    unsigned LastOperand = Gep->getNumOperands() - 1;
    unsigned GEPAllocSize = DL->getTypeAllocSize(
@@ -1338,6 +1450,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
    unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
  
+  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+    return scalarizeInstruction(Instr, true);
+
    if (ScalarAllocatedSize != VectorElementSize)
      return scalarizeInstruction(Instr);
  
@@ -1456,7 +1571,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
    }
  }
  
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
    // Holds vector parameters or scalars, in case of uniform vals.
    SmallVector<VectorParts, 4> Params;
@@ -1501,10 +1616,38 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
    // Create a new entry in the WidenMap and initialize it to Undef or Null.
    VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
  
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
    // For each vector unroll 'part':
    for (unsigned Part = 0; Part < UF; ++Part) {
      // For each scalar that we create:
      for (unsigned Width = 0; Width < VF; ++Width) {
+
+      // Start if-block.
+      Value *Cmp = 0;
+      if (IfPredicateStore) {
+        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+        LoopVectorBody.push_back(CondBlock);
+        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+        // Update Builder with newly created basic block.
+        Builder.SetInsertPoint(InsertPt);
+      }
+
        Instruction *Cloned = Instr->clone();
        if (!IsVoidRetTy)
          Cloned->setName(Instr->getName() + ".cloned");
@@ -1525,6 +1668,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
        if (!IsVoidRetTy)
          VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                         Builder.getInt32(Width));
+      // End if-block.
+      if (IfPredicateStore) {
+         BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+         LoopVectorBody.push_back(NewIfBlock);
+         VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+         Builder.SetInsertPoint(InsertPt);
+         Instruction *OldBr = IfBlock->getTerminator();
+         BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+         OldBr->eraseFromParent();
+         IfBlock = NewIfBlock;
+      }
      }
    }
  }
@@ -1824,7 +1978,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // sequence of instructions that form a check.
    Instruction *StrideCheck;
    Instruction *FirstCheckInst;
-  tie(FirstCheckInst, StrideCheck) =
+  std::tie(FirstCheckInst, StrideCheck) =
        addStrideCheck(BypassBlock->getTerminator());
    if (StrideCheck) {
      // Create a new block containing the stride check.
@@ -1848,7 +2002,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // checks into a separate block to make the more common case of few elements
    // faster.
    Instruction *MemRuntimeCheck;
-  tie(FirstCheckInst, MemRuntimeCheck) =
+  std::tie(FirstCheckInst, MemRuntimeCheck) =
        addRuntimeCheck(LastBypassBlock->getTerminator());
    if (MemRuntimeCheck) {
      // Create a new block containing the memory check.
@@ -2028,7 +2182,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    LoopScalarPreHeader = ScalarPH;
    LoopMiddleBlock = MiddleBlock;
    LoopExitBlock = ExitBlock;
-  LoopVectorBody = VecBody;
+  LoopVectorBody.push_back(VecBody);
    LoopScalarBody = OldBasicBlock;
  
    LoopVectorizeHints Hints(Lp, true);
@@ -2296,26 +2450,53 @@ struct CSEDenseMapInfo {
  };
  }
  
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...;  " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+  return BlockNum % 2;
+}
+
  ///\brief Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
    // Perform simple cse.
    SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
-    Instruction *In = I++;
+  for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+    BasicBlock *BB = BBs[i];
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+      Instruction *In = I++;
  
-    if (!CSEDenseMapInfo::canHandle(In))
-      continue;
+      if (!CSEDenseMapInfo::canHandle(In))
+        continue;
  
-    // Check if we can replace this instruction with any of the
-    // visited instructions.
-    if (Instruction *V = CSEMap.lookup(In)) {
-      In->replaceAllUsesWith(V);
-      In->eraseFromParent();
-      continue;
+      // Check if we can replace this instruction with any of the
+      // visited instructions.
+      if (Instruction *V = CSEMap.lookup(In)) {
+        In->replaceAllUsesWith(V);
+        In->eraseFromParent();
+        continue;
+      }
+      // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+      // ...;" blocks for predicated stores. Every second block is a predicated
+      // block.
+      if (isPredicatedBlock(i))
+        continue;
+
+      CSEMap[In] = In;
      }
+  }
+}
  
-    CSEMap[In] = In;
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)){
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
    }
+  return V;
  }
  
  void InnerLoopVectorizer::vectorizeLoop() {
@@ -2371,7 +2552,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
      setDebugLocFromInst(Builder, RdxDesc.StartValue);
  
      // We need to generate a reduction vector from the incoming scalar.
-    // To do so, we need to generate the 'identity' vector and overide
+    // To do so, we need to generate the 'identity' vector and override
      // one of the elements with the incoming scalar reduction. We need
      // to do it in the vector-loop preheader.
      Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
@@ -2430,7 +2611,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
        // first unroll part.
        Value *StartVal = (part == 0) ? VectorStart : Identity;
        cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+                                                  LoopVectorBody.back());
      }
  
      // Before each round, move the insertion point right between
@@ -2449,7 +2631,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
        Value *StartVal = (part == 0) ? VectorStart : Identity;
        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
          NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      NewPhi->addIncoming(RdxExitVal[part],
+                          LoopVectorBody.back());
        RdxParts.push_back(NewPhi);
      }
  
@@ -2459,9 +2642,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
      setDebugLocFromInst(Builder, ReducedPartRdx);
      for (unsigned part = 1; part < UF; ++part) {
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                             RdxParts[part], ReducedPartRdx,
-                                             "bin.rdx");
+        // Floating point operations had to be 'fast' to enable the reduction.
+        ReducedPartRdx = addFastMathFlag(
+            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+                                ReducedPartRdx, "bin.rdx"));
        else
          ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                          ReducedPartRdx, RdxParts[part]);
@@ -2491,8 +2675,9 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                      "rdx.shuf");
  
          if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
-                                       "bin.rdx");
+          // Floating point operations had to be 'fast' to enable the reduction.
+          TmpVec = addFastMathFlag(Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
          else
            TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
        }
@@ -2622,7 +2807,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
        Type *VecTy = (VF == 1) ? PN->getType() :
        VectorType::get(PN->getType(), VF);
        Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
-                                    LoopVectorBody-> getFirstInsertionPt());
+                                    LoopVectorBody.back()-> getFirstInsertionPt());
      }
      PV->push_back(P);
      return;
@@ -2826,6 +3011,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
          if (VecOp && isa<PossiblyExactOperator>(VecOp))
            VecOp->setIsExact(BinOp->isExact());
  
+        // Copy the fast-math flags.
+        if (VecOp && isa<FPMathOperator>(V))
+          VecOp->setFastMathFlags(it->getFastMathFlags());
+
          Entry[Part] = V;
        }
        break;
@@ -2971,7 +3160,19 @@ void InnerLoopVectorizer::updateAnalysis() {
    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
      DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
    DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
-  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+
+  // Due to if predication of stores we might create a sequence of "if(pred)
+  // a[i] = ...;  " blocks.
+  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
+    if (i == 0)
+      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+    else if (isPredicatedBlock(i)) {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
+    } else {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
+    }
+  }
+
    DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
    DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
    DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
@@ -3115,7 +3316,7 @@ bool LoopVectorizationLegality::canVectorize() {
    return true;
  }
  
-static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
    if (Ty->isPointerTy())
      return DL.getIntPtrType(Ty);
  
@@ -3127,7 +3328,7 @@ static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
    return Ty;
  }
  
-static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
    Ty0 = convertPointerToIntegerType(DL, Ty0);
    Ty1 = convertPointerToIntegerType(DL, Ty1);
    if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
@@ -3325,7 +3526,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
  ///\brief Remove GEPs whose indices but the last one are loop invariant and
  /// return the induction operand of the gep pointer.
  static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
-                                 DataLayout *DL, Loop *Lp) {
+                                 const DataLayout *DL, Loop *Lp) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (!GEP)
      return Ptr;
@@ -3361,7 +3562,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
  /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
  /// pointer to the Value, or null otherwise.
  static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
-                                   DataLayout *DL, Loop *Lp) {
+                                   const DataLayout *DL, Loop *Lp) {
    const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
    if (!PtrTy || PtrTy->isAggregateType())
      return 0;
@@ -3496,7 +3697,7 @@ public:
    /// \brief Set of potential dependent memory accesses.
    typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
  
-  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+  AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
      DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
      AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
  
@@ -3562,7 +3763,7 @@ private:
    /// Set of underlying objects already written to.
    SmallPtrSet<Value*, 16> WriteObjects;
  
-  DataLayout *DL;
+  const DataLayout *DL;
  
    /// Sets of potentially dependent accesses - members of one set share an
    /// underlying pointer. The set "CheckDeps" identfies which sets really need a
@@ -3589,7 +3790,7 @@ static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
  
  /// \brief Check the stride of the pointer and ensure that it does not wrap in
  /// the address space.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                          const Loop *Lp, ValueToValueMap &StridesMap);
  
  bool AccessAnalysis::canCheckPtrAtRT(
@@ -3713,8 +3914,8 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
      }
  
      bool NeedDepCheck = false;
-    // Check whether there is the possiblity of dependency because of underlying
-    // objects being the same.
+    // Check whether there is the possibility of dependency because of
+    // underlying objects being the same.
      typedef SmallVector<Value*, 16> ValueVector;
      ValueVector TempObjects;
      GetUnderlyingObjects(Ptr, TempObjects, DL);
@@ -3809,7 +4010,7 @@ public:
    typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
    typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
  
-  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
        : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
          ShouldRetryWithRuntimeCheck(false) {}
  
@@ -3847,7 +4048,7 @@ public:
  
  private:
    ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
    const Loop *InnermostLoop;
  
    /// \brief Maps access locations (ptr, read/write) to program order.
@@ -3896,7 +4097,7 @@ static bool isInBoundsGep(Value *Ptr) {
  }
  
  /// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                          const Loop *Lp, ValueToValueMap &StridesMap) {
    const Type *Ty = Ptr->getType();
    assert(Ty->isPointerTy() && "Unexpected non-ptr");
@@ -4155,7 +4356,7 @@ bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
      // Check every access pair.
      while (AI != AE) {
        CheckDeps.erase(*AI);
-      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
        while (OI != AE) {
          // Check every accessing instruction pair in program order.
          for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
@@ -4219,6 +4420,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
            DEBUG(dbgs() << "LV: Found a non-simple load.\n");
            return false;
          }
+        NumLoads++;
          Loads.push_back(Ld);
          DepChecker.addAccess(Ld);
          continue;
@@ -4232,6 +4434,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
            DEBUG(dbgs() << "LV: Found a non-simple store.\n");
            return false;
          }
+        NumStores++;
          Stores.push_back(St);
          DepChecker.addAccess(St);
        }
@@ -4535,13 +4738,22 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
          continue;
        }
  
-      // Process instructions only once (termination).
+      // Process instructions only once (termination). Each reduction cycle
+      // value must only be used once, except by phi nodes and min/max
+      // reductions which are represented as a cmp followed by a select.
+      ReductionInstDesc IgnoredVal(false, 0);
        if (VisitedInsts.insert(Usr)) {
          if (isa<PHINode>(Usr))
            PHIs.push_back(Usr);
          else
            NonPHIs.push_back(Usr);
-      }
+      } else if (!isa<PHINode>(Usr) &&
+                 ((!isa<FCmpInst>(Usr) &&
+                   !isa<ICmpInst>(Usr) &&
+                   !isa<SelectInst>(Usr)) ||
+                  !isMinMaxSelectCmpPattern(Usr, IgnoredVal).IsReduction))
+        return false;
+
        // Remember that we completed the cycle.
        if (Usr == Phi)
          FoundStartPHI = true;
@@ -4734,7 +4946,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
      }
  
      // We don't predicate stores at the moment.
-    if (it->mayWriteToMemory() || it->mayThrow())
+    if (it->mayWriteToMemory()) {
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      // We only support predication of stores in basic blocks with one
+      // predecessor.
+      if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+          !SafePtrs.count(SI->getPointerOperand()) ||
+          !SI->getParent()->getSinglePredecessor())
+        return false;
+    }
+    if (it->mayThrow())
        return false;
  
      // Check that we don't have a constant expression that can trap as operand.
@@ -4769,6 +4990,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
      return Factor;
    }
  
+  if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    return Factor;
+  }
+
    // Find the trip count.
    unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
@@ -4924,9 +5150,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    if (TC > 1 && TC < TinyTripCountUnrollThreshold)
      return 1;
  
-  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
-  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
-        " vector registers\n");
+  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
+        " registers\n");
+
+  if (VF == 1) {
+    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+      TargetNumRegisters = ForceTargetNumScalarRegs;
+  } else {
+    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+      TargetNumRegisters = ForceTargetNumVectorRegs;
+  }
  
    LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
    // We divide by these constants so assume that we have at least one
@@ -4939,12 +5173,29 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    // registers. These registers are used by all of the unrolled instances.
    // Next, divide the remaining registers by the number of registers that is
    // required by the loop, in order to estimate how many parallel instances
-  // fit without causing spills.
-  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+  // fit without causing spills. All of this is rounded down if necessary to be
+  // a power of two. We want power of two unroll factors to simplify any
+  // addressing operations or alignment considerations.
+  unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+                              R.MaxLocalUsers);
+
+  // Don't count the induction variable as unrolled.
+  if (EnableIndVarRegisterHeur)
+    UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+                       std::max(1U, (R.MaxLocalUsers - 1)));
  
    // Clamp the unroll factor ranges to reasonable factors.
    unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
  
+  // Check if the user has overridden the unroll max.
+  if (VF == 1) {
+    if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0)
+      MaxUnrollSize = ForceTargetMaxScalarUnrollFactor;
+  } else {
+    if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0)
+      MaxUnrollSize = ForceTargetMaxVectorUnrollFactor;
+  }
+
    // If we did not calculate the cost for VF (because the user selected the VF)
    // then we calculate the cost of VF here.
    if (LoopCost == 0)
@@ -4957,32 +5208,40 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    else if (UF < 1)
      UF = 1;
  
-  bool HasReductions = Legal->getReductionVars()->size();
-
-  // Decide if we want to unroll if we decided that it is legal to vectorize
-  // but not profitable.
-  if (VF == 1) {
-    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
-        LoopCost > SmallLoopCost)
-      return 1;
-
-    return UF;
-  }
-
-  if (HasReductions) {
+  // Unroll if we vectorized this loop and there is a reduction that could
+  // benefit from unrolling.
+  if (VF > 1 && Legal->getReductionVars()->size()) {
      DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
      return UF;
    }
  
-  // We want to unroll tiny loops in order to reduce the loop overhead.
-  // We assume that the cost overhead is 1 and we use the cost model
-  // to estimate the cost of the loop and unroll until the cost of the
-  // loop overhead is about 5% of the cost of the loop.
+  // Note that if we've already vectorized the loop we will have done the
+  // runtime check and so unrolling won't require further checks.
+  bool UnrollingRequiresRuntimePointerCheck =
+      (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+
+  // We want to unroll small loops in order to reduce the loop overhead and
+  // potentially expose ILP opportunities.
    DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
-  if (LoopCost < SmallLoopCost) {
+  if (!UnrollingRequiresRuntimePointerCheck &&
+      LoopCost < SmallLoopCost) {
+    // We assume that the cost overhead is 1 and we use the cost model
+    // to estimate the cost of the loop and unroll until the cost of the
+    // loop overhead is about 5% of the cost of the loop.
+    unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+    // Unroll until store/load ports (estimated by max unroll factor) are
+    // saturated.
+    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
+    unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
+
+    if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
+      DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
+      return std::max(StoresUF, LoadsUF);
+    }
+
      DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
-    return std::min(NewUF, UF);
+    return SmallUF;
    }
  
    DEBUG(dbgs() << "LV: Not Unrolling.\n");
@@ -5118,6 +5377,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
          continue;
  
        unsigned C = getInstructionCost(it, VF);
+
+      // Check if we should override the cost.
+      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = ForceTargetInstructionCost;
+
        BlockCost += C;
        DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
              VF << " For instruction: " << *it << '\n');
@@ -5245,9 +5509,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
        TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
+    Value *Op2 = I->getOperand(1);
  
-    if (isa<ConstantInt>(I->getOperand(1)))
+    // Check for a splat of a constant or for a non uniform vector of constants.
+    if (isa<ConstantInt>(Op2))
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      if (cast<Constant>(Op2)->getSplatValue() != NULL)
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    }
  
      return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
    }
@@ -5391,6 +5662,7 @@ char LoopVectorize::ID = 0;
  static const char lv_name[] = "Loop Vectorization";
  INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
  INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
  INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -5417,7 +5689,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  }
  
  
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+                                             bool IfPredicateStore) {
    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
    // Holds vector parameters or scalars, in case of uniform vals.
    SmallVector<VectorParts, 4> Params;
@@ -5462,10 +5735,40 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
    // Create a new entry in the WidenMap and initialize it to Undef or Null.
    VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
  
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
    // For each vector unroll 'part':
    for (unsigned Part = 0; Part < UF; ++Part) {
      // For each scalar that we create:
  
+    // Start an "if (pred) a[i] = ..." block.
+    Value *Cmp = 0;
+    if (IfPredicateStore) {
+      if (Cond[Part]->getType()->isVectorTy())
+        Cond[Part] =
+            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+                               ConstantInt::get(Cond[Part]->getType(), 1));
+      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+      LoopVectorBody.push_back(CondBlock);
+      VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+      // Update Builder with newly created basic block.
+      Builder.SetInsertPoint(InsertPt);
+    }
+
      Instruction *Cloned = Instr->clone();
        if (!IsVoidRetTy)
          Cloned->setName(Instr->getName() + ".cloned");
@@ -5482,11 +5785,26 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
        // so that future users will be able to use it.
        if (!IsVoidRetTy)
          VecResults[Part] = Cloned;
+
+    // End if-block.
+      if (IfPredicateStore) {
+        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+        LoopVectorBody.push_back(NewIfBlock);
+        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+        Builder.SetInsertPoint(InsertPt);
+        Instruction *OldBr = IfBlock->getTerminator();
+        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+        OldBr->eraseFromParent();
+        IfBlock = NewIfBlock;
+      }
    }
  }
  
  void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
-  return scalarizeInstruction(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+  return scalarizeInstruction(Instr, IfPredicateStore);
  }
  
  Value *InnerLoopUnroller::reverseVector(Value *Vec) {