[C++] Use 'nullptr'.

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 78674952847bc33205aaa63aad51c50e21d56642..843e9e90c2e8d69b0493c3c8ee87aa8cb48cf49a 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -42,9 +42,6 @@
  //
  //===----------------------------------------------------------------------===//
  
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
  #include "llvm/Transforms/Vectorize.h"
  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/EquivalenceClasses.h"
@@ -54,6 +51,7 @@
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -67,6 +65,7 @@
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/IR/Function.h"
@@ -75,26 +74,34 @@
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
  #include "llvm/IR/Verifier.h"
  #include "llvm/Pass.h"
  #include "llvm/Support/BranchProbability.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetLibraryInfo.h"
  #include "llvm/Transforms/Scalar.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/VectorUtils.h"
  #include <algorithm>
  #include <map>
  
  using namespace llvm;
  using namespace llvm::PatternMatch;
  
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
  static cl::opt<unsigned>
  VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                      cl::desc("Sets the SIMD width. Zero is autoselect."));
@@ -172,16 +179,26 @@ static cl::opt<unsigned> SmallLoopCost(
      "small-loop-cost", cl::init(20), cl::Hidden,
      cl::desc("The cost of a loop that is considered 'small' by the unroller."));
  
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    cl::desc("Enable the use of the block frequency analysis to access PGO "
+             "heuristics minimizing code growth in cold regions and being more "
+             "aggressive in hot regions."));
+
  // Runtime unroll loops for load/store throughput.
  static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
-    "enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden,
+    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
      cl::desc("Enable runtime unrolling until load/store ports are saturated"));
  
  /// The number of stores in a loop that are allowed to need predication.
  static cl::opt<unsigned> NumberOfStoresToPredicate(
-    "vectorize-num-stores-pred", cl::init(0), cl::Hidden,
+    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
      cl::desc("Max number of stores to be predicated behind an if."));
  
+static cl::opt<bool> EnableIndVarRegisterHeur(
+    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+    cl::desc("Count the induction variable only once when unrolling"));
+
  static cl::opt<bool> EnableCondStoresVectorization(
      "enable-cond-stores-vec", cl::init(false), cl::Hidden,
      cl::desc("Enable if predication of stores during vectorization."));
@@ -209,12 +226,13 @@ class LoopVectorizationCostModel;
  class InnerLoopVectorizer {
  public:
    InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, DataLayout *DL,
+                      DominatorTree *DT, const DataLayout *DL,
                        const TargetLibraryInfo *TLI, unsigned VecWidth,
                        unsigned UnrollFactor)
        : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
-        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
-        OldInduction(0), WidenMap(UnrollFactor), Legal(0) {}
+        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
+        Legal(nullptr) {}
  
    // Perform the actual loop widening (vectorization).
    void vectorize(LoopVectorizationLegality *L) {
@@ -369,7 +387,7 @@ protected:
    /// Dominator Tree.
    DominatorTree *DT;
    /// Data Layout.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Target Library Info.
    const TargetLibraryInfo *TLI;
  
@@ -418,16 +436,17 @@ protected:
  class InnerLoopUnroller : public InnerLoopVectorizer {
  public:
    InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, DataLayout *DL,
+                    DominatorTree *DT, const DataLayout *DL,
                      const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
      InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
  
  private:
-  virtual void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false);
-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
-  virtual Value *getBroadcastInstrs(Value *V);
-  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
-  virtual Value *reverseVector(Value *Vec);
+  void scalarizeInstruction(Instruction *Instr,
+                            bool IfPredicateStore = false) override;
+  void vectorizeMemoryInstruction(Instruction *Instr) override;
+  Value *getBroadcastInstrs(Value *V) override;
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+  Value *reverseVector(Value *Vec) override;
  };
  
  /// \brief Look for a meaningful debug location on the instruction or its
@@ -458,6 +477,28 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
      B.SetCurrentDebugLocation(DebugLoc());
  }
  
+#ifndef NDEBUG
+/// Describes where \p I comes from, for use in debug output.
+///
+/// \return a format object that prints as "directory/file:line" when \p I has
+/// "dbg" metadata attached, as the identifier of the enclosing module when it
+/// does not, and as an empty string when \p I is null.
+static format_object3<const char *, const char *, unsigned>
+getDebugLocString(const Instruction *I) {
+  typedef const char *CStr;
+
+  // No instruction at all: nothing to print.
+  if (!I)
+    return format<CStr, CStr, unsigned>("", "", "", 0U);
+
+  if (MDNode *DbgNode = I->getMetadata("dbg")) {
+    const DILocation Loc(DbgNode);
+    const unsigned Line = Loc.getLineNumber();
+    const char *Dir = Loc.getDirectory().data();
+    const char *File = Loc.getFilename().data();
+    return format("%s/%s:%u", Dir, File, Line);
+  }
+
+  // Without debug metadata, fall back to the module that contains I.
+  const Module *M = I->getParent()->getParent()->getParent();
+  const StringRef ModuleName = M->getModuleIdentifier();
+  return format<CStr, CStr, unsigned>("%s", ModuleName.data(), "", 0U);
+}
+#endif
+
  /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
  /// to what vectorization factor.
  /// This class does not look at the profitability of vectorization, only the
@@ -477,11 +518,11 @@ public:
    unsigned NumStores;
    unsigned NumPredStores;
  
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                              DominatorTree *DT, TargetLibraryInfo *TLI)
        : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
-        MaxSafeDepDistBytes(-1U) {}
+        DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
+        HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
  
    /// This enum represents the kinds of reductions that we support.
    enum ReductionKind {
@@ -519,7 +560,7 @@ public:
  
    /// This struct holds information about reduction variables.
    struct ReductionDescriptor {
-    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+    ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
        Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
  
      ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
@@ -591,7 +632,7 @@ public:
    /// A struct for saving information about induction variables.
    struct InductionInfo {
      InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
-    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+    InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
      /// Start value.
      TrackingVH<Value> StartValue;
      /// Induction kind.
@@ -715,7 +756,7 @@ private:
    /// Scev analysis.
    ScalarEvolution *SE;
    /// DataLayout analysis.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Dominators.
    DominatorTree *DT;
    /// Target Library Info.
@@ -765,7 +806,7 @@ public:
    LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                               LoopVectorizationLegality *Legal,
                               const TargetTransformInfo &TTI,
-                             DataLayout *DL, const TargetLibraryInfo *TLI)
+                             const DataLayout *DL, const TargetLibraryInfo *TLI)
        : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
  
    /// Information about vectorization costs
@@ -838,7 +879,7 @@ private:
    /// Vector target information.
    const TargetTransformInfo &TTI;
    /// Target data layout information.
-  DataLayout *DL;
+  const DataLayout *DL;
    /// Target Library Info.
    const TargetLibraryInfo *TLI;
  };
@@ -920,7 +961,7 @@ private:
      assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
  
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
-      const MDString *S = 0;
+      const MDString *S = nullptr;
        SmallVector<Value*, 4> Args;
  
        // The expected hint is either a MDString or a MDNode with the first
@@ -978,12 +1019,12 @@ private:
    }
  };
  
-static void addInnerLoop(Loop *L, SmallVectorImpl<Loop *> &V) {
-  if (L->empty())
-    return V.push_back(L);
+static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+  if (L.empty())
+    return V.push_back(&L);
  
-  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
-    addInnerLoop(*I, V);
+  for (Loop *InnerL : L)
+    addInnerLoop(*InnerL, V);
  }
  
  /// The LoopVectorize Pass.
@@ -999,7 +1040,7 @@ struct LoopVectorize : public FunctionPass {
    }
  
    ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
    LoopInfo *LI;
    TargetTransformInfo *TTI;
    DominatorTree *DT;
@@ -1010,9 +1051,10 @@ struct LoopVectorize : public FunctionPass {
  
    BlockFrequency ColdEntryFreq;
  
-  virtual bool runOnFunction(Function &F) {
+  bool runOnFunction(Function &F) override {
      SE = &getAnalysis<ScalarEvolution>();
-    DL = getAnalysisIfAvailable<DataLayout>();
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    DL = DLP ? &DLP->getDataLayout() : nullptr;
      LI = &getAnalysis<LoopInfo>();
      TTI = &getAnalysis<TargetTransformInfo>();
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1029,8 +1071,9 @@ struct LoopVectorize : public FunctionPass {
      if (!TTI->getNumberOfRegisters(true))
        return false;
  
-    if (DL == NULL) {
-      DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n");
+    if (!DL) {
+      DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
+                   << ": Missing data layout\n");
        return false;
      }
  
@@ -1039,8 +1082,10 @@ struct LoopVectorize : public FunctionPass {
      // and can invalidate iterators across the loops.
      SmallVector<Loop *, 8> Worklist;
  
-    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
-      addInnerLoop(*I, Worklist);
+    for (Loop *L : *LI)
+      addInnerLoop(*L, Worklist);
+
+    LoopsAnalyzed += Worklist.size();
  
      // Now walk the identified inner loops.
      bool Changed = false;
@@ -1052,19 +1097,21 @@ struct LoopVectorize : public FunctionPass {
    }
  
    bool processLoop(Loop *L) {
-    // We only handle inner loops, so if there are children just recurse.
-    if (!L->empty()) {
-      bool Changed = false;
-      for (Loop::iterator I = L->begin(), E = L->begin(); I != E; ++I)
-        Changed |= processLoop(*I);
-      return Changed;
-    }
-
-    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
-          L->getHeader()->getParent()->getName() << "\"\n");
+    assert(L->empty() && "Only process inner loops.");
+    DEBUG(dbgs() << "\nLV: Checking a loop in \""
+                 << L->getHeader()->getParent()->getName() << "\" from "
+                 << getDebugLocString(L->getHeader()->getFirstNonPHIOrDbg())
+                 << "\n");
  
      LoopVectorizeHints Hints(L, DisableUnrolling);
  
+    DEBUG(dbgs() << "LV: Loop hints:"
+                 << " force=" << (Hints.Force == 0
+                                      ? "disabled"
+                                      : (Hints.Force == 1 ? "enabled" : "?"))
+                 << " width=" << Hints.Width << " unroll=" << Hints.Unroll
+                 << "\n");
+
      if (Hints.Force == 0) {
        DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
        return false;
@@ -1099,9 +1146,13 @@ struct LoopVectorize : public FunctionPass {
      // Compute the weighted frequency of this loop being executed and see if it
      // is less than 20% of the function entry baseline frequency. Note that we
      // always have a canonical loop here because we think we *can* vectorize.
-    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
-    if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
-      OptForSize = true;
+    // FIXME: This is hidden behind a flag due to pervasive problems with
+    // exactly what block frequency models.
+    if (LoopVectorizeWithBlockFrequency) {
+      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+      if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+        OptForSize = true;
+    }
  
      // Check the function attributes to see if implicit floats are allowed.
      // FIXME: This check doesn't seem possibly correct -- what if the loop is
@@ -1114,14 +1165,16 @@ struct LoopVectorize : public FunctionPass {
      }
  
      // Select the optimal vectorization factor.
-    LoopVectorizationCostModel::VectorizationFactor VF;
-    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
+    const LoopVectorizationCostModel::VectorizationFactor VF =
+                          CM.selectVectorizationFactor(OptForSize, Hints.Width);
      // Select the unroll factor.
-    unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
+    const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                          VF.Cost);
  
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
-          F->getParent()->getModuleIdentifier() << '\n');
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("
+                 << VF.Width << ") in "
+                 << getDebugLocString(L->getHeader()->getFirstNonPHIOrDbg())
+                 << '\n');
      DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
  
      if (VF.Width == 1) {
@@ -1136,6 +1189,7 @@ struct LoopVectorize : public FunctionPass {
        // If we decided that it is *legal* to vectorize the loop then do it.
        InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
        LB.vectorize(&LVL);
+      ++LoopsVectorized;
      }
  
      // Mark the loop as already vectorized to avoid vectorizing again.
@@ -1145,7 +1199,7 @@ struct LoopVectorize : public FunctionPass {
      return true;
    }
  
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequiredID(LoopSimplifyID);
      AU.addRequiredID(LCSSAID);
      AU.addRequired<BlockFrequencyInfo>();
@@ -1179,7 +1233,7 @@ static Value *stripIntegerCast(Value *V) {
  /// \p Ptr.
  static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
                                               ValueToValueMap &PtrToStride,
-                                             Value *Ptr, Value *OrigPtr = 0) {
+                                             Value *Ptr, Value *OrigPtr = nullptr) {
  
    const SCEV *OrigSCEV = SE->getSCEV(Ptr);
  
@@ -1269,7 +1323,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
  /// \brief Find the operand of the GEP that should be checked for consecutive
  /// stores. This ignores trailing indices that have no effect on the final
  /// pointer.
-static unsigned getGEPInductionOperand(DataLayout *DL,
+static unsigned getGEPInductionOperand(const DataLayout *DL,
                                         const GetElementPtrInst *Gep) {
    unsigned LastOperand = Gep->getNumOperands() - 1;
    unsigned GEPAllocSize = DL->getTypeAllocSize(
@@ -1346,7 +1400,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  
    // We can emit wide load/stores only if the last non-zero index is the
    // induction variable.
-  const SCEV *Last = 0;
+  const SCEV *Last = nullptr;
    if (!Strides.count(Gep))
      Last = SE->getSCEV(Gep->getOperand(InductionOperand));
    else {
@@ -1595,17 +1649,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
    // Does this instruction return a value ?
    bool IsVoidRetTy = Instr->getType()->isVoidTy();
  
-  Value *UndefVec = IsVoidRetTy ? 0 :
+  Value *UndefVec = IsVoidRetTy ? nullptr :
      UndefValue::get(VectorType::get(Instr->getType(), VF));
    // Create a new entry in the WidenMap and initialize it to Undef or Null.
    VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
  
    Instruction *InsertPt = Builder.GetInsertPoint();
    BasicBlock *IfBlock = Builder.GetInsertBlock();
-  BasicBlock *CondBlock = 0;
+  BasicBlock *CondBlock = nullptr;
  
    VectorParts Cond;
-  Loop *VectorLp = 0;
+  Loop *VectorLp = nullptr;
    if (IfPredicateStore) {
      assert(Instr->getParent()->getSinglePredecessor() &&
             "Only support single predecessor blocks");
@@ -1621,11 +1675,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
      for (unsigned Width = 0; Width < VF; ++Width) {
  
        // Start if-block.
-      Value *Cmp = 0;
+      Value *Cmp = nullptr;
        if (IfPredicateStore) {
          Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
          Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
          CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+        LoopVectorBody.push_back(CondBlock);
          VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
          // Update Builder with newly created basic block.
          Builder.SetInsertPoint(InsertPt);
@@ -1654,6 +1709,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
        // End if-block.
        if (IfPredicateStore) {
           BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+         LoopVectorBody.push_back(NewIfBlock);
           VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
           Builder.SetInsertPoint(InsertPt);
           Instruction *OldBr = IfBlock->getTerminator();
@@ -1670,21 +1726,21 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
    if (FirstInst)
      return FirstInst;
    if (Instruction *I = dyn_cast<Instruction>(V))
-    return I->getParent() == Loc->getParent() ? I : 0;
-  return 0;
+    return I->getParent() == Loc->getParent() ? I : nullptr;
+  return nullptr;
  }
  
  std::pair<Instruction *, Instruction *>
  InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
-  Instruction *tnullptr = 0;
+  Instruction *tnullptr = nullptr;
    if (!Legal->mustCheckStrides())
      return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
  
    IRBuilder<> ChkBuilder(Loc);
  
    // Emit checks.
-  Value *Check = 0;
-  Instruction *FirstInst = 0;
+  Value *Check = nullptr;
+  Instruction *FirstInst = nullptr;
    for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
                                           SE = Legal->strides_end();
         SI != SE; ++SI) {
@@ -1716,7 +1772,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
    LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();
  
-  Instruction *tnullptr = 0;
+  Instruction *tnullptr = nullptr;
    if (!PtrRtCheck->Need)
      return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
  
@@ -1726,7 +1782,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
  
    LLVMContext &Ctx = Loc->getContext();
    SCEVExpander Exp(*SE, "induction");
-  Instruction *FirstInst = 0;
+  Instruction *FirstInst = nullptr;
  
    for (unsigned i = 0; i < NumPointers; ++i) {
      Value *Ptr = PtrRtCheck->Pointers[i];
@@ -1753,7 +1809,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
  
    IRBuilder<> ChkBuilder(Loc);
    // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = 0;
+  Value *MemoryRuntimeCheck = nullptr;
    for (unsigned i = 0; i < NumPointers; ++i) {
      for (unsigned j = i+1; j < NumPointers; ++j) {
        // No need to check if two readonly pointers intersect.
@@ -1960,7 +2016,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // sequence of instructions that form a check.
    Instruction *StrideCheck;
    Instruction *FirstCheckInst;
-  tie(FirstCheckInst, StrideCheck) =
+  std::tie(FirstCheckInst, StrideCheck) =
        addStrideCheck(BypassBlock->getTerminator());
    if (StrideCheck) {
      // Create a new block containing the stride check.
@@ -1984,7 +2040,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // checks into a separate block to make the more common case of few elements
    // faster.
    Instruction *MemRuntimeCheck;
-  tie(FirstCheckInst, MemRuntimeCheck) =
+  std::tie(FirstCheckInst, MemRuntimeCheck) =
        addRuntimeCheck(LastBypassBlock->getTerminator());
    if (MemRuntimeCheck) {
      // Create a new block containing the memory check.
@@ -2017,7 +2073,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // start value.
  
    // This variable saves the new starting index for the scalar loop.
-  PHINode *ResumeIndex = 0;
+  PHINode *ResumeIndex = nullptr;
    LoopVectorizationLegality::InductionList::iterator I, E;
    LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
    // Set builder to point to last bypass block.
@@ -2033,9 +2089,9 @@ void InnerLoopVectorizer::createEmptyLoop() {
      // truncated version for the scalar loop.
      PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
        PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
-                      MiddleBlock->getTerminator()) : 0;
+                      MiddleBlock->getTerminator()) : nullptr;
  
-    Value *EndValue = 0;
+    Value *EndValue = nullptr;
      switch (II.IK) {
      case LoopVectorizationLegality::IK_NoInduction:
        llvm_unreachable("Unknown induction");
@@ -2227,32 +2283,12 @@ static Intrinsic::ID
  getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
    // If we have an intrinsic call, check if it is trivially vectorizable.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::sqrt:
-    case Intrinsic::sin:
-    case Intrinsic::cos:
-    case Intrinsic::exp:
-    case Intrinsic::exp2:
-    case Intrinsic::log:
-    case Intrinsic::log10:
-    case Intrinsic::log2:
-    case Intrinsic::fabs:
-    case Intrinsic::copysign:
-    case Intrinsic::floor:
-    case Intrinsic::ceil:
-    case Intrinsic::trunc:
-    case Intrinsic::rint:
-    case Intrinsic::nearbyint:
-    case Intrinsic::round:
-    case Intrinsic::pow:
-    case Intrinsic::fma:
-    case Intrinsic::fmuladd:
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-      return II->getIntrinsicID();
-    default:
+    Intrinsic::ID ID = II->getIntrinsicID();
+    if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
+        ID == Intrinsic::lifetime_end)
+      return ID;
+    else
        return Intrinsic::not_intrinsic;
-    }
    }
  
    if (!TLI)
@@ -2471,6 +2507,16 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) {
    }
  }
  
+/// \brief Sets the 'fast' (unsafe algebra) flag on floating point operations.
+///
+/// Values that are not an FPMathOperator are left untouched.
+/// \return \p V itself, so the call can wrap the expression that creates it.
+static Value *addFastMathFlag(Value *V) {
+  if (!isa<FPMathOperator>(V))
+    return V;
+
+  FastMathFlags FMF;
+  FMF.setUnsafeAlgebra();
+  cast<Instruction>(V)->setFastMathFlags(FMF);
+  return V;
+}
+
  void InnerLoopVectorizer::vectorizeLoop() {
    //===------------------------------------------------===//
    //
@@ -2614,9 +2660,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
      setDebugLocFromInst(Builder, ReducedPartRdx);
      for (unsigned part = 1; part < UF; ++part) {
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                             RdxParts[part], ReducedPartRdx,
-                                             "bin.rdx");
+        // Floating point operations had to be 'fast' to enable the reduction.
+        ReducedPartRdx = addFastMathFlag(
+            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+                                ReducedPartRdx, "bin.rdx"));
        else
          ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                          ReducedPartRdx, RdxParts[part]);
@@ -2629,7 +2676,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
        assert(isPowerOf2_32(VF) &&
               "Reduction emission only supported for pow2 vectors!");
        Value *TmpVec = ReducedPartRdx;
-      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+      SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
        for (unsigned i = VF; i != 1; i >>= 1) {
          // Move the upper half of the vector to the lower half.
          for (unsigned j = 0; j != i/2; ++j)
@@ -2646,8 +2693,9 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                      "rdx.shuf");
  
          if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
-                                       "bin.rdx");
+          // Floating point operations had to be 'fast' to enable the reduction.
+          TmpVec = addFastMathFlag(Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
          else
            TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
        }
@@ -2981,6 +3029,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
          if (VecOp && isa<PossiblyExactOperator>(VecOp))
            VecOp->setIsExact(BinOp->isExact());
  
+        // Copy the fast-math flags.
+        if (VecOp && isa<FPMathOperator>(V))
+          VecOp->setFastMathFlags(it->getFastMathFlags());
+
          Entry[Part] = V;
        }
        break;
@@ -3022,7 +3074,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
        VectorParts &A = getVectorValue(it->getOperand(0));
        VectorParts &B = getVectorValue(it->getOperand(1));
        for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *C = 0;
+        Value *C = nullptr;
          if (FCmp)
            C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
          else
@@ -3282,7 +3334,7 @@ bool LoopVectorizationLegality::canVectorize() {
    return true;
  }
  
-static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
    if (Ty->isPointerTy())
      return DL.getIntPtrType(Ty);
  
@@ -3294,7 +3346,7 @@ static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
    return Ty;
  }
  
-static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
    Ty0 = convertPointerToIntegerType(DL, Ty0);
    Ty1 = convertPointerToIntegerType(DL, Ty1);
    if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
@@ -3310,12 +3362,11 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
    // instructions must not have external users.
    if (!Reductions.count(Inst))
      //Check that all of the users of the loop are inside the BB.
-    for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
-         I != E; ++I) {
-      Instruction *U = cast<Instruction>(*I);
+    for (User *U : Inst->users()) {
+      Instruction *UI = cast<Instruction>(U);
        // This user may be a reduction exit value.
-      if (!TheLoop->contains(U)) {
-        DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
+      if (!TheLoop->contains(UI)) {
+        DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
          return true;
        }
      }
@@ -3492,7 +3543,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
  ///\brief Remove GEPs whose indices but the last one are loop invariant and
  /// return the induction operand of the gep pointer.
  static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
-                                 DataLayout *DL, Loop *Lp) {
+                                 const DataLayout *DL, Loop *Lp) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
    if (!GEP)
      return Ptr;
@@ -3510,15 +3561,14 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
  
  ///\brief Look for a cast use of the passed value.
  static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
-  Value *UniqueCast = 0;
-  for (Value::use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); UI != UE;
-       ++UI) {
-    CastInst *CI = dyn_cast<CastInst>(*UI);
+  Value *UniqueCast = nullptr;
+  for (User *U : Ptr->users()) {
+    CastInst *CI = dyn_cast<CastInst>(U);
      if (CI && CI->getType() == Ty) {
        if (!UniqueCast)
          UniqueCast = CI;
        else
-        return 0;
+        return nullptr;
      }
    }
    return UniqueCast;
@@ -3528,10 +3578,10 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
  /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
  /// pointer to the Value, or null otherwise.
  static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
-                                   DataLayout *DL, Loop *Lp) {
+                                   const DataLayout *DL, Loop *Lp) {
    const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
    if (!PtrTy || PtrTy->isAggregateType())
-    return 0;
+    return nullptr;
  
    // Try to remove a gep instruction to make the pointer (actually index at this
    // point) easier analyzable. If OrigPtr is equal to Ptr we are analyzing the
@@ -3551,11 +3601,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
  
    const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
    if (!S)
-    return 0;
+    return nullptr;
  
    V = S->getStepRecurrence(*SE);
    if (!V)
-    return 0;
+    return nullptr;
  
    // Strip off the size of access multiplication if we are still analyzing the
    // pointer.
@@ -3563,24 +3613,24 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
      DL->getTypeAllocSize(PtrTy->getElementType());
      if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
        if (M->getOperand(0)->getSCEVType() != scConstant)
-        return 0;
+        return nullptr;
  
        const APInt &APStepVal =
            cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
  
        // Huge step value - give up.
        if (APStepVal.getBitWidth() > 64)
-        return 0;
+        return nullptr;
  
        int64_t StepVal = APStepVal.getSExtValue();
        if (PtrAccessSize != StepVal)
-        return 0;
+        return nullptr;
        V = M->getOperand(1);
      }
    }
  
    // Strip off casts.
-  Type *StripedOffRecurrenceCast = 0;
+  Type *StripedOffRecurrenceCast = nullptr;
    if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
      StripedOffRecurrenceCast = C->getType();
      V = C->getOperand();
@@ -3589,11 +3639,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
    // Look for the loop invariant symbolic value.
    const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
    if (!U)
-    return 0;
+    return nullptr;
  
    Value *Stride = U->getValue();
    if (!Lp->isLoopInvariant(Stride))
-    return 0;
+    return nullptr;
  
    // If we have stripped off the recurrence cast we have to make sure that we
    // return the value that is used in this loop so that we can replace it later.
@@ -3604,7 +3654,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
  }
  
  void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
-  Value *Ptr = 0;
+  Value *Ptr = nullptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
      Ptr = LI->getPointerOperand();
    else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
@@ -3631,6 +3681,16 @@ void LoopVectorizationLegality::collectLoopUniforms() {
    // Start with the conditional branch and walk up the block.
    Worklist.push_back(Latch->getTerminator()->getOperand(0));
  
+  // Also add all consecutive pointer values; these values will be uniform
+  // after vectorization (and subsequent cleanup) and, until revectorization is
+  // supported, all dependencies must also be uniform.
+  for (Loop::block_iterator B = TheLoop->block_begin(),
+       BE = TheLoop->block_end(); B != BE; ++B)
+    for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
+         I != IE; ++I)
+      if (I->getType()->isPointerTy() && isConsecutivePtr(I))
+        Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+
    while (Worklist.size()) {
      Instruction *I = dyn_cast<Instruction>(Worklist.back());
      Worklist.pop_back();
@@ -3663,7 +3723,7 @@ public:
    /// \brief Set of potential dependent memory accesses.
    typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
  
-  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+  AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
      DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
      AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
  
@@ -3729,7 +3789,7 @@ private:
    /// Set of underlying objects already written to.
    SmallPtrSet<Value*, 16> WriteObjects;
  
-  DataLayout *DL;
+  const DataLayout *DL;
  
    /// Sets of potentially dependent accesses - members of one set share an
    /// underlying pointer. The set "CheckDeps" identifies which sets really need a
@@ -3756,7 +3816,7 @@ static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
  
  /// \brief Check the stride of the pointer and ensure that it does not wrap in
  /// the address space.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                          const Loop *Lp, ValueToValueMap &StridesMap);
  
  bool AccessAnalysis::canCheckPtrAtRT(
@@ -3976,7 +4036,7 @@ public:
    typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
    typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
  
-  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
        : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
          ShouldRetryWithRuntimeCheck(false) {}
  
@@ -4014,7 +4074,7 @@ public:
  
  private:
    ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
    const Loop *InnermostLoop;
  
    /// \brief Maps access locations (ptr, read/write) to program order.
@@ -4063,7 +4123,7 @@ static bool isInBoundsGep(Value *Ptr) {
  }
  
  /// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                          const Loop *Lp, ValueToValueMap &StridesMap) {
    const Type *Ty = Ptr->getType();
    assert(Ty->isPointerTy() && "Unexpected non-ptr");
@@ -4322,7 +4382,7 @@ bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
      // Check every access pair.
      while (AI != AE) {
        CheckDeps.erase(*AI);
-      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
        while (OI != AE) {
          // Check every accessing instruction pair in program order.
          for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
@@ -4593,7 +4653,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
    // We only allow for a single reduction value to be used outside the loop.
    // This includes users of the reduction, variables (which form a cycle
    // which ends in the phi node).
-  Instruction *ExitInstruction = 0;
+  Instruction *ExitInstruction = nullptr;
    // Indicates that we found a reduction operation in our scan.
    bool FoundReduxOp = false;
  
@@ -4607,7 +4667,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
    // the number of instructions we saw from the recognized min/max pattern,
    // to make sure we only see exactly the two instructions.
    unsigned NumCmpSelectPatternInst = 0;
-  ReductionInstDesc ReduxDesc(false, 0);
+  ReductionInstDesc ReduxDesc(false, nullptr);
  
    SmallPtrSet<Instruction *, 8> VisitedInsts;
    SmallVector<Instruction *, 8> Worklist;
@@ -4680,18 +4740,17 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
      // nodes once we get to them.
      SmallVector<Instruction *, 8> NonPHIs;
      SmallVector<Instruction *, 8> PHIs;
-    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
-         ++UI) {
-      Instruction *Usr = cast<Instruction>(*UI);
+    for (User *U : Cur->users()) {
+      Instruction *UI = cast<Instruction>(U);
  
        // Check if we found the exit user.
-      BasicBlock *Parent = Usr->getParent();
+      BasicBlock *Parent = UI->getParent();
        if (!TheLoop->contains(Parent)) {
          // Exit if you find multiple outside users or if the header phi node is
          // being used. In this case the user uses the value of the previous
          // iteration, in which case we would lose "VF-1" iterations of the
          // reduction operation if we vectorize.
-        if (ExitInstruction != 0 || Cur == Phi)
+        if (ExitInstruction != nullptr || Cur == Phi)
            return false;
  
          // The instruction used by an outside user must be the last instruction
@@ -4707,21 +4766,21 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
        // Process instructions only once (termination). Each reduction cycle
        // value must only be used once, except by phi nodes and min/max
        // reductions which are represented as a cmp followed by a select.
-      ReductionInstDesc IgnoredVal(false, 0);
-      if (VisitedInsts.insert(Usr)) {
-        if (isa<PHINode>(Usr))
-          PHIs.push_back(Usr);
+      ReductionInstDesc IgnoredVal(false, nullptr);
+      if (VisitedInsts.insert(UI)) {
+        if (isa<PHINode>(UI))
+          PHIs.push_back(UI);
          else
-          NonPHIs.push_back(Usr);
-      } else if (!isa<PHINode>(Usr) &&
-                 ((!isa<FCmpInst>(Usr) &&
-                   !isa<ICmpInst>(Usr) &&
-                   !isa<SelectInst>(Usr)) ||
-                  !isMinMaxSelectCmpPattern(Usr, IgnoredVal).IsReduction))
+          NonPHIs.push_back(UI);
+      } else if (!isa<PHINode>(UI) &&
+                 ((!isa<FCmpInst>(UI) &&
+                   !isa<ICmpInst>(UI) &&
+                   !isa<SelectInst>(UI)) ||
+                  !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
          return false;
  
        // Remember that we completed the cycle.
-      if (Usr == Phi)
+      if (UI == Phi)
          FoundStartPHI = true;
      }
      Worklist.append(PHIs.begin(), PHIs.end());
@@ -4761,13 +4820,13 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
  
    assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
           "Expect a select instruction");
-  Instruction *Cmp = 0;
-  SelectInst *Select = 0;
+  Instruction *Cmp = nullptr;
+  SelectInst *Select = nullptr;
  
    // We must handle the select(cmp()) as a single instruction. Advance to the
    // select.
    if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
-    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
        return ReductionInstDesc(false, I);
      return ReductionInstDesc(Select, Prev.MinMaxKind);
    }
@@ -5033,7 +5092,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
      }
    }
  
-  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
    Factor.Width = Width;
    Factor.Cost = Width * Cost;
    return Factor;
@@ -5145,6 +5204,11 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                                R.MaxLocalUsers);
  
+  // Don't count the induction variable as unrolled.
+  if (EnableIndVarRegisterHeur)
+    UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+                       std::max(1U, (R.MaxLocalUsers - 1)));
+
    // Clamp the unroll factor ranges to reasonable factors.
    unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
  
@@ -5176,26 +5240,33 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
      return UF;
    }
  
-  if (EnableLoadStoreRuntimeUnroll &&
-      !Legal->getRuntimePointerCheck()->Need &&
+  // Note that if we've already vectorized the loop we will have done the
+  // runtime check and so unrolling won't require further checks.
+  bool UnrollingRequiresRuntimePointerCheck =
+      (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+
+  // We want to unroll small loops in order to reduce the loop overhead and
+  // potentially expose ILP opportunities.
+  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  if (!UnrollingRequiresRuntimePointerCheck &&
        LoopCost < SmallLoopCost) {
+    // We assume that the cost overhead is 1 and we use the cost model
+    // to estimate the cost of the loop and unroll until the cost of the
+    // loop overhead is about 5% of the cost of the loop.
+    unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
      // Unroll until store/load ports (estimated by max unroll factor) are
      // saturated.
-    unsigned UnrollStores = UF / (Legal->NumStores ? Legal->NumStores : 1);
-    unsigned UnrollLoads = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
-    UF = std::max(std::min(UnrollStores, UnrollLoads), 1u);
-    return UF;
-  }
+    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
+    unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
+
+    if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
+      DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
+      return std::max(StoresUF, LoadsUF);
+    }
  
-  // We want to unroll tiny loops in order to reduce the loop overhead.
-  // We assume that the cost overhead is 1 and we use the cost model
-  // to estimate the cost of the loop and unroll until the cost of the
-  // loop overhead is about 5% of the cost of the loop.
-  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
-  if (LoopCost < SmallLoopCost) {
      DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = PowerOf2Floor(SmallLoopCost / LoopCost);
-    return std::min(NewUF, UF);
+    return SmallUF;
    }
  
    DEBUG(dbgs() << "LV: Not Unrolling.\n");
@@ -5463,9 +5534,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
        TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
+    Value *Op2 = I->getOperand(1);
  
-    if (isa<ConstantInt>(I->getOperand(1)))
+    // Check for a splat of a constant or for a non-uniform vector of constants.
+    if (isa<ConstantInt>(Op2))
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      if (cast<Constant>(Op2)->getSplatValue() != nullptr)
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    }
  
      return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
    }
@@ -5677,17 +5755,17 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
    // Does this instruction return a value ?
    bool IsVoidRetTy = Instr->getType()->isVoidTy();
  
-  Value *UndefVec = IsVoidRetTy ? 0 :
+  Value *UndefVec = IsVoidRetTy ? nullptr :
    UndefValue::get(Instr->getType());
    // Create a new entry in the WidenMap and initialize it to Undef or Null.
    VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
  
    Instruction *InsertPt = Builder.GetInsertPoint();
    BasicBlock *IfBlock = Builder.GetInsertBlock();
-  BasicBlock *CondBlock = 0;
+  BasicBlock *CondBlock = nullptr;
  
    VectorParts Cond;
-  Loop *VectorLp = 0;
+  Loop *VectorLp = nullptr;
    if (IfPredicateStore) {
      assert(Instr->getParent()->getSinglePredecessor() &&
             "Only support single predecessor blocks");
@@ -5702,7 +5780,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
      // For each scalar that we create:
  
      // Start an "if (pred) a[i] = ..." block.
-    Value *Cmp = 0;
+    Value *Cmp = nullptr;
      if (IfPredicateStore) {
        if (Cond[Part]->getType()->isVectorTy())
          Cond[Part] =
@@ -5710,6 +5788,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
                                 ConstantInt::get(Cond[Part]->getType(), 1));
        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+      LoopVectorBody.push_back(CondBlock);
        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
        // Update Builder with newly created basic block.
        Builder.SetInsertPoint(InsertPt);
@@ -5735,6 +5814,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
      // End if-block.
        if (IfPredicateStore) {
          BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+        LoopVectorBody.push_back(NewIfBlock);
          VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
          Builder.SetInsertPoint(InsertPt);
          Instruction *OldBr = IfBlock->getTerminator();