[safestack] Protect byval function arguments.

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 899b639fd126d28d732cb7d0ea76a23d88f13df0..c956a55a1009a497016324d62eebceef209047ed 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -126,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                                        "trip count that is smaller than this "
                                        "value."));
  
+static cl::opt<bool> MaximizeBandwidth(
+    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+    cl::desc("Maximize bandwidth when selecting vectorization factor which "
+             "will be determined by the smallest type in loop."));
+
  /// This enables versioning on the strides of symbolically striding memory
  /// accesses in code like the following.
  ///   for (i = 0; i < N; ++i)
@@ -139,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
  ///      ...
  static cl::opt<bool> EnableMemAccessVersioning(
      "enable-mem-access-versioning", cl::init(true), cl::Hidden,
-    cl::desc("Enable symblic stride memory access versioning"));
+    cl::desc("Enable symbolic stride memory access versioning"));
  
  static cl::opt<bool> EnableInterleavedMemAccesses(
      "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
@@ -222,6 +227,15 @@ static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
      cl::desc("The maximum allowed number of runtime memory checks with a "
               "vectorize(enable) pragma."));
  
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed with a "
+             "vectorize(enable) pragma"));
+
  namespace {
  
  // Forward declarations.
@@ -254,16 +268,29 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
    return VectorType::get(Scalar, VF);
  }
  
-/// A helper function that returns GEP instruction and knows to skip
-/// 'bitcast'.
+/// A helper function that returns GEP instruction and knows to skip a
+/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
+/// pointee types of the 'bitcast' have the same size.
+/// For example:
+///   bitcast double** %var to i64* - can be skipped
+///   bitcast double** %var to i8*  - can not
  static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
  
    if (isa<GetElementPtrInst>(Ptr))
      return cast<GetElementPtrInst>(Ptr);
  
    if (isa<BitCastInst>(Ptr) &&
-      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)))
-    return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
+    Type *BitcastTy = Ptr->getType();
+    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
+    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
+      return nullptr;
+    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
+    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
+    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
+    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
+      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+  }
    return nullptr;
  }
  
@@ -286,19 +313,19 @@ public:
    InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                        DominatorTree *DT, const TargetLibraryInfo *TLI,
                        const TargetTransformInfo *TTI, unsigned VecWidth,
-                      unsigned UnrollFactor)
+                      unsigned UnrollFactor, SCEVUnionPredicate &Preds)
        : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
          VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
          Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
          TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
-        AddedSafetyChecks(false) {}
+        AddedSafetyChecks(false), Preds(Preds) {}
  
    // Perform the actual loop widening (vectorization).
    // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
    // can be validly truncated to. The cost model has assumed this truncation
    // will happen when vectorizing.
    void vectorize(LoopVectorizationLegality *L,
-                 DenseMap<Instruction*,uint64_t> MinimumBitWidths) {
+                 MapVector<Instruction*,uint64_t> MinimumBitWidths) {
      MinBWs = MinimumBitWidths;
      Legal = L;
      // Create a new empty loop. Unlink the old loop and connect the new one.
@@ -328,12 +355,6 @@ protected:
    typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                     VectorParts> EdgeMaskCache;
  
-  /// \brief Add checks for strides that were assumed to be 1.
-  ///
-  /// Returns the last check instruction and the first check instruction in the
-  /// pair as (first, last).
-  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
-
    /// Create an empty loop, based on the loop ranges of the old loop.
    void createEmptyLoop();
    /// Create a new induction variable inside L.
@@ -417,11 +438,12 @@ protected:
    void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
    /// Emit a bypass check to see if the vector trip count is nonzero.
    void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
-  /// Emit bypass checks to check if strides we've assumed to be one really are.
-  void emitStrideChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if all of the SCEV assumptions we've
+  /// had to make are correct.
+  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
    /// Emit bypass checks to check any memory assumptions we may have made.
    void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-  
+
    /// This is a helper class that holds the vectorizer state. It maps scalar
    /// instructions to vector instructions. When the code is 'unrolled' then
    /// then a single scalar value is mapped to multiple vector parts. The parts
@@ -524,19 +546,28 @@ protected:
    /// Map of scalar integer values to the smallest bitwidth they can be legally
    /// represented as. The vector equivalents of these values should be truncated
    /// to this type.
-  DenseMap<Instruction*,uint64_t> MinBWs;
+  MapVector<Instruction*,uint64_t> MinBWs;
    LoopVectorizationLegality *Legal;
  
    // Record whether runtime check is added.
    bool AddedSafetyChecks;
+
+  /// The SCEV predicate containing all the SCEV-related assumptions.
+  /// The predicate is used to simplify existing expressions in the
+  /// context of existing SCEV assumptions. Since legality checking is
+  /// not done here, we don't need to use this predicate to record
+  /// further assumptions.
+  SCEVUnionPredicate &Preds;
  };
  
  class InnerLoopUnroller : public InnerLoopVectorizer {
  public:
    InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                      DominatorTree *DT, const TargetLibraryInfo *TLI,
-                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+                    const TargetTransformInfo *TTI, unsigned UnrollFactor,
+                    SCEVUnionPredicate &Preds)
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor,
+                            Preds) {}
  
  private:
    void scalarizeInstruction(Instruction *Instr,
@@ -617,7 +648,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
  }
  
  /// \brief Propagate known metadata from one instruction to a vector of others.
-static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
+static void propagateMetadata(SmallVectorImpl<Value *> &To,
+                              const Instruction *From) {
    for (Value *V : To)
      if (Instruction *I = dyn_cast<Instruction>(V))
        propagateMetadata(I, From);
@@ -757,8 +789,9 @@ private:
  /// between the member and the group in a map.
  class InterleavedAccessInfo {
  public:
-  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)
-      : SE(SE), TheLoop(L), DT(DT) {}
+  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT,
+                        SCEVUnionPredicate &Preds)
+      : SE(SE), TheLoop(L), DT(DT), Preds(Preds) {}
  
    ~InterleavedAccessInfo() {
      SmallSet<InterleaveGroup *, 4> DelSet;
@@ -792,6 +825,13 @@ private:
    Loop *TheLoop;
    DominatorTree *DT;
  
+  /// The SCEV predicate containing all the SCEV-related assumptions.
+  /// The predicate is used to simplify SCEV expressions in the
+  /// context of existing SCEV assumptions. The interleaved access
+  /// analysis can also add new predicates (for example by versioning
+  /// strides of pointers).
+  SCEVUnionPredicate &Preds;
+
    /// Holds the relationships between the members and the interleave group.
    DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
  
@@ -1154,11 +1194,13 @@ public:
                              Function *F, const TargetTransformInfo *TTI,
                              LoopAccessAnalysis *LAA,
                              LoopVectorizationRequirements *R,
-                            const LoopVectorizeHints *H)
+                            const LoopVectorizeHints *H,
+                            SCEVUnionPredicate &Preds)
        : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
-        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+        InterleaveInfo(SE, L, DT, Preds), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H),
+        Preds(Preds) {}
  
    /// ReductionList contains the reduction descriptors for all
    /// of the reductions that were found in the loop.
@@ -1188,6 +1230,9 @@ public:
    /// Returns True if V is an induction variable in this loop.
    bool isInductionVariable(const Value *V);
  
+  /// Returns True if PN is a reduction variable in this loop.
+  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
+
    /// Return true if the block BB needs to be predicated in order for the loop
    /// to be vectorized.
    bool blockNeedsPredication(BasicBlock *BB);
@@ -1357,7 +1402,14 @@ private:
  
    /// While vectorizing these instructions we have to generate a
    /// call to the appropriate masked intrinsic
-  SmallPtrSet<const Instruction*, 8> MaskedOp;
+  SmallPtrSet<const Instruction *, 8> MaskedOp;
+
+  /// The SCEV predicate containing all the SCEV-related assumptions.
+  /// The predicate is used to simplify SCEV expressions in the
+  /// context of existing SCEV assumptions. The analysis will also
+  /// add a minimal set of new predicates if this is required to
+  /// enable vectorization/unrolling.
+  SCEVUnionPredicate &Preds;
  };
  
  /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1373,9 +1425,10 @@ public:
                               LoopVectorizationLegality *Legal,
                               const TargetTransformInfo &TTI,
                               const TargetLibraryInfo *TLI, DemandedBits *DB,
-                             AssumptionCache *AC,
-                             const Function *F, const LoopVectorizeHints *Hints,
-                             SmallPtrSetImpl<const Value *> &ValuesToIgnore)
+                             AssumptionCache *AC, const Function *F,
+                             const LoopVectorizeHints *Hints,
+                             SmallPtrSetImpl<const Value *> &ValuesToIgnore,
+                             SCEVUnionPredicate &Preds)
        : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
          TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
  
@@ -1390,10 +1443,10 @@ public:
    /// possible.
    VectorizationFactor selectVectorizationFactor(bool OptForSize);
  
-  /// \return The size (in bits) of the widest type in the code that
-  /// needs to be vectorized. We ignore values that remain scalar such as
+  /// \return The size (in bits) of the smallest and widest types in the code
+  /// that needs to be vectorized. We ignore values that remain scalar such as
    /// 64 bit loop indices.
-  unsigned getWidestType();
+  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
  
    /// \return The desired interleave count.
    /// If interleave count has been specified by metadata it will be returned.
@@ -1420,8 +1473,10 @@ public:
      unsigned NumInstructions;
    };
  
-  /// \return  information about the register usage of the loop.
-  RegisterUsage calculateRegisterUsage();
+  /// \return Returns information about the register usages of the loop for the
+  /// given vectorization factors.
+  SmallVector<RegisterUsage, 8>
+  calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
  
  private:
    /// Returns the expected execution cost. The unit of the cost does
@@ -1450,7 +1505,7 @@ public:
    /// Map of scalar integer values to the smallest bitwidth they can be legally
    /// represented as. The vector equivalents of these values should be truncated
    /// to this type.
-  DenseMap<Instruction*,uint64_t> MinBWs;
+  MapVector<Instruction*,uint64_t> MinBWs;
  
    /// The loop that we evaluate.
    Loop *TheLoop;
@@ -1703,10 +1758,12 @@ struct LoopVectorize : public FunctionPass {
        }
      }
  
+    SCEVUnionPredicate Preds;
+
      // Check if it is legal to vectorize the loop.
      LoopVectorizationRequirements Requirements;
      LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA,
-                                  &Requirements, &Hints);
+                                  &Requirements, &Hints, Preds);
      if (!LVL.canVectorize()) {
        DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
        emitMissedWarning(F, L, Hints);
@@ -1725,7 +1782,7 @@ struct LoopVectorize : public FunctionPass {
  
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints,
-                                  ValuesToIgnore);
+                                  ValuesToIgnore, Preds);
  
      // Check the function attributes to find out if this function should be
      // optimized for size.
@@ -1836,7 +1893,7 @@ struct LoopVectorize : public FunctionPass {
        assert(IC > 1 && "interleave count should not be 1 or 0");
        // If we decided that it is not legal to vectorize the loop then
        // interleave it.
-      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
+      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC, Preds);
        Unroller.vectorize(&LVL, CM.MinBWs);
  
        emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
@@ -1844,7 +1901,7 @@ struct LoopVectorize : public FunctionPass {
                                   Twine(IC) + ")");
      } else {
        // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC, Preds);
        LB.vectorize(&LVL, CM.MinBWs);
        ++LoopsVectorized;
  
@@ -2005,7 +2062,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
      //  %idxprom = zext i32 %mul to i64  << Safe cast.
      //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
      //
-    Last = replaceSymbolicStrideSCEV(SE, Strides,
+    Last = replaceSymbolicStrideSCEV(SE, Strides, Preds,
                                       Gep->getOperand(InductionOperand), Gep);
      if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
        Last =
@@ -2047,7 +2104,6 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
    // If this scalar is unknown, assume that it is a constant or that it is
    // loop invariant. Broadcast V and save the value for future uses.
    Value *B = getBroadcastInstrs(V);
-
    return WidenMap.splat(V, B);
  }
  
@@ -2471,7 +2527,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
    }
  }
  
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                               bool IfPredicateStore) {
    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
    // Holds vector parameters or scalars, in case of uniform vals.
    SmallVector<VectorParts, 4> Params;
@@ -2533,7 +2590,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
        Value *Cmp = nullptr;
        if (IfPredicateStore) {
          Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
-        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
+                                 ConstantInt::get(Cmp->getType(), 1));
        }
  
        Instruction *Cloned = Instr->clone();
@@ -2564,56 +2622,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
    }
  }
  
-static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
-                                 Instruction *Loc) {
-  if (FirstInst)
-    return FirstInst;
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    return I->getParent() == Loc->getParent() ? I : nullptr;
-  return nullptr;
-}
-
-std::pair<Instruction *, Instruction *>
-InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
-  Instruction *tnullptr = nullptr;
-  if (!Legal->mustCheckStrides())
-    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
-
-  IRBuilder<> ChkBuilder(Loc);
-
-  // Emit checks.
-  Value *Check = nullptr;
-  Instruction *FirstInst = nullptr;
-  for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
-                                         SE = Legal->strides_end();
-       SI != SE; ++SI) {
-    Value *Ptr = stripIntegerCast(*SI);
-    Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
-                                       "stride.chk");
-    // Store the first instruction we create.
-    FirstInst = getFirstInst(FirstInst, C, Loc);
-    if (Check)
-      Check = ChkBuilder.CreateOr(Check, C);
-    else
-      Check = C;
-  }
-
-  // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in a
-  // the block.
-  LLVMContext &Ctx = Loc->getContext();
-  Instruction *TheCheck =
-      BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
-  ChkBuilder.Insert(TheCheck, "stride.not.one");
-  FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
-
-  return std::make_pair(FirstInst, TheCheck);
-}
-
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L,
-                                                      Value *Start,
-                                                      Value *End,
-                                                      Value *Step,
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+                                                      Value *End, Value *Step,
                                                        Instruction *DL) {
    BasicBlock *Header = L->getHeader();
    BasicBlock *Latch = L->getLoopLatch();
@@ -2649,7 +2659,8 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
    IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
    // Find the loop boundaries.
    const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
-  assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count");
+  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
+         "Invalid loop count");
  
    Type *IdxTy = Legal->getWidestInductionType();
    
@@ -2748,26 +2759,26 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
    LoopBypassBlocks.push_back(BB);
  }
  
-void InnerLoopVectorizer::emitStrideChecks(Loop *L,
-                                           BasicBlock *Bypass) {
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
    BasicBlock *BB = L->getLoopPreheader();
-  
-  // Generate the code to check that the strides we assumed to be one are really
-  // one. We want the new basic block to start at the first instruction in a
+
+  // Generate the code to check that the SCEV assumptions that we made.
+  // We want the new basic block to start at the first instruction in a
    // sequence of instructions that form a check.
-  Instruction *StrideCheck;
-  Instruction *FirstCheckInst;
-  std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(BB->getTerminator());
-  if (!StrideCheck)
-    return;
+  SCEVExpander Exp(*SE, Bypass->getModule()->getDataLayout(), "scev.check");
+  Value *SCEVCheck = Exp.expandCodeForPredicate(&Preds, BB->getTerminator());
+
+  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+    if (C->isZero())
+      return;
  
    // Create a new block containing the stride check.
-  BB->setName("vector.stridecheck");
+  BB->setName("vector.scevcheck");
    auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
    if (L->getParentLoop())
      L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
    ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, StrideCheck));
+                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
    LoopBypassBlocks.push_back(BB);
    AddedSafetyChecks = true;
  }
@@ -2887,10 +2898,10 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // Now, compare the new count to zero. If it is zero skip the vector loop and
    // jump to the scalar loop.
    emitVectorLoopEnteredCheck(Lp, ScalarPH);
-  // Generate the code to check that the strides we assumed to be one are really
-  // one. We want the new basic block to start at the first instruction in a
-  // sequence of instructions that form a check.
-  emitStrideChecks(Lp, ScalarPH);
+  // Generate the code to check any assumptions that we've made for SCEV
+  // expressions.
+  emitSCEVChecks(Lp, ScalarPH);
+
    // Generate the code that checks in runtime if arrays overlap. We put the
    // checks into a separate block to make the more common case of few elements
    // faster.
@@ -3302,7 +3313,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
      assert(RdxPhi && "Unable to recover vectorized PHI");
  
      // Find the reduction variable descriptor.
-    assert(Legal->getReductionVars()->count(RdxPhi) &&
+    assert(Legal->isReductionVariable(RdxPhi) &&
             "Unable to find the reduction variable");
      RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
  
@@ -3597,12 +3608,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
    return BlockMask;
  }
  
-void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
-                                              InnerLoopVectorizer::VectorParts &Entry,
-                                              unsigned UF, unsigned VF, PhiVector *PV) {
+void InnerLoopVectorizer::widenPHIInstruction(
+    Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
+    unsigned VF, PhiVector *PV) {
    PHINode* P = cast<PHINode>(PN);
    // Handle reduction variables:
-  if (Legal->getReductionVars()->count(P)) {
+  if (Legal->isReductionVariable(P)) {
      for (unsigned part = 0; part < UF; ++part) {
        // This is phase one of vectorizing PHIs.
        Type *VecTy = (VF == 1) ? PN->getType() :
@@ -3664,7 +3675,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
      case InductionDescriptor::IK_NoInduction:
        llvm_unreachable("Unknown induction");
      case InductionDescriptor::IK_IntInduction: {
-      assert(P->getType() == II.getStartValue()->getType() && "Types must match");
+      assert(P->getType() == II.getStartValue()->getType() &&
+             "Types must match");
        // Handle other induction variables that are now based on the
        // canonical one.
        Value *V = Induction;
@@ -3849,9 +3861,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
          Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                                 CI->getType());
          Value *Broadcasted = getBroadcastInstrs(ScalarCast);
-        InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction);
-        Constant *Step =
-            ConstantInt::getSigned(CI->getType(), II.getStepValue()->getSExtValue());
+        InductionDescriptor II =
+            Legal->getInductionVars()->lookup(OldInduction);
+        Constant *Step = ConstantInt::getSigned(
+            CI->getType(), II.getStepValue()->getSExtValue());
          for (unsigned Part = 0; Part < UF; ++Part)
            Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
          propagateMetadata(Entry, &*it);
@@ -4143,7 +4156,19 @@ bool LoopVectorizationLegality::canVectorize() {
  
    // Analyze interleaved memory accesses.
    if (UseInterleaved)
-     InterleaveInfo.analyzeInterleaving(Strides);
+    InterleaveInfo.analyzeInterleaving(Strides);
+
+  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+  if (Preds.getComplexity() > SCEVThreshold) {
+    emitAnalysis(VectorizationReport()
+                 << "Too many SCEV assumptions need to be made and checked "
+                 << "at runtime");
+    DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+    return false;
+  }
  
    // Okay! We can vectorize. At this point we don't have any other mem analysis
    // which may limit our maximum vectorization factor, so just return true with
@@ -4449,6 +4474,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    }
  
    Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+  Preds.add(&LAI->Preds);
  
    return true;
  }
@@ -4504,8 +4530,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
        
        if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
            !isSinglePredecessor) {
-        // Build a masked store if it is legal for the target, otherwise scalarize
-        // the block.
+        // Build a masked store if it is legal for the target, otherwise
+        // scalarize the block.
          bool isLegalMaskedOp =
            isLegalMaskedStore(SI->getValueOperand()->getType(),
                               SI->getPointerOperand());
@@ -4563,7 +4589,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(
      StoreInst *SI = dyn_cast<StoreInst>(I);
  
      Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
-    int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides);
+    int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides, Preds);
  
      // The factor of the corresponding interleave group.
      unsigned Factor = std::abs(Stride);
@@ -4572,7 +4598,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(
      if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
        continue;
  
-    const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+    const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr);
      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
      unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
  
@@ -4720,7 +4746,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  
    MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
-  unsigned WidestType = getWidestType();
+  unsigned SmallestType, WidestType;
+  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
    unsigned WidestRegister = TTI.getRegisterBitWidth(true);
    unsigned MaxSafeDepDist = -1U;
    if (Legal->getMaxSafeDepDistBytes() != -1U)
@@ -4728,7 +4755,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                      WidestRegister : MaxSafeDepDist);
    unsigned MaxVectorSize = WidestRegister / WidestType;
-  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+
+  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
+               << WidestType << " bits.\n");
    DEBUG(dbgs() << "LV: The Widest register is: "
            << WidestRegister << " bits.\n");
  
@@ -4741,6 +4770,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
           " into one vector!");
  
    unsigned VF = MaxVectorSize;
+  if (MaximizeBandwidth && !OptForSize) {
+    // Collect all viable vectorization factors.
+    SmallVector<unsigned, 8> VFs;
+    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+      VFs.push_back(VS);
+
+    // For each VF calculate its register usage.
+    auto RUs = calculateRegisterUsage(VFs);
+
+    // Select the largest VF which doesn't require more registers than existing
+    // ones.
+    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+    for (int i = RUs.size() - 1; i >= 0; --i) {
+      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+        VF = VFs[i];
+        break;
+      }
+    }
+  }
  
    // If we optimize the program for size, avoid creating the tail loop.
    if (OptForSize) {
@@ -4816,7 +4865,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    return Factor;
  }
  
-unsigned LoopVectorizationCostModel::getWidestType() {
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+  unsigned MinWidth = -1U;
    unsigned MaxWidth = 8;
    const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  
@@ -4840,7 +4891,7 @@ unsigned LoopVectorizationCostModel::getWidestType() {
        // Examine PHI nodes that are reduction variables. Update the type to
        // account for the recurrence type.
        if (PHINode *PN = dyn_cast<PHINode>(it)) {
-        if (!Legal->getReductionVars()->count(PN))
+        if (!Legal->isReductionVariable(PN))
            continue;
          RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
          T = RdxDesc.getRecurrenceType();
@@ -4856,12 +4907,14 @@ unsigned LoopVectorizationCostModel::getWidestType() {
        if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
          continue;
  
+      MinWidth = std::min(MinWidth,
+                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
        MaxWidth = std::max(MaxWidth,
                            (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      }
    }
  
-  return MaxWidth;
+  return {MinWidth, MaxWidth};
  }
  
  unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
@@ -4907,7 +4960,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
  
-  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+  RegisterUsage R = calculateRegisterUsage({VF})[0];
    // We divide by these constants so assume that we have at least one
    // instruction that uses at least one register.
    R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
@@ -5005,8 +5058,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
    }
  
    // Interleave if this is a large loop (small loops are already dealt with by
-  // this
-  // point) that could benefit from interleaving.
+  // this point) that could benefit from interleaving.
    bool HasReductions = (Legal->getReductionVars()->size() > 0);
    if (TTI.enableAggressiveInterleaving(HasReductions)) {
      DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
@@ -5017,8 +5069,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
    return 1;
  }
  
-LoopVectorizationCostModel::RegisterUsage
-LoopVectorizationCostModel::calculateRegisterUsage() {
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(
+    const SmallVector<unsigned, 8> &VFs) {
    // This function calculates the register usage by measuring the highest number
    // of values that are alive at a single location. Obviously, this is a very
    // rough estimation. We scan the loop in a topological order in order and
@@ -5039,8 +5092,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
    LoopBlocksDFS DFS(TheLoop);
    DFS.perform(LI);
  
-  RegisterUsage R;
-  R.NumInstructions = 0;
+  RegisterUsage RU;
+  RU.NumInstructions = 0;
  
    // Each 'key' in the map opens a new interval. The values
    // of the map are the index of the 'last seen' usage of the
@@ -5059,7 +5112,7 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
    unsigned Index = 0;
    for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
         be = DFS.endRPO(); bb != be; ++bb) {
-    R.NumInstructions += (*bb)->size();
+    RU.NumInstructions += (*bb)->size();
      for (Instruction &I : **bb) {
        IdxToInstr[Index++] = &I;
  
@@ -5094,10 +5147,26 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
      TransposeEnds[it->second].push_back(it->first);
  
    SmallSet<Instruction*, 8> OpenIntervals;
-  unsigned MaxUsage = 0;
  
+  // Get the size of the widest register.
+  unsigned MaxSafeDepDist = -1U;
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+  unsigned WidestRegister =
+      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+  SmallVector<RegisterUsage, 8> RUs(VFs.size());
+  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
  
    DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+  // A lambda that gets the register usage for the given type and VF.
+  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+  };
+
    for (unsigned int i = 0; i < Index; ++i) {
      Instruction *I = IdxToInstr[i];
      // Ignore instructions that are never used within the loop.
@@ -5109,27 +5178,50 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
  
      // Remove all of the instructions that end at this location.
      InstrList &List = TransposeEnds[i];
-    for (unsigned int j=0, e = List.size(); j < e; ++j)
+    for (unsigned int j = 0, e = List.size(); j < e; ++j)
        OpenIntervals.erase(List[j]);
  
-    // Count the number of live interals.
-    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+    // For each VF find the maximum usage of registers.
+    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+      if (VFs[j] == 1) {
+        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
+        continue;
+      }
+
+      // Count the number of live interals.
+      unsigned RegUsage = 0;
+      for (auto Inst : OpenIntervals)
+        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+    }
  
-    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
-          OpenIntervals.size() << '\n');
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+                 << OpenIntervals.size() << '\n');
  
      // Add the current instruction to the list of open intervals.
      OpenIntervals.insert(I);
    }
  
-  unsigned Invariant = LoopInvariants.size();
-  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
-  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
-  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
+  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+    unsigned Invariant = 0;
+    if (VFs[i] == 1)
+      Invariant = LoopInvariants.size();
+    else {
+      for (auto Inst : LoopInvariants)
+        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+    }
+
+    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] <<  '\n');
+    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+
+    RU.LoopInvariantRegs = Invariant;
+    RU.MaxLocalUsers = MaxUsages[i];
+    RUs[i] = RU;
+  }
  
-  R.LoopInvariantRegs = Invariant;
-  R.MaxLocalUsers = MaxUsage;
-  return R;
+  return RUs;
  }
  
  unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
@@ -5324,8 +5416,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
    case Instruction::ICmp:
    case Instruction::FCmp: {
      Type *ValTy = I->getOperand(0)->getType();
-    if (VF > 1 && MinBWs.count(dyn_cast<Instruction>(I->getOperand(0))))
-      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
+    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+    auto It = MinBWs.find(Op0AsInstruction);
+    if (VF > 1 && It != MinBWs.end())
+      ValTy = IntegerType::get(ValTy->getContext(), It->second);
      VectorTy = ToVectorTy(ValTy, VF);
      return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
    }