Remove the use of LPPassManager. We can remove LPM because we don't need to run any...

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index be197db9563dc68ef9ca963e9f953e72f8fa6aa2..d55b7bd360926ec2dd7c9a9ab6d01587e0b1e87a 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -25,6 +25,7 @@
  // 4. LoopVectorizationCostModel - A unit that checks for the profitability
  //    of vectorization. It decides on the optimal vector width, which
  //    can be one, if vectorization is not profitable.
+//
  //===----------------------------------------------------------------------===//
  //
  // The reduction-variable vectorization is based on the paper:
@@ -36,6 +37,9 @@
  // Other ideas/concepts are from:
  //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  //
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
  //===----------------------------------------------------------------------===//
  #define LV_NAME "loop-vectorize"
  #define DEBUG_TYPE LV_NAME
@@ -55,6 +59,7 @@
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/AliasSetTracker.h"
  #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include "llvm/Analysis/ScalarEvolutionExpander.h"
  #include "llvm/Analysis/LoopInfo.h"
@@ -74,6 +79,16 @@ static cl::opt<unsigned>
  VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
            cl::desc("Set the default vectorization width. Zero is autoselect."));
  
+/// We don't vectorize loops with a known constant trip count below this number.
+const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
  namespace {
  
  // Forward declarations.
@@ -98,28 +113,34 @@ class SingleBlockLoopVectorizer {
  public:
    /// Ctor.
    SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            LPPassManager *Lpm, unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+                            DominatorTree *Dt, DataLayout *Dl,
+                            unsigned VecWidth):
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
    Builder(Se->getContext()), Induction(0), OldInduction(0) { }
  
    // Perform the actual loop widening (vectorization).
    void vectorize(LoopVectorizationLegality *Legal) {
-    ///Create a new empty loop. Unlink the old loop and connect the new one.
+    // Create a new empty loop. Unlink the old loop and connect the new one.
      createEmptyLoop(Legal);
-    /// Widen each instruction in the old loop to a new one in the new loop.
-    /// Use the Legality module to find the induction and reduction variables.
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
      vectorizeLoop(Legal);
-    // register the new loop.
-    cleanup();
+    // Register the new loop and update the analysis passes.
+    updateAnalysis();
   }
  
  private:
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
    /// Create an empty loop, based on the loop ranges of the old loop.
    void createEmptyLoop(LoopVectorizationLegality *Legal);
    /// Copy and widen the instructions from the old loop.
    void vectorizeLoop(LoopVectorizationLegality *Legal);
-  /// Insert the new loop to the loop hierarchy and pass manager.
-  void cleanup();
+  /// Insert the new loop into the loop hierarchy
+  /// and update the analysis passes.
+  void updateAnalysis();
  
    /// This instruction is un-vectorizable. Implement it as a sequence
    /// of scalars.
@@ -155,8 +176,10 @@ private:
    ScalarEvolution *SE;
    // Loop Info.
    LoopInfo *LI;
-  // Loop Pass Manager;
-  LPPassManager *LPM;
+  // Dominator Tree.
+  DominatorTree *DT;
+  // Data Layout.
+  DataLayout *DL;
    // The vectorization factor to use.
    unsigned VF;
  
@@ -165,6 +188,10 @@ private:
  
    // --- Vectorization state ---
  
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
    /// Middle Block between the vector and the scalar.
    BasicBlock *LoopMiddleBlock;
    ///The ExitBlock of the scalar loop.
@@ -203,15 +230,13 @@ public:
    TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
  
    /// This represents the kinds of reductions that we support.
-  /// We use the enum values to hold the 'identity' value for
-  /// each operand. This value does not change the result if applied.
    enum ReductionKind {
-    NoReduction = -1, /// Not a reduction.
-    IntegerAdd  = 0,  /// Sum of numbers.
-    IntegerMult = 1,  /// Product of numbers.
-    IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
-    IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
-    IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
+    NoReduction, /// Not a reduction.
+    IntegerAdd,  /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    IntegerOr,   /// Bitwise or logical OR of numbers.
+    IntegerAnd,  /// Bitwise or logical AND of numbers.
+    IntegerXor   /// Bitwise or logical XOR of numbers.
    };
  
    /// This POD struct holds information about reduction variables.
@@ -233,10 +258,49 @@ public:
      ReductionKind Kind;
    };
  
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck(): Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+      assert(AR && "Invalid addrec expression");
+      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+      Pointers.push_back(Ptr);
+      Starts.push_back(AR->getStart());
+      Ends.push_back(ScEnd);
+    }
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+  };
+
    /// ReductionList contains the reduction descriptors for all
    /// of the reductions that were found in the loop.
    typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
  
+  /// InductionList saves induction variables and maps them to the initial
+  /// value entering the loop.
+  typedef DenseMap<PHINode*, Value*> InductionList;
+
    /// Returns true if it is legal to vectorize this loop.
    /// This does not mean that it is profitable to vectorize this
    /// loop, only that it is legal to do so.
@@ -248,15 +312,23 @@ public:
    /// Returns the reduction variables found in the loop.
    ReductionList *getReductionVars() { return &Reductions; }
  
-  /// Check if the pointer returned by this GEP is consecutive
-  /// when the index is vectorized. This happens when the last
-  /// index of the GEP is consecutive, like the induction variable.
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Check if this pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or when the
+  /// pointer itself is an induction variable.
    /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutiveGep(Value *Ptr);
+  bool isConsecutivePtr(Value *Ptr);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
  
    /// Returns true if this instruction will remain scalar after vectorization.
    bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
  
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
  private:
    /// Check if a single basic block loop is vectorizable.
    /// At this point we know that this is a loop with a constant trip count
@@ -277,6 +349,8 @@ private:
    bool isReductionInstr(Instruction *I, ReductionKind Kind);
    /// Returns True, if 'Phi' is an induction variable.
    bool isInductionVariable(PHINode *Phi);
+  /// Returns true if we can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
  
    /// The loop that we evaluate.
    Loop *TheLoop;
@@ -287,16 +361,25 @@ private:
  
    //  ---  vectorization state --- //
  
-  /// Holds the induction variable.
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
    PHINode *Induction;
    /// Holds the reduction variables.
    ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
    /// Allowed outside users. This holds the reduction
    /// vars which can be accessed from outside the loop.
    SmallPtrSet<Value*, 4> AllowedExit;
    /// This set holds the variables which are known to be uniform after
    /// vectorization.
    SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
  };
  
  /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -317,7 +400,7 @@ public:
    /// Returns the most profitable vectorization factor for the loop that is
    /// smaller or equal to the VF argument. This method checks every power
    /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = 8);
+  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
  
  private:
    /// Returns the expected execution cost. The unit of the cost does
@@ -357,6 +440,7 @@ struct LoopVectorize : public LoopPass {
    DataLayout *DL;
    LoopInfo *LI;
    TargetTransformInfo *TTI;
+  DominatorTree *DT;
  
    virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
      // We only vectorize innermost loops.
@@ -367,6 +451,7 @@ struct LoopVectorize : public LoopPass {
      DL = getAnalysisIfAvailable<DataLayout>();
      LI = &getAnalysis<LoopInfo>();
      TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
  
      DEBUG(dbgs() << "LV: Checking a loop in \"" <<
            L->getHeader()->getParent()->getName() << "\"\n");
@@ -398,10 +483,12 @@ struct LoopVectorize : public LoopPass {
        VF = VectorizationFactor;
      }
  
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+          "\n");
  
      // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF);
      LB.vectorize(&LVL);
  
      DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -414,19 +501,29 @@ struct LoopVectorize : public LoopPass {
      AU.addRequiredID(LCSSAID);
      AU.addRequired<LoopInfo>();
      AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
    }
  
  };
  
  Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Instructions that access the old induction variable
-  // actually want to get the new one.
-  if (V == OldInduction)
-    V = Induction;
    // Create the types.
    LLVMContext &C = V->getContext();
    Type *VTy = VectorType::get(V->getType(), VF);
    Type *I32 = IntegerType::getInt32Ty(C);
+
+  // Save the current insertion location.
+  Instruction *Loc = Builder.GetInsertPoint();
+
+  // We need to place the broadcast of invariant variables outside the loop.
+  bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);
+
+  // Place the code for broadcasting invariant variables in the new preheader.
+  if (Invariant)
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
    Constant *Zero = ConstantInt::get(I32, 0);
    Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
    Value *UndefVal = UndefValue::get(VTy);
@@ -435,10 +532,11 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                               "broadcast");
-  // We are accessing the induction variable. Make sure to promote the
-  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
-  if (V == Induction)
-    return getConsecutiveVector(Shuf);
+
+  // Restore the builder insertion point.
+  if (Invariant)
+    Builder.SetInsertPoint(Loc);
+
    return Shuf;
  }
  
@@ -462,7 +560,14 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
    return Builder.CreateAdd(Val, Cv, "induction");
  }
  
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+  // A pointer that is itself an induction variable is consecutive.
+  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+  if (Phi && getInductionVars()->count(Phi))
+    return true;
+
    GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
    if (!Gep)
      return false;
@@ -475,7 +580,7 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return false;
  
-  // We can emit wide load/stores only of the last index is the induction
+  // We can emit wide load/stores only if the last index is the induction
    // variable.
    const SCEV *Last = SE->getSCEV(LastIndex);
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -490,7 +595,12 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
    return false;
  }
  
+bool LoopVectorizationLegality::isUniform(Value *V) {
+  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
  Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(V != Induction && "The new induction variable should not be used.");
    assert(!V->getType()->isVectorTy() && "Can't widen a vector");
    // If we saved a vectorized copy of V, use it.
    Value *&MapEntry = WidenMap[V];
@@ -505,13 +615,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  
  Constant*
  SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  SmallVector<Constant*, 8> Indices;
-  // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val));
-
-  // Add the consecutive indices to the vector value.
-  return ConstantVector::get(Indices);
+  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
  }
  
  void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
@@ -525,7 +629,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  
      // If we are accessing the old induction variable, use the new one.
      if (SrcOp == OldInduction) {
-      Params.push_back(getBroadcastInstrs(Induction));
+      Params.push_back(getVectorValue(SrcOp));
        continue;
      }
  
@@ -584,7 +688,69 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
      WidenMap[Instr] = VecResults;
  }
  
-void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+Value*
+SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                           Instruction *Loc) {
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+
+  if (!PtrRtCheck->Need)
+    return NULL;
+
+  Value *MemoryRuntimeCheck = 0;
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<Value* , 2> Starts;
+  SmallVector<Value* , 2> Ends;
+
+  SCEVExpander Exp(*SE, "induction");
+
+  // Use this type for pointer arithmetic.
+  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+                                       PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[i], Ends[j], "bound0", Loc);
+      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[j], Ends[i], "bound1", Loc);
+      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                 "found.conflict", Loc);
+      if (MemoryRuntimeCheck)
+        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+                                                    MemoryRuntimeCheck,
+                                                    IsConflict,
+                                                    "conflict.rdx", Loc);
+      else
+        MemoryRuntimeCheck = IsConflict;
+
+    }
+  }
+
+  return MemoryRuntimeCheck;
+}
+
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    /*
     In this function we generate a new loop. The new loop will contain
     the vectorized instructions while the old loop will continue to run the
@@ -614,38 +780,18 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
     ...
     */
  
-  // This is the original scalar-loop preheader.
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
    BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
    BasicBlock *ExitBlock = OrigLoop->getExitBlock();
    assert(ExitBlock && "Must have an exit block");
  
-  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
-  assert(BypassBlock && "Invalid loop structure");
-
-  BasicBlock *VectorPH =
-      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
-  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                 "vector.body");
-
-  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                  "middle.block");
-  BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-  // Find the induction variable.
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
    OldInduction = Legal->getInduction();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
-
-  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
-  // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
-
-  // Generate the induction variable.
-  Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  Constant *Zero = ConstantInt::get(IdxTy, 0);
-  Constant *Step = ConstantInt::get(IdxTy, VF);
+  Type *IdxTy = OldInduction ? OldInduction->getType() :
+    DL->getIntPtrType(SE->getContext());
  
    // Find the loop boundaries.
    const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
@@ -658,35 +804,148 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    // Expand the trip count and place the new instructions in the preheader.
    // Notice that the pre-header does not change, only the loop body.
    SCEVExpander Exp(*SE, "induction");
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
+  // The loop index does not have to start at Zero. Find the original start
+  // value from the induction PHI node. If we don't have an induction variable
+  // then we know that it starts at zero.
+  Value *StartIdx = OldInduction ?
+    OldInduction->getIncomingValueForBlock(BypassBlock):
+    ConstantInt::get(IdxTy, 0);
+
+  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
+  assert(BypassBlock && "Invalid loop structure");
+
+  // Generate the code that checks at runtime if arrays overlap.
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+                                              BypassBlock->getTerminator());
+
+  // Split the single block loop into the two loop structure described above.
+  BasicBlock *VectorPH =
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody =
+    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  BasicBlock *MiddleBlock =
+    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+  BasicBlock *ScalarPH =
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+  // This is the location in which we add all of the logic for bypassing
+  // the new vector loop.
    Instruction *Loc = BypassBlock->getTerminator();
  
+  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+  // inside the loop.
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Generate the induction variable.
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
+  Constant *Step = ConstantInt::get(IdxTy, VF);
+
    // We may need to extend the index in case there is a type mismatch.
    // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != Induction->getType())
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+  if (Count->getType() != IdxTy) {
+    // The exit count can be of pointer type. Convert it to the correct
+    // integer type.
+    if (ExitCount->getType()->isPointerTy())
+      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+    else
+      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+  }
+
+  // Add the start index to the loop count to get the new end index.
+  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
  
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
    // Now we need to generate the expression for N - (N % VF), which is
    // the part that the vectorized body will execute.
    Constant *CIVF = ConstantInt::get(IdxTy, VF);
    Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
    Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+                                                     "end.idx.rnd.down", Loc);
  
-  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop.
    Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               IdxEndRoundDown,
+                               StartIdx,
                                 "cmp.zero", Loc);
+
+  // If we are using memory runtime checks, OR them into the bypass condition.
+  if (MemoryRuntimeCheck)
+    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+                                 "CntOrMem", Loc);
+
    BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
    // Remove the old terminator.
    Loc->eraseFromParent();
  
+  // We are going to resume the execution of the scalar loop.
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original start
+  // value.
+
+  // This variable saves the new starting index for the scalar loop.
+  PHINode *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                           MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    if (OrigPhi->getType()->isIntegerTy()) {
+      // Handle the integer induction counter:
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
+      // For pointer induction variables, calculate the offset using
+      // the end index.
+      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+                                           "ptr.ind.end",
+                                           BypassBlock->getTerminator());
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body counter (PHI node).
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // If we are generating a new induction variable then we also need to
+  // generate the code that calculates the exit value. This value is not
+  // simply the end of the counter because we may skip the vectorized body
+  // in case of a runtime check.
+  if (!OldInduction){
+    assert(!ResumeIndex && "Unexpected resume value found");
+    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+                                  MiddleBlock->getTerminator());
+    ResumeIndex->addIncoming(StartIdx, BypassBlock);
+    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  }
+
+  // Make sure that we found the index where the scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
+
    // Add a check in the middle block to see if we have completed
    // all of the iterations in the first vector loop.
    // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                CountRoundDown, "cmp.n",
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+                                ResumeIndex, "cmp.n",
                                  MiddleBlock->getTerminator());
  
    BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -695,36 +954,37 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
  
    // Create i+1 and fill the PHINode.
    Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
-  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(StartIdx, VectorPH);
    Induction->addIncoming(NextIdx, VecBody);
    // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
    Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
  
    // Now we have two terminators. Remove the old one from the block.
    VecBody->getTerminator()->eraseFromParent();
  
-  // Fix the scalar body iteration count.
-  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
-
    // Get ready to start creating new instructions into the vectorized body.
    Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
  
-  // Register the new loop.
+  // Create and register the new vector loop.
    Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
    Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  // Insert the new loop into the loop nest and register the new basic blocks.
    if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
      ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
      ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
      ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
    }
  
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
    // Save the state.
+  LoopVectorPreHeader = VectorPH;
+  LoopScalarPreHeader = ScalarPH;
    LoopMiddleBlock = MiddleBlock;
    LoopExitBlock = ExitBlock;
    LoopVectorBody = VecBody;
@@ -732,6 +992,27 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    LoopBypassBlock = BypassBlock;
  }
  
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+  switch (K) {
+  case LoopVectorizationLegality::IntegerXor:
+  case LoopVectorizationLegality::IntegerAdd:
+  case LoopVectorizationLegality::IntegerOr:
+    // Adding, Xoring, Oring zero to a number does not change it.
+    return 0;
+  case LoopVectorizationLegality::IntegerMult:
+    // Multiplying a number by 1 does not change it.
+    return 1;
+  case LoopVectorizationLegality::IntegerAnd:
+    // AND-ing a number with an all-1 value does not change it.
+    return -1;
+  default:
+    llvm_unreachable("Unknown reduction kind");
+  }
+}
+
  void
  SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    //===------------------------------------------------===//
@@ -748,13 +1029,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  
    // In order to support reduction variables we need to be able to vectorize
    // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
-  // steages. First, we create a new vector PHI node with no incoming edges.
+  // stages. First, we create a new vector PHI node with no incoming edges.
    // We use this value when we vectorize all of the instructions that use the
    // PHI. Next, after all of the instructions in the block are complete we
    // add the new incoming edges to the PHI. At this point all of the
    // instructions in the basic block are vectorized, so we can use them to
    // construct the PHI.
-  PhiVector PHIsToFix;
+  PhiVector RdxPHIsToFix;
  
    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -767,15 +1048,58 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          continue;
        case Instruction::PHI:{
          PHINode* P = cast<PHINode>(Inst);
-        // Special handling for the induction var.
-        if (OldInduction == Inst)
+        // Handle reduction variables:
+        if (Legal->getReductionVars()->count(P)) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(Inst->getType(), VF);
+          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
+                                  LoopVectorBody->getFirstInsertionPt());
+          RdxPHIsToFix.push_back(P);
+          continue;
+        }
+
+        // This PHINode must be an induction variable.
+        // Make sure that we know about it.
+        assert(Legal->getInductionVars()->count(P) &&
+               "Not an induction variable");
+
+        if (P->getType()->isIntegerTy()) {
+          assert(P == OldInduction && "Unexpected PHI");
+          Value *Broadcasted = getBroadcastInstrs(Induction);
+          // After broadcasting the induction variable we need to make the
+          // vector consecutive by adding 0, 1, 2 ...
+          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+           
+          WidenMap[OldInduction] = ConsecutiveInduction;
            continue;
-        // This is phase one of vectorizing PHIs.
-        // This has to be a reduction variable.
-        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
-        Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
-        PHIsToFix.push_back(P);
+        }
+
+        // Handle pointer inductions.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+        Value *StartIdx = OldInduction ?
+          Legal->getInductionVars()->lookup(OldInduction) :
+          ConstantInt::get(Induction->getType(), 0);
+
+        // This is the pointer value coming into the loop.
+        Value *StartPtr = Legal->getInductionVars()->lookup(P);
+
+        // This is the normalized index, which starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // This is the vector of results. Notice that we don't generate vector
+        // geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
+        }
+
+        WidenMap[Inst] = VecVal;
          continue;
        }
        case Instruction::Add:
@@ -800,8 +1124,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
          Value *A = getVectorValue(Inst->getOperand(0));
          Value *B = getVectorValue(Inst->getOperand(1));
+
          // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = V;
+
+        // Update the NSW, NUW and Exact flags.
+        BinaryOperator *VecOp = cast<BinaryOperator>(V);
+        if (isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
          break;
        }
        case Instruction::Select: {
@@ -845,23 +1180,34 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
          Value *Ptr = SI->getPointerOperand();
          unsigned Alignment = SI->getAlignment();
+
+        assert(!Legal->isUniform(Ptr) &&
+               "We do not allow storing to uniform addresses");
+
          GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
          // This store does not use GEPs.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutivePtr(Ptr)) {
            scalarizeInstruction(Inst);
            break;
          }
  
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
-
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
          Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
          Value *Val = getVectorValue(SI->getValueOperand());
          Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
@@ -875,22 +1221,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          unsigned Alignment = LI->getAlignment();
          GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  
-        // We don't have a gep. Scalarize the load.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        // If the pointer is loop invariant or if it is non consecutive,
+        // scalarize the load.
+        bool Con = Legal->isConsecutivePtr(Ptr);
+        if (Legal->isUniform(Ptr) || !Con) {
            scalarizeInstruction(Inst);
            break;
          }
  
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
  
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
          Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
          LI = Builder.CreateLoad(Ptr);
          LI->setAlignment(Alignment);
@@ -934,7 +1289,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    // Create the 'reduced' values for each of the induction vars.
    // The reduced values are the vector values that we scalarize and combine
    // after the loop is finished.
-  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
         it != e; ++it) {
      PHINode *RdxPhi = *it;
      PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -956,10 +1311,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
      Type *VecTy = VectorExit->getType();
  
-    // Find the reduction identity variable. The value of the enum is the
-    // identity. Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =  RdxDesc.Kind;
-    Constant *Identity = getUniformVector(IdentitySclr,
+    // Find the reduction identity value: zero for addition, or and xor;
+    // one for multiplication; -1 (all ones) for and.
+    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                            VecTy->getScalarType());
  
      // This vector is the Identity vector where the first element is the
@@ -967,7 +1321,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                      RdxDesc.StartValue, Zero);
  
-
      // Fix the vector-loop phi.
      // We created the induction variable so we know that the
      // preheader is the first entry.
@@ -1051,16 +1404,29 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    }// end of for each redux variable.
  }
  
-void SingleBlockLoopVectorizer::cleanup() {
-  // The original basic block.
+void SingleBlockLoopVectorizer::updateAnalysis() {
+  // Ask SE to forget the original loop (OrigLoop).
    SE->forgetLoop(OrigLoop);
+
+  // Update the dominator tree. The bypass block must dominate the exit block.
+  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+         "Entry does not dominate exit.");
+  // Add the four new blocks, then re-parent the scalar body and the exit.
+  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+  DEBUG(DT->verifyAnalysis());
  }
  
  bool LoopVectorizationLegality::canVectorize() {
    if (!TheLoop->getLoopPreheader()) {
      assert(false && "No preheader!!");
      DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return  false;
+    return false;
    }
  
    // We can only vectorize single basic block loops.
@@ -1074,12 +1440,6 @@ bool LoopVectorizationLegality::canVectorize() {
    BasicBlock *BB = TheLoop->getHeader();
    DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
  
-  // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
-    return false;
-  }
-
    // ScalarEvolution needs to be able to find the exit count.
    const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
    if (ExitCount == SE->getCouldNotCompute()) {
@@ -1087,7 +1447,23 @@ bool LoopVectorizationLegality::canVectorize() {
      return false;
    }
  
-  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+  // Do not loop-vectorize loops with a tiny trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  if (TC > 0u && TC < TinyTripCountThreshold) {
+    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+          "This loop is not worth vectorizing.\n");
+    return false;
+  }
+
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        <<"!\n");
  
    // Okay! We can vectorize. At this point we don't have any other mem analysis
    // which may limit our maximum vectorization factor, so just return true with
@@ -1096,23 +1472,34 @@ bool LoopVectorizationLegality::canVectorize() {
  }
  
  bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
      Instruction *I = it;
  
-    PHINode *Phi = dyn_cast<PHINode>(I);
-    if (Phi) {
+    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
        // This should not happen because the loop should be normalized.
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+      // This is the value coming from the preheader.
+      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+      // We only look at integer and pointer phi nodes.
+      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+        Inductions[Phi] = StartValue;
+        continue;
+      } else if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }
  
+      // Handle integer PHIs:
        if (isInductionVariable(Phi)) {
          if (Induction) {
            DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
@@ -1120,6 +1507,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
          }
          DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
          Induction = Phi;
+        Inductions[Phi] = StartValue;
          continue;
        }
        if (AddReductionVar(Phi, IntegerAdd)) {
@@ -1150,8 +1538,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      // We still don't handle functions.
      CallInst *CI = dyn_cast<CallInst>(I);
      if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site:"<<
-            CI->getCalledFunction()->getName() << "\n");
+      DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }
  
@@ -1179,8 +1566,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
    } // next instr.
  
    if (!Induction) {
-      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
-      return false;
+    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+    assert(getInductionVars()->size() && "No induction variables");
    }
  
    // Don't vectorize if the memory dependencies do not allow vectorization.
@@ -1197,21 +1584,16 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
    while (Worklist.size()) {
      Instruction *I = dyn_cast<Instruction>(Worklist.back());
      Worklist.pop_back();
-    // Look at instructions inside this block.
-    if (!I) continue;
-    if (I->getParent() != &BB) continue;
  
-    // Stop when reaching PHI nodes.
-    if (isa<PHINode>(I)) {
-      assert(I == Induction && "Found a uniform PHI that is not the induction");
-      break;
-    }
+    // Look at instructions inside this block. Stop when reaching PHI nodes.
+    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+      continue;
  
      // This is a known uniform.
      Uniforms.insert(I);
  
      // Insert all operands.
-    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
        Worklist.push_back(I->getOperand(i));
      }
    }
@@ -1225,6 +1607,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
    // Holds the Load and Store *instructions*.
    ValueVector Loads;
    ValueVector Stores;
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
  
    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1282,6 +1666,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      StoreInst *ST = dyn_cast<StoreInst>(*I);
      assert(ST && "Bad StoreInst");
      Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
      // If we did *not* see this pointer before, insert it to
      // the read-write list. At this phase it is only a 'write' list.
      if (Seen.insert(Ptr))
@@ -1300,10 +1690,50 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      // If the address of i is unknown (for example A[B[i]]) then we may
      // read a few words, modify, and write a few words, and some of the
      // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
        Reads.push_back(Ptr);
    }
  
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then it is safe to vectorize.
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool RT = true;
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.insert(SE, TheLoop, *I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.insert(SE, TheLoop, *I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+
+  // Check that we did not collect too many pointers or find an
+  // unsizeable pointer.
+  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.reset();
+    RT = false;
+  }
+
+  PtrRtCheck.Need = RT;
+
+  if (RT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
    // Now that the pointers are in two lists (Reads and ReadWrites), we
    // can check that there are no conflicts between each of the writes and
    // between the writes to the reads.
@@ -1318,12 +1748,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
           it != e; ++it) {
        if (!isIdentifiedObject(*it)) {
          DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return false;
+        return RT;
        }
        if (!WriteObjects.insert(*it)) {
          DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
                << **it <<"\n");
-        return false;
+        return RT;
        }
      }
      TempObjects.clear();
@@ -1336,18 +1766,20 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
           it != e; ++it) {
        if (!isIdentifiedObject(*it)) {
          DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return false;
+        return RT;
        }
        if (WriteObjects.count(*it)) {
          DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
                << **it <<"\n");
-        return false;
+        return RT;
        }
      }
      TempObjects.clear();
    }
  
-  // All is okay.
+  // It is safe to vectorize and we don't need any runtime checks.
+  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+  PtrRtCheck.reset();
    return true;
  }
  
@@ -1441,8 +1873,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
      case Instruction::Sub:
        return Kind == IntegerAdd;
      case Instruction::Mul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
        return Kind == IntegerMult;
      case Instruction::And:
        return Kind == IntegerAnd;
@@ -1454,6 +1884,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
  }
  
  bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer induction variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return false;
+
    // Check that the PHI is consecutive and starts at zero.
    const SCEV *PhiScev = SE->getSCEV(Phi);
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@@ -1462,13 +1897,27 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
      return false;
    }
    const SCEV *Step = AR->getStepRecurrence(*SE);
-  const SCEV *Start = AR->getStart();
  
-  if (!Step->isOne() || !Start->isZero()) {
-    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+  // Integer inductions need to have a stride of one.
+  if (PhiTy->isIntegerTy())
+    return Step->isOne();
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) return false;
+
+  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  return (C->getValue()->equalsInt(Size));
+}
+
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  const SCEV *PhiScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR)
      return false;
-  }
-  return true;
+  // Ptr has computable bounds only when its SCEV is an affine add-recurrence.
+  return AR->isAffine();
  }
  
  unsigned
@@ -1588,7 +2037,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                SI->getAlignment(), SI->getPointerAddressSpace());
  
        // Scalarized stores.
-      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+      if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
          unsigned Cost = 0;
          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                                ValTy);
@@ -1615,7 +2064,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                       LI->getPointerAddressSpace());
  
        // Scalarized loads.
-      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+      if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
          unsigned Cost = 0;
          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
          // The cost of inserting the loaded value into the result vector.