minor cleanups

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 59e8e5e5fd165e385e01350eaa734e8aea0b9b17..8be31dc86d2556fd01a44c06e136fff594da72a4 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -18,10 +18,14 @@
  //
  // This pass has three parts:
  // 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A helper class that checks for the legality
+// 2. LoopVectorizationLegality - A unit that checks for the legality
  //    of the vectorization.
-// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+// 3. SingleBlockLoopVectorizer - A unit that performs the actual
  //    widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+//    of vectorization. It decides on the optimal vector width, which
+//    can be one, if vectorization is not profitable.
+//
  //===----------------------------------------------------------------------===//
  //
  // The reduction-variable vectorization is based on the paper:
@@ -33,6 +37,9 @@
  // Other ideas/concepts are from:
  //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  //
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
  //===----------------------------------------------------------------------===//
  #define LV_NAME "loop-vectorize"
  #define DEBUG_TYPE LV_NAME
@@ -51,13 +58,15 @@
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Transforms/Scalar.h"
  #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/TargetTransformInfo.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"
@@ -67,13 +76,24 @@
  using namespace llvm;
  
  static cl::opt<unsigned>
-DefaultVectorizationFactor("default-loop-vectorize-width",
-                          cl::init(4), cl::Hidden,
-                          cl::desc("Set the default loop vectorization width"));
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+          cl::desc("Set the default vectorization width. Zero is autoselect."));
+
+/// We don't vectorize loops with a known constant trip count below this number.
+const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
  namespace {
  
-// Forward declaration.
+// Forward declarations.
  class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
  
  /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
  /// block to a specified vectorization factor (VF).
@@ -92,29 +112,35 @@ class LoopVectorizationLegality;
  class SingleBlockLoopVectorizer {
  public:
    /// Ctor.
-  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
-                            LPPassManager *Lpm, unsigned VecWidth):
-  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
+                            DominatorTree *Dt, DataLayout *Dl,
+                            unsigned VecWidth):
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
    Builder(Se->getContext()), Induction(0), OldInduction(0) { }
  
    // Perform the actual loop widening (vectorization).
    void vectorize(LoopVectorizationLegality *Legal) {
-    ///Create a new empty loop. Unlink the old loop and connect the new one.
+    // Create a new empty loop. Unlink the old loop and connect the new one.
      createEmptyLoop(Legal);
-    /// Widen each instruction in the old loop to a new one in the new loop.
-    /// Use the Legality module to find the induction and reduction variables.
-   vectorizeLoop(Legal);
-    // register the new loop.
-    cleanup();
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
+    vectorizeLoop(Legal);
+    // Register the new loop and update the analysis passes.
+    updateAnalysis();
   }
  
  private:
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
    /// Create an empty loop, based on the loop ranges of the old loop.
    void createEmptyLoop(LoopVectorizationLegality *Legal);
    /// Copy and widen the instructions from the old loop.
    void vectorizeLoop(LoopVectorizationLegality *Legal);
-  /// Insert the new loop to the loop hierarchy and pass manager.
-  void cleanup();
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
+  void updateAnalysis();
  
    /// This instruction is un-vectorizable. Implement it as a sequence
    /// of scalars.
@@ -145,13 +171,15 @@ private:
    typedef DenseMap<Value*, Value*> ValueMap;
  
    /// The original loop.
-  Loop *Orig;
+  Loop *OrigLoop;
    // Scev analysis to use.
    ScalarEvolution *SE;
    // Loop Info.
    LoopInfo *LI;
-  // Loop Pass Manager;
-  LPPassManager *LPM;
+  // Dominator Tree.
+  DominatorTree *DT;
+  // Data Layout.
+  DataLayout *DL;
    // The vectorization factor to use.
    unsigned VF;
  
@@ -160,6 +188,10 @@ private:
  
    // --- Vectorization state ---
  
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
    /// Middle Block between the vector and the scalar.
    BasicBlock *LoopMiddleBlock;
    ///The ExitBlock of the scalar loop.
@@ -198,12 +230,13 @@ public:
    TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
  
    /// This represents the kinds of reductions that we support.
-  /// We use the enum values to hold the 'identity' value for
-  /// each operand. This value does not change the result if applied.
    enum ReductionKind {
-    NoReduction = -1, /// Not a reduction.
-    IntegerAdd  = 0,  /// Sum of numbers.
-    IntegerMult = 1  /// Product of numbers.
+    NoReduction, /// Not a reduction.
+    IntegerAdd,  /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    IntegerOr,   /// Bitwise or logical OR of numbers.
+    IntegerAnd,  /// Bitwise or logical AND of numbers.
+    IntegerXor   /// Bitwise or logical XOR of numbers.
    };
  
    /// This POD struct holds information about reduction variables.
@@ -225,15 +258,53 @@ public:
      ReductionKind Kind;
    };
  
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck(): Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+      assert(AR && "Invalid addrec expression");
+      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+      Pointers.push_back(Ptr);
+      Starts.push_back(AR->getStart());
+      Ends.push_back(ScEnd);
+    }
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+  };
+
    /// ReductionList contains the reduction descriptors for all
    /// of the reductions that were found in the loop.
    typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
  
-  /// Returns the maximum vectorization factor that we *can* use to vectorize
-  /// this loop. This does not mean that it is profitable to vectorize this
-  /// loop, only that it is legal to do so. This may be a large number. We
-  /// can vectorize to any SIMD width below this number.
-  unsigned getLoopMaxVF();
+  /// InductionList saves induction variables and maps them to the initial
+  /// value entering the loop.
+  typedef DenseMap<PHINode*, Value*> InductionList;
+
+  /// Returns true if it is legal to vectorize this loop.
+  /// This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so.
+  bool canVectorize();
  
    /// Returns the Induction variable.
    PHINode *getInduction() {return Induction;}
@@ -241,12 +312,23 @@ public:
    /// Returns the reduction variables found in the loop.
    ReductionList *getReductionVars() { return &Reductions; }
  
-  /// Check if the pointer returned by this GEP is consecutive
-  /// when the index is vectorized. This happens when the last
-  /// index of the GEP is consecutive, like the induction variable.
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Check if this pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or that the
+  /// pointer itself is an induction variable.
    /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutiveGep(Value *Ptr);
+  bool isConsecutivePtr(Value *Ptr);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
  
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
  private:
    /// Check if a single basic block loop is vectorizable.
    /// At this point we know that this is a loop with a constant trip count
@@ -259,10 +341,6 @@ private:
    /// Returns true if BB is vectorizable
    bool canVectorizeMemory(BasicBlock &BB);
  
-  // Check if a pointer value is known to be disjoint.
-  // Example: Alloca, Global, NoAlias.
-  bool isIdentifiedSafeObject(Value* Val);
-
    /// Returns True, if 'Phi' is the kind of reduction variable for type
    /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
    bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
@@ -271,6 +349,8 @@ private:
    bool isReductionInstr(Instruction *I, ReductionKind Kind);
    /// Returns True, if 'Phi' is an induction variable.
    bool isInductionVariable(PHINode *Phi);
+  /// Returns true if we can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
  
    /// The loop that we evaluate.
    Loop *TheLoop;
@@ -281,13 +361,72 @@ private:
  
    //  ---  vectorization state --- //
  
-  /// Holds the induction variable.
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
    PHINode *Induction;
    /// Holds the reduction variables.
    ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
    /// Allowed outside users. This holds the reduction
    /// vars which can be accessed from outside the loop.
    SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because
+/// of a number of reasons. In this class we mainly attempt to predict
+/// the expected speedup/slowdowns due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+  /// C'tor.
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
+                             LoopVectorizationLegality *Leg,
+                             const VectorTargetTransformInfo *Vtti):
+  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+
+  /// Returns the most profitable vectorization factor for the loop that is
+  /// smaller than or equal to the VF argument. This method checks every power
+  /// of two up to VF.
+  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
+
+private:
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width.
+  unsigned expectedCost(unsigned VF);
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+
+  /// Vectorization legality.
+  LoopVectorizationLegality *Legal;
+  /// Vector target information.
+  const VectorTargetTransformInfo *VTTI;
  };
  
  struct LoopVectorize : public LoopPass {
@@ -300,6 +439,8 @@ struct LoopVectorize : public LoopPass {
    ScalarEvolution *SE;
    DataLayout *DL;
    LoopInfo *LI;
+  TargetTransformInfo *TTI;
+  DominatorTree *DT;
  
    virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
      // We only vectorize innermost loops.
@@ -309,25 +450,45 @@ struct LoopVectorize : public LoopPass {
      SE = &getAnalysis<ScalarEvolution>();
      DL = getAnalysisIfAvailable<DataLayout>();
      LI = &getAnalysis<LoopInfo>();
+    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
  
      DEBUG(dbgs() << "LV: Checking a loop in \"" <<
            L->getHeader()->getParent()->getName() << "\"\n");
  
      // Check if it is legal to vectorize the loop.
      LoopVectorizationLegality LVL(L, SE, DL);
-    unsigned MaxVF = LVL.getLoopMaxVF();
-
-    // Check that we can vectorize this loop using the chosen vectorization
-    // width.
-    if (MaxVF < DefaultVectorizationFactor) {
-      DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+    if (!LVL.canVectorize()) {
+      DEBUG(dbgs() << "LV: Not vectorizing.\n");
        return false;
      }
  
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+    // Select the preferred vectorization factor.
+    unsigned VF = 1;
+    if (VectorizationFactor == 0) {
+      const VectorTargetTransformInfo *VTTI = 0;
+      if (TTI)
+        VTTI = TTI->getVectorTargetTransformInfo();
+      // Use the cost model.
+      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+      VF = CM.findBestVectorizationFactor();
+
+      if (VF == 1) {
+        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+        return false;
+      }
+
+    } else {
+      // Use the user command flag.
+      VF = VectorizationFactor;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+          "\n");
  
      // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF);
      LB.vectorize(&LVL);
  
      DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -340,19 +501,29 @@ struct LoopVectorize : public LoopPass {
      AU.addRequiredID(LCSSAID);
      AU.addRequired<LoopInfo>();
      AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
    }
  
  };
  
  Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Instructions that access the old induction variable
-  // actually want to get the new one.
-  if (V == OldInduction)
-    V = Induction;
    // Create the types.
    LLVMContext &C = V->getContext();
    Type *VTy = VectorType::get(V->getType(), VF);
    Type *I32 = IntegerType::getInt32Ty(C);
+
+  // Save the current insertion location.
+  Instruction *Loc = Builder.GetInsertPoint();
+
+  // We need to place the broadcast of invariant variables outside the loop.
+  bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);
+
+  // Place the code for broadcasting invariant variables in the new preheader.
+  if (Invariant)
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
    Constant *Zero = ConstantInt::get(I32, 0);
    Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
    Value *UndefVal = UndefValue::get(VTy);
@@ -361,10 +532,11 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                               "broadcast");
-  // We are accessing the induction variable. Make sure to promote the
-  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
-  if (V == Induction)
-    return getConsecutiveVector(Shuf);
+
+  // Restore the builder insertion point.
+  if (Invariant)
+    Builder.SetInsertPoint(Loc);
+
    return Shuf;
  }
  
@@ -388,8 +560,15 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
    return Builder.CreateAdd(Val, Cv, "induction");
  }
  
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
-  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+  // If this pointer is an induction variable, it is consecutive.
+  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+  if (Phi && getInductionVars()->count(Phi))
+    return true;
+
+  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
    if (!Gep)
      return false;
  
@@ -401,7 +580,7 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return false;
  
-  // We can emit wide load/stores only of the last index is the induction
+  // We can emit wide load/stores only if the last index is the induction
    // variable.
    const SCEV *Last = SE->getSCEV(LastIndex);
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -416,7 +595,12 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
    return false;
  }
  
+bool LoopVectorizationLegality::isUniform(Value *V) {
+  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
  Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(V != Induction && "The new induction variable should not be used.");
    assert(!V->getType()->isVectorTy() && "Can't widen a vector");
    // If we saved a vectorized copy of V, use it.
    Value *&MapEntry = WidenMap[V];
@@ -431,13 +615,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  
  Constant*
  SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  SmallVector<Constant*, 8> Indices;
-  // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val));
-
-  // Add the consecutive indices to the vector value.
-  return ConstantVector::get(Indices);
+  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
  }
  
  void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
@@ -451,7 +629,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  
      // If we are accessing the old induction variable, use the new one.
      if (SrcOp == OldInduction) {
-      Params.push_back(getBroadcastInstrs(Induction));
+      Params.push_back(getVectorValue(SrcOp));
        continue;
      }
  
@@ -510,7 +688,69 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
      WidenMap[Instr] = VecResults;
  }
  
-void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+Value*
+SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                           Instruction *Loc) {
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+
+  if (!PtrRtCheck->Need)
+    return NULL;
+
+  Value *MemoryRuntimeCheck = 0;
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<Value* , 2> Starts;
+  SmallVector<Value* , 2> Ends;
+
+  SCEVExpander Exp(*SE, "induction");
+
+  // Use this type for pointer arithmetic.
+  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+                                       PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[i], Ends[j], "bound0", Loc);
+      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[j], Ends[i], "bound1", Loc);
+      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                 "found.conflict", Loc);
+      if (MemoryRuntimeCheck)
+        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+                                                    MemoryRuntimeCheck,
+                                                    IsConflict,
+                                                    "conflict.rdx", Loc);
+      else
+        MemoryRuntimeCheck = IsConflict;
+
+    }
+  }
+
+  return MemoryRuntimeCheck;
+}
+
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    /*
     In this function we generate a new loop. The new loop will contain
     the vectorized instructions while the old loop will continue to run the
@@ -540,41 +780,21 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
     ...
     */
  
-  // This is the original scalar-loop preheader.
-  BasicBlock *BypassBlock = Orig->getLoopPreheader();
-  BasicBlock *ExitBlock = Orig->getExitBlock();
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
    assert(ExitBlock && "Must have an exit block");
  
-  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
-  assert(BypassBlock && "Invalid loop structure");
-
-  BasicBlock *VectorPH =
-      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
-  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                 "vector.body");
-
-  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                  "middle.block");
-  BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-  // Find the induction variable.
-  BasicBlock *OldBasicBlock = Orig->getHeader();
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
    OldInduction = Legal->getInduction();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
-
-  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
-  // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
-
-  // Generate the induction variable.
-  Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  Constant *Zero = ConstantInt::get(IdxTy, 0);
-  Constant *Step = ConstantInt::get(IdxTy, VF);
+  Type *IdxTy = OldInduction ? OldInduction->getType() :
+    DL->getIntPtrType(SE->getContext());
  
    // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
    assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
  
    // Get the total trip count from the count by adding 1.
@@ -584,35 +804,148 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    // Expand the trip count and place the new instructions in the preheader.
    // Notice that the pre-header does not change, only the loop body.
    SCEVExpander Exp(*SE, "induction");
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
+  // The loop index does not have to start at Zero. Find the original start
+  // value from the induction PHI node. If we don't have an induction variable
+  // then we know that it starts at zero.
+  Value *StartIdx = OldInduction ?
+    OldInduction->getIncomingValueForBlock(BypassBlock):
+    ConstantInt::get(IdxTy, 0);
+
+  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
+  assert(BypassBlock && "Invalid loop structure");
+
+  // Generate the code that checks at runtime if arrays overlap.
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+                                              BypassBlock->getTerminator());
+
+  // Split the single block loop into the two loop structure described above.
+  BasicBlock *VectorPH =
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody =
+    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  BasicBlock *MiddleBlock =
+    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+  BasicBlock *ScalarPH =
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+  // This is the location in which we add all of the logic for bypassing
+  // the new vector loop.
    Instruction *Loc = BypassBlock->getTerminator();
  
+  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+  // inside the loop.
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Generate the induction variable.
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
+  Constant *Step = ConstantInt::get(IdxTy, VF);
+
    // We may need to extend the index in case there is a type mismatch.
    // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != Induction->getType())
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+  if (Count->getType() != IdxTy) {
+    // The exit count can be of pointer type. Convert it to the correct
+    // integer type.
+    if (ExitCount->getType()->isPointerTy())
+      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+    else
+      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+  }
+
+  // Add the start index to the loop count to get the new end index.
+  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
  
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
    // Now we need to generate the expression for N - (N % VF), which is
    // the part that the vectorized body will execute.
    Constant *CIVF = ConstantInt::get(IdxTy, VF);
    Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
    Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+                                                     "end.idx.rnd.down", Loc);
  
-  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop.
    Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               IdxEndRoundDown,
+                               StartIdx,
                                 "cmp.zero", Loc);
+
+  // If we are using memory runtime checks, include them in.
+  if (MemoryRuntimeCheck)
+    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+                                 "CntOrMem", Loc);
+
    BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
    // Remove the old terminator.
    Loc->eraseFromParent();
  
+  // We are going to resume the execution of the scalar loop.
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original start
+  // value.
+
+  // This variable saves the new starting index for the scalar loop.
+  PHINode *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                           MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    if (OrigPhi->getType()->isIntegerTy()) {
+      // Handle the integer induction counter:
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
+      // For pointer induction variables, calculate the offset using
+      // the end index.
+      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+                                           "ptr.ind.end",
+                                           BypassBlock->getTerminator());
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body counter (PHI node).
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // If we are generating a new induction variable then we also need to
+  // generate the code that calculates the exit value. This value is not
+  // simply the end of the counter because we may skip the vectorized body
+  // in case of a runtime check.
+  if (!OldInduction){
+    assert(!ResumeIndex && "Unexpected resume value found");
+    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+                                  MiddleBlock->getTerminator());
+    ResumeIndex->addIncoming(StartIdx, BypassBlock);
+    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  }
+
+  // Make sure that we found the index where scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
+
    // Add a check in the middle block to see if we have completed
    // all of the iterations in the first vector loop.
    // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                CountRoundDown, "cmp.n",
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+                                ResumeIndex, "cmp.n",
                                  MiddleBlock->getTerminator());
  
    BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -621,36 +954,37 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
  
    // Create i+1 and fill the PHINode.
    Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
-  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(StartIdx, VectorPH);
    Induction->addIncoming(NextIdx, VecBody);
    // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
    Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
  
    // Now we have two terminators. Remove the old one from the block.
    VecBody->getTerminator()->eraseFromParent();
  
-  // Fix the scalar body iteration count.
-  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
-
    // Get ready to start creating new instructions into the vectorized body.
    Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
  
-  // Register the new loop.
+  // Create and register the new vector loop.
    Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, Orig->getParentLoop());
+  Loop *ParentLoop = OrigLoop->getParentLoop();
  
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
-  Loop *ParentLoop = Orig->getParentLoop();
+  // Insert the new loop into the loop nest and register the new basic blocks.
    if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
      ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
      ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
      ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
    }
  
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
    // Save the state.
+  LoopVectorPreHeader = VectorPH;
+  LoopScalarPreHeader = ScalarPH;
    LoopMiddleBlock = MiddleBlock;
    LoopExitBlock = ExitBlock;
    LoopVectorBody = VecBody;
@@ -658,22 +992,50 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    LoopBypassBlock = BypassBlock;
  }
  
+/// Returns the identity (neutral) element of reduction kind K.
+/// NOTE(review): AND yields UINT_MAX; confirm it is widened to all-ones.
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+  switch (K) {
+  case LoopVectorizationLegality::IntegerXor:
+  case LoopVectorizationLegality::IntegerAdd:
+  case LoopVectorizationLegality::IntegerOr:
+    // Adding, XOR-ing or OR-ing zero with a number does not change it.
+    return 0;
+  case LoopVectorizationLegality::IntegerMult:
+    // Multiplying a number by 1 does not change it.
+    return 1;
+  case LoopVectorizationLegality::IntegerAnd:
+    // AND-ing a number with an all-ones value does not change it.
+    return -1; // Converts to the all-ones unsigned value (UINT_MAX).
+  default:
+    llvm_unreachable("Unknown reduction kind");
+  }
+}
+
  void
  SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  //===------------------------------------------------===//
+  //
+  // Notice: any optimization or new instruction that goes
+  // into the code below should also be implemented in
+  // the cost-model.
+  //
+  //===------------------------------------------------===//
    typedef SmallVector<PHINode*, 4> PhiVector;
-  BasicBlock &BB = *Orig->getHeader();
+  BasicBlock &BB = *OrigLoop->getHeader();
    Constant *Zero = ConstantInt::get(
      IntegerType::getInt32Ty(BB.getContext()), 0);
  
    // In order to support reduction variables we need to be able to vectorize
    // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
-  // steages. First, we create a new vector PHI node with no incoming edges.
+  // stages. First, we create a new vector PHI node with no incoming edges.
    // We use this value when we vectorize all of the instructions that use the
    // PHI. Next, after all of the instructions in the block are complete we
    // add the new incoming edges to the PHI. At this point all of the
    // instructions in the basic block are vectorized, so we can use them to
    // construct the PHI.
-  PhiVector PHIsToFix;
+  PhiVector RdxPHIsToFix;
  
    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -686,15 +1048,58 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          continue;
        case Instruction::PHI:{
          PHINode* P = cast<PHINode>(Inst);
-        // Special handling for the induction var.
-        if (OldInduction == Inst)
+        // Handle reduction variables:
+        if (Legal->getReductionVars()->count(P)) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(Inst->getType(), VF);
+          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
+                                  LoopVectorBody->getFirstInsertionPt());
+          RdxPHIsToFix.push_back(P);
            continue;
-        // This is phase one of vectorizing PHIs.
-        // This has to be a reduction variable.
-        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
-        Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
-        PHIsToFix.push_back(P);
+        }
+
+        // This PHINode must be an induction variable.
+        // Make sure that we know about it.
+        assert(Legal->getInductionVars()->count(P) &&
+               "Not an induction variable");
+
+        if (P->getType()->isIntegerTy()) {
+          assert(P == OldInduction && "Unexpected PHI");
+          Value *Broadcasted = getBroadcastInstrs(Induction);
+          // After broadcasting the induction variable we need to make the
+          // vector consecutive by adding 0, 1, 2 ...
+          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+           
+          WidenMap[OldInduction] = ConsecutiveInduction;
+          continue;
+        }
+
+        // Handle pointer inductions.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+        Value *StartIdx = OldInduction ?
+          Legal->getInductionVars()->lookup(OldInduction) :
+          ConstantInt::get(Induction->getType(), 0);
+
+        // This is the pointer value coming into the loop.
+        Value *StartPtr = Legal->getInductionVars()->lookup(P);
+
+        // This is the normalized GEP that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // This is the vector of results. Notice that we don't generate vector
+        // geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
+        }
+
+        WidenMap[Inst] = VecVal;
          continue;
        }
        case Instruction::Add:
@@ -719,8 +1124,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
          Value *A = getVectorValue(Inst->getOperand(0));
          Value *B = getVectorValue(Inst->getOperand(1));
+
          // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = V;
+
+        // Update the NSW, NUW and Exact flags.
+        BinaryOperator *VecOp = cast<BinaryOperator>(V);
+        if (isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
          break;
        }
        case Instruction::Select: {
@@ -728,7 +1144,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          // If the selector is loop invariant we can create a select
          // instruction with a scalar condition. Otherwise, use vector-select.
          Value *Cond = Inst->getOperand(0);
-        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), Orig);
+        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
  
          // The condition can be loop invariant  but still defined inside the
          // loop. This means that we can't just use the original 'cond' value.
@@ -764,23 +1180,34 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
          Value *Ptr = SI->getPointerOperand();
          unsigned Alignment = SI->getAlignment();
+
+        assert(!Legal->isUniform(Ptr) &&
+               "We do not allow storing to uniform addresses");
+
          GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
          // This store does not use GEPs.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutivePtr(Ptr)) {
            scalarizeInstruction(Inst);
            break;
          }
  
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
-
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
          Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
          Value *Val = getVectorValue(SI->getValueOperand());
          Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
@@ -794,22 +1221,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
          unsigned Alignment = LI->getAlignment();
          GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  
-        // We don't have a gep. Scalarize the load.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        // If the pointer is loop invariant or if it is non consecutive,
+        // scalarize the load.
+        bool Con = Legal->isConsecutivePtr(Ptr);
+        if (Legal->isUniform(Ptr) || !Con) {
            scalarizeInstruction(Inst);
            break;
          }
  
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
  
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
          Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
          LI = Builder.CreateLoad(Ptr);
          LI->setAlignment(Alignment);
@@ -853,7 +1289,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    // Create the 'reduced' values for each of the induction vars.
    // The reduced values are the vector values that we scalarize and combine
    // after the loop is finished.
-  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
         it != e; ++it) {
      PHINode *RdxPhi = *it;
      PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -875,10 +1311,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
      Type *VecTy = VectorExit->getType();
  
-    // Find the reduction identity variable. The value of the enum is the
-    // identity. Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =  RdxDesc.Kind;
-    Constant *Identity = getUniformVector(IdentitySclr,
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                            VecTy->getScalarType());
  
      // This vector is the Identity vector where the first element is the
@@ -886,7 +1321,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                      RdxDesc.StartValue, Zero);
  
-
      // Fix the vector-loop phi.
      // We created the induction variable so we know that the
      // preheader is the first entry.
@@ -914,14 +1348,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      // Extract the first scalar.
      Value *Scalar0 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
-    // Extract and sum the remaining vector elements.
+    // Extract and reduce the remaining vector elements.
      for (unsigned i=1; i < VF; ++i) {
        Value *Scalar1 =
          Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
-      if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
-        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
-      } else {
-        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+      switch (RdxDesc.Kind) {
+        case LoopVectorizationLegality::IntegerAdd:
+          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerMult:
+          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerOr:
+          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerAnd:
+          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerXor:
+          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+          break;
+        default:
+          llvm_unreachable("Unknown reduction operation");
        }
      }
  
@@ -956,68 +1404,98 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    }// end of for each redux variable.
  }
  
-void SingleBlockLoopVectorizer::cleanup() {
-  // The original basic block.
-  SE->forgetLoop(Orig);
+void SingleBlockLoopVectorizer::updateAnalysis() {
+  // Drop what SE has recorded about the original loop (not just one block).
+  SE->forgetLoop(OrigLoop);
+
+  // Update the dominator tree information for the blocks this pass created.
+  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+         "Entry does not dominate exit.");
+  // Each new block is added with its immediate dominator as second argument.
+  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+  DEBUG(DT->verifyAnalysis());
  }
  
-unsigned LoopVectorizationLegality::getLoopMaxVF() {
-  if (!TheLoop->getLoopPreheader()) {
-    assert(false && "No preheader!!");
-    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return  1;
-  }
+bool LoopVectorizationLegality::canVectorize() {
+  assert(TheLoop->getLoopPreheader() && "No preheader!!");
  
    // We can only vectorize single basic block loops.
    unsigned NumBlocks = TheLoop->getNumBlocks();
    if (NumBlocks != 1) {
      DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
-    return 1;
+    return false;
    }
  
    // We need to have a loop header.
    BasicBlock *BB = TheLoop->getHeader();
    DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
  
-  // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
-    return 1;
-  }
-
    // ScalarEvolution needs to be able to find the exit count.
    const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
    if (ExitCount == SE->getCouldNotCompute()) {
      DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
-    return 1;
+    return false;
+  }
+
+  // Skip loops with a tiny, known trip count (TC == 0 is let through).
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  if (TC > 0u && TC < TinyTripCountThreshold) {
+    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+          "This loop is not worth vectorizing.\n");
+    return false;
+  }
+
+  // Check each instruction of the header block for vectorization hazards.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return false;
    }
  
-  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        <<"!\n");
  
    // Okay! We can vectorize. At this point we don't have any other mem analysis
-  // which may limit our maximum vectorization factor, so just return the
-  // maximum SIMD size.
-  return DefaultVectorizationFactor;
+  // which may limit our maximum vectorization factor, so just return true with
+  // no restrictions.
+  return true;
  }
  
  bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
      Instruction *I = it;
  
-    PHINode *Phi = dyn_cast<PHINode>(I);
-    if (Phi) {
+    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
        // This should not happen because the loop should be normalized.
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+      // This is the value coming from the preheader.
+      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+      // We only look at integer and pointer phi nodes.
+      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+        Inductions[Phi] = StartValue;
+        continue;
+      } else if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }
  
+      // Handle integer PHIs:
        if (isInductionVariable(Phi)) {
          if (Induction) {
            DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
@@ -1025,6 +1503,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
          }
          DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
          Induction = Phi;
+        Inductions[Phi] = StartValue;
          continue;
        }
        if (AddReductionVar(Phi, IntegerAdd)) {
@@ -1032,7 +1511,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
          continue;
        }
        if (AddReductionVar(Phi, IntegerMult)) {
-        DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerOr)) {
+        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerAnd)) {
+        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerXor)) {
+        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }
  
@@ -1043,8 +1534,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      // We still don't handle functions.
      CallInst *CI = dyn_cast<CallInst>(I);
      if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site:"<<
-            CI->getCalledFunction()->getName() << "\n");
+      DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }
  
@@ -1072,13 +1562,39 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
    } // next instr.
  
    if (!Induction) {
-      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
-      return false;
+    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+    assert(getInductionVars()->size() && "No induction variables");
+  }
+
+  // Don't vectorize if the memory dependencies do not allow vectorization.
+  if (!canVectorizeMemory(BB))
+    return false;
+
+  // We now know that the loop is vectorizable!
+  // Collect variables that will remain uniform after vectorization.
+  std::vector<Value*> Worklist;
+
+  // Start with the conditional branch and walk up the block.
+  Worklist.push_back(BB.getTerminator()->getOperand(0));
+
+  while (Worklist.size()) {
+    Instruction *I = dyn_cast<Instruction>(Worklist.back());
+    Worklist.pop_back();
+
+    // Look at instructions inside this block. Stop when reaching PHI nodes.
+    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+      continue;
+
+    // This is a known uniform.
+    Uniforms.insert(I);
+
+    // Insert all operands.
+    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
+      Worklist.push_back(I->getOperand(i));
+    }
    }
  
-  // If the memory dependencies do not prevent us from
-  // vectorizing, then vectorize.
-  return canVectorizeMemory(BB);
+  return true;
  }
  
  bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
@@ -1087,6 +1603,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
    // Holds the Load and Store *instructions*.
    ValueVector Loads;
    ValueVector Stores;
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
  
    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1144,6 +1662,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      StoreInst *ST = dyn_cast<StoreInst>(*I);
      assert(ST && "Bad StoreInst");
      Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
      // If we did *not* see this pointer before, insert it to
      // the read-write list. At this phase it is only a 'write' list.
      if (Seen.insert(Ptr))
@@ -1162,10 +1686,50 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      // If the address of i is unknown (for example A[B[i]]) then we may
      // read a few words, modify, and write a few words, and some of the
      // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
        Reads.push_back(Ptr);
    }
  
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then it is safe to vectorize.
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool RT = true;
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.insert(SE, TheLoop, *I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.insert(SE, TheLoop, *I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+
+  // Check that we did not collect too many pointers or find an
+  // unsizeable pointer.
+  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.reset();
+    RT = false;
+  }
+
+  PtrRtCheck.Need = RT;
+
+  if (RT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
    // Now that the pointers are in two lists (Reads and ReadWrites), we
    // can check that there are no conflicts between each of the writes and
    // between the writes to the reads.
@@ -1178,14 +1742,14 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      GetUnderlyingObjects(*I, TempObjects, DL);
      for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
           it != e; ++it) {
-      if (!isIdentifiedSafeObject(*it)) {
+      if (!isIdentifiedObject(*it)) {
          DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return false;
+        return RT;
        }
        if (!WriteObjects.insert(*it)) {
          DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
                << **it <<"\n");
-        return false;
+        return RT;
        }
      }
      TempObjects.clear();
@@ -1196,36 +1760,25 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
      GetUnderlyingObjects(*I, TempObjects, DL);
      for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
           it != e; ++it) {
-      if (!isIdentifiedSafeObject(*it)) {
+      if (!isIdentifiedObject(*it)) {
          DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return false;
+        return RT;
        }
        if (WriteObjects.count(*it)) {
          DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
                << **it <<"\n");
-        return false;
+        return RT;
        }
      }
      TempObjects.clear();
    }
  
-  // All is okay.
+  // It is safe to vectorize and we don't need any runtime checks.
+  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+  PtrRtCheck.reset();
    return true;
  }
  
-/// Checks if the value is a Global variable or if it is an Arguments
-/// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
-  assert(Val && "Invalid value");
-  if (isa<GlobalValue>(Val))
-    return true;
-  if (isa<AllocaInst>(Val))
-    return true;
-  if (Argument *A = dyn_cast<Argument>(Val))
-    return A->hasNoAliasAttr();
-  return false;
-}
-
  bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                  ReductionKind Kind) {
    if (Phi->getNumIncomingValues() != 2)
@@ -1316,13 +1869,22 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
      case Instruction::Sub:
        return Kind == IntegerAdd;
      case Instruction::Mul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
        return Kind == IntegerMult;
+    case Instruction::And:
+      return Kind == IntegerAnd;
+    case Instruction::Or:
+      return Kind == IntegerOr;
+    case Instruction::Xor:
+      return Kind == IntegerXor;
      }
  }
  
  bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer induction variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return false;
+
    // Check that the PHI is consecutive and starts at zero.
    const SCEV *PhiScev = SE->getSCEV(Phi);
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@@ -1331,13 +1893,234 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
      return false;
    }
    const SCEV *Step = AR->getStepRecurrence(*SE);
-  const SCEV *Start = AR->getStart();
  
-  if (!Step->isOne() || !Start->isZero()) {
-    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+  // Integer inductions need to have a stride of one.
+  if (PhiTy->isIntegerTy())
+    return Step->isOne();
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) return false;
+
+  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  return (C->getValue()->equalsInt(Size));
+}
+
+/// Returns true when the scalar-evolution expression of \p Ptr is an
+/// add-recurrence that reports itself as affine; false otherwise.
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  // Anything that is not an add-recurrence is rejected outright.
+  if (const SCEVAddRecExpr *Rec =
+          dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)))
+    return Rec->isAffine();
+
+  return false;
+}
+
+/// Chooses the vectorization width with the lowest estimated cost per scalar
+/// iteration. Candidates are 1 and every power of two from 2 up to and
+/// including \p VF. Returns 1 when no vector target information is available.
+unsigned
+LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+  // Without target cost information the widths cannot be compared.
+  if (!VTTI) {
+    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
+    return 1;
    }
-  return true;
+
+  // The scalar loop is the initial candidate that every width has to beat.
+  unsigned BestWidth = 1;
+  float BestCost = expectedCost(1);
+  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)BestCost << ".\n");
+
+  unsigned Candidate = 2;
+  while (Candidate <= VF) {
+    // The vector loop runs fewer iterations than the scalar loop, so the
+    // cost of one vector iteration is divided by the number of elements it
+    // covers before comparing.
+    float PerIterCost = expectedCost(Candidate) / (float)Candidate;
+    DEBUG(dbgs() << "LV: Vector loop of width "<< Candidate << " costs: " <<
+          (int)PerIterCost << ".\n");
+    // Strict comparison: on a tie the width found earlier is kept.
+    if (PerIterCost < BestCost) {
+      BestCost = PerIterCost;
+      BestWidth = Candidate;
+    }
+    Candidate *= 2;
+  }
+
+  DEBUG(dbgs() << "LV: Selecting VF = : "<< BestWidth << ".\n");
+  return BestWidth;
+}
+
+/// Sums getInstructionCost() over every instruction of the loop header block
+/// for the vectorization width \p VF and returns the total.
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+  // We can only estimate the cost of single basic block loops.
+  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+
+  unsigned Total = 0;
+  BasicBlock *Header = TheLoop->getHeader();
+
+  // Visit the instructions of the old loop in order, accumulating one
+  // estimate per instruction.
+  BasicBlock::iterator Cur = Header->begin(), End = Header->end();
+  while (Cur != End) {
+    Instruction *Inst = Cur;
+    unsigned InstCost = getInstructionCost(Inst, VF);
+    Total += InstCost;
+    DEBUG(dbgs() << "LV: Found an estimated cost of "<< InstCost
+          <<" for VF "<< VF << " For instruction: "<< *Inst << "\n");
+    ++Cur;
+  }
+
+  return Total;
+}
+
+/// Estimates the cost of instruction \p I when the loop is vectorized by a
+/// factor of \p VF, by querying the vector target transformation info (VTTI).
+///
+/// Instructions that Legal reports as uniform after vectorization are costed
+/// at a width of one. Opcodes without a dedicated case are treated as
+/// scalarized: VF copies of the scalar instruction plus the element inserts
+/// and extracts around them.
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+  assert(VTTI && "Invalid vector target transformation info");
+
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (Legal->isUniformAfterVectorization(I))
+    VF = 1;
+
+  Type *RetTy = I->getType();
+  // Widened result type; ToVectorTy leaves it equal to RetTy for void
+  // results or when VF == 1.
+  Type *VectorTy = ToVectorTy(RetTy, VF);
+
+  // TODO: We need to estimate the cost of intrinsic calls.
+  switch (I->getOpcode()) {
+    case Instruction::GetElementPtr:
+      // We mark this instruction as zero-cost because scalar GEPs are usually
+      // lowered to the instruction addressing mode. At the moment we don't
+      // generate vector geps.
+      return 0;
+    case Instruction::Br: {
+      return VTTI->getCFInstrCost(I->getOpcode());
+    }
+    case Instruction::PHI:
+      return 0;
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+    case Instruction::Select: {
+      SelectInst *SI = cast<SelectInst>(I);
+      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+      Type *CondTy = SI->getCondition()->getType();
+      // A loop-invariant condition stays scalar and selects between whole
+      // vectors; only a condition that varies inside the loop is widened
+      // together with the selected values. ToVectorTy keeps the type scalar
+      // when VF == 1, matching VectorTy.
+      if (!ScalarCond)
+        CondTy = ToVectorTy(CondTy, VF);
+
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+    }
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Compares are costed on the type of their operands, not their result.
+      Type *ValTy = I->getOperand(0)->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+    }
+    case Instruction::Store: {
+      // Stores are costed on the type of the stored value.
+      StoreInst *SI = cast<StoreInst>(I);
+      Type *ValTy = SI->getValueOperand()->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                              SI->getAlignment(), SI->getPointerAddressSpace());
+
+      // Scalarized stores: the pointer is not consecutive.
+      if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+        unsigned Cost = 0;
+        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                              ValTy);
+        // The cost of extracting from the value vector.
+        Cost += VF * (ExtCost);
+        // The cost of the scalar stores.
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           ValTy->getScalarType(),
+                                           SI->getAlignment(),
+                                           SI->getPointerAddressSpace());
+        return Cost;
+      }
+
+      // Wide stores.
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+                                   SI->getPointerAddressSpace());
+    }
+    case Instruction::Load: {
+      LoadInst *LI = cast<LoadInst>(I);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+                                     LI->getAlignment(),
+                                     LI->getPointerAddressSpace());
+
+      // Scalarized loads: the pointer is not consecutive.
+      if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
+        unsigned Cost = 0;
+        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+        // The cost of inserting the loaded value into the result vector.
+        Cost += VF * (InCost);
+        // The cost of the scalar loads.
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           RetTy->getScalarType(),
+                                           LI->getAlignment(),
+                                           LI->getPointerAddressSpace());
+        return Cost;
+      }
+
+      // Wide loads.
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+                                   LI->getPointerAddressSpace());
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      // Casts are costed from the widened source type to the widened result.
+      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    }
+    default: {
+      // We are scalarizing the instruction. Return the cost of the scalar
+      // instruction, plus the cost of insert and extract into vector
+      // elements, times the vector width.
+      unsigned Cost = 0;
+
+      // An instruction without a result has nothing to insert.
+      bool IsVoid = RetTy->isVoidTy();
+
+      unsigned InsCost = (IsVoid ? 0 :
+                          VTTI->getInstrCost(Instruction::InsertElement,
+                                             VectorTy));
+
+      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                            VectorTy);
+
+      // The cost of inserting the results plus extracting each one of the
+      // operands.
+      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+      // The cost of executing VF copies of the scalar instruction.
+      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+      return Cost;
+    }
+  }// end of switch.
+}
+
+/// Widens \p Scalar to a vector type of \p VF elements. A void type, or a
+/// width of one, is returned unchanged.
+Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
+  // Only a non-void type with a width other than one is widened.
+  if (!Scalar->isVoidTy() && VF != 1)
+    return VectorType::get(Scalar, VF);
+  return Scalar;
  }
  
  } // namespace