Re-sort all of the includes with ./utils/sort_includes.py so that

[oota-llvm.git] / lib / Transforms / Vectorize / LoopVectorize.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 4833aecc17b5f6115b56defa4e9d630695d7feb0..70c18edf55a9d0ede07d2951a4efada9173b1d79 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,6 +48,7 @@
  #include "llvm/Transforms/Vectorize.h"
  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
  #include "llvm/ADT/MapVector.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallPtrSet.h"
@@ -80,8 +81,8 @@
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
  #include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetLibraryInfo.h"
  #include "llvm/Transforms/Scalar.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -445,7 +446,7 @@ public:
      MRK_FloatMax
    };
  
-  /// This POD struct holds information about reduction variables.
+  /// This struct holds information about reduction variables.
    struct ReductionDescriptor {
      ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
        Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
@@ -482,8 +483,8 @@ public:
      MinMaxReductionKind MinMaxKind;
    };
  
-  // This POD struct holds information about the memory runtime legality
-  // check that a group of pointers do not overlap.
+  /// This struct holds information about the memory runtime legality
+  /// check that a group of pointers do not overlap.
    struct RuntimePointerCheck {
      RuntimePointerCheck() : Need(false) {}
  
@@ -493,6 +494,8 @@ public:
        Pointers.clear();
        Starts.clear();
        Ends.clear();
+      IsWritePtr.clear();
+      DependencySetId.clear();
      }
  
      /// Insert a pointer and calculate the start and end SCEVs.
@@ -514,7 +517,7 @@ public:
      SmallVector<unsigned, 2> DependencySetId;
    };
  
-  /// A POD for saving information about induction variables.
+  /// A struct for saving information about induction variables.
    struct InductionInfo {
      InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
      InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
@@ -561,7 +564,7 @@ public:
    /// pointer itself is an induction variable.
    /// This check allows us to vectorize A[idx] into a wide load/store.
    /// Returns:
-  /// 0 - Stride is unknown or non consecutive.
+  /// 0 - Stride is unknown or non-consecutive.
    /// 1 - Address is consecutive.
    /// -1 - Address is consecutive, and decreasing.
    int isConsecutivePtr(Value *Ptr);
@@ -760,10 +763,13 @@ struct LoopVectorizeHints {
    unsigned Width;
    /// Vectorization unroll factor.
    unsigned Unroll;
+  /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled)
+  int Force;
  
    LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
    : Width(VectorizationFactor)
    , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
+  , Force(-1)
    , LoopID(L->getLoopID()) {
      getHints(L);
      // The command line options override any loop metadata except for when
@@ -801,6 +807,7 @@ struct LoopVectorizeHints {
          Vals.push_back(LoopID->getOperand(i));
  
      Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
  
      MDNode *NewLoopID = MDNode::get(Context, Vals);
      // Set operand 0 to refer to the loop id itself.
@@ -867,14 +874,19 @@ private:
        if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
          Width = Val;
        else
-        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata");
+        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
      } else if (Hint == "unroll") {
        if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
          Unroll = Val;
        else
-        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata");
+        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+    } else if (Hint == "enable") {
+      if (C->getBitWidth() == 1)
+        Force = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
      } else {
-      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
+      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
      }
    }
  };
@@ -884,8 +896,10 @@ struct LoopVectorize : public LoopPass {
    /// Pass identification, replacement for typeid
    static char ID;
  
-  explicit LoopVectorize(bool NoUnrolling = false)
-    : LoopPass(ID), DisableUnrolling(NoUnrolling) {
+  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+    : LoopPass(ID),
+      DisableUnrolling(NoUnrolling),
+      AlwaysVectorize(AlwaysVectorize) {
      initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
    }
  
@@ -896,6 +910,7 @@ struct LoopVectorize : public LoopPass {
    DominatorTree *DT;
    TargetLibraryInfo *TLI;
    bool DisableUnrolling;
+  bool AlwaysVectorize;
  
    virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
      // We only vectorize innermost loops.
@@ -915,7 +930,7 @@ struct LoopVectorize : public LoopPass {
        return false;
  
      if (DL == NULL) {
-      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+      DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n");
        return false;
      }
  
@@ -924,15 +939,25 @@ struct LoopVectorize : public LoopPass {
  
      LoopVectorizeHints Hints(L, DisableUnrolling);
  
+    if (Hints.Force == 0) {
+      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      return false;
+    }
+
+    if (!AlwaysVectorize && Hints.Force != 1) {
+      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      return false;
+    }
+
      if (Hints.Width == 1 && Hints.Unroll == 1) {
-      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
        return false;
      }
  
      // Check if it is legal to vectorize the loop.
      LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
      if (!LVL.canVectorize()) {
-      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
        return false;
      }
  
@@ -945,7 +970,8 @@ struct LoopVectorize : public LoopPass {
      Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
      Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
      unsigned FnIndex = AttributeSet::FunctionIndex;
-    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+    bool OptForSize = Hints.Force != 1 &&
+                      F->getAttributes().hasAttribute(FnIndex, SzAttr);
      bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
  
      if (NoFloat) {
@@ -961,17 +987,15 @@ struct LoopVectorize : public LoopPass {
      unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                          VF.Cost);
  
-    if (VF.Width == 1) {
-      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-    }
-
      DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
-          F->getParent()->getModuleIdentifier()<<"\n");
-    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
+          F->getParent()->getModuleIdentifier() << '\n');
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
  
      if (VF.Width == 1) {
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        if (UF == 1)
          return false;
+      DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
        // We decided not to vectorize, but we may want to unroll.
        InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
        Unroller.vectorize(&LVL);
@@ -1027,25 +1051,19 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
  }
  
  Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Save the current insertion location.
-  Instruction *Loc = Builder.GetInsertPoint();
-
    // We need to place the broadcast of invariant variables outside the loop.
    Instruction *Instr = dyn_cast<Instruction>(V);
    bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
    bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
  
    // Place the code for broadcasting invariant variables in the new preheader.
+  IRBuilder<>::InsertPointGuard Guard(Builder);
    if (Invariant)
      Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  
    // Broadcast the scalar into all locations in the vector.
    Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
  
-  // Restore the builder insertion point.
-  if (Invariant)
-    Builder.SetInsertPoint(Loc);
-
    return Shuf;
  }
  
@@ -1072,10 +1090,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
    return Builder.CreateAdd(Val, Cv, "induction");
  }
  
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(DataLayout *DL,
+                                       const GetElementPtrInst *Gep) {
+  unsigned LastOperand = Gep->getNumOperands() - 1;
+  unsigned GEPAllocSize = DL->getTypeAllocSize(
+      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+  // Walk backwards and try to peel off zeros.
+  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
+    // Find the type we're currently indexing into.
+    gep_type_iterator GEPTI = gep_type_begin(Gep);
+    std::advance(GEPTI, LastOperand - 1);
+
+    // If it's a type with the same allocation size as the result of the GEP we
+    // can peel off the zero index.
+    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+      break;
+    --LastOperand;
+  }
+
+  return LastOperand;
+}
+
  int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
-  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
    // Make sure that the pointer does not point to structs.
-  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+  if (Ptr->getType()->getPointerElementType()->isAggregateType())
      return 0;
  
    // If this value is a pointer induction variable we know it is consecutive.
@@ -1093,8 +1136,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
      return 0;
  
    unsigned NumOperands = Gep->getNumOperands();
-  Value *LastIndex = Gep->getOperand(NumOperands - 1);
-
    Value *GpPtr = Gep->getPointerOperand();
    // If this GEP value is a consecutive pointer induction variable and all of
    // the indices are constant then we know it is consecutive. We can
@@ -1118,14 +1159,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
        return -1;
    }
  
-  // Check that all of the gep indices are uniform except for the last.
-  for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+  unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+
+  // Check that all of the gep indices are uniform except for our induction
+  // operand.
+  for (unsigned i = 0; i != NumOperands; ++i)
+    if (i != InductionOperand &&
+        !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;
  
-  // We can emit wide load/stores only if the last index is the induction
-  // variable.
-  const SCEV *Last = SE->getSCEV(LastIndex);
+  // We can emit wide load/stores only if the last non-zero index is the
+  // induction variable.
+  const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand));
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
      const SCEV *Step = AR->getStepRecurrence(*SE);
  
@@ -1183,6 +1228,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
    Type *DataTy = VectorType::get(ScalarDataTy, VF);
    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
    unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+  // An alignment of 0 means target ABI alignment. We need to use the scalar's
+  // target ABI alignment in such a case.
+  if (!Alignment)
+    Alignment = DL->getABITypeAlignment(ScalarDataTy);
    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
    unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
@@ -1190,7 +1239,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
    if (ScalarAllocatedSize != VectorElementSize)
      return scalarizeInstruction(Instr);
  
-  // If the pointer is loop invariant or if it is non consecutive,
+  // If the pointer is loop invariant or if it is non-consecutive,
    // scalarize the load.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool Reverse = ConsecutiveStride < 0;
@@ -1222,7 +1271,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
      // The last index does not have to be the induction. It can be
      // consecutive and be a function of the index. For example A[I+1];
      unsigned NumOperands = Gep->getNumOperands();
-    unsigned LastOperand = NumOperands - 1;
+    unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
      // Create the new GEP with the new induction variable.
      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
  
@@ -1231,9 +1280,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
        Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
  
        // Update last index or loop invariant instruction anchored in loop.
-      if (i == LastOperand ||
+      if (i == InductionOperand ||
            (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
-        assert((i == LastOperand ||
+        assert((i == InductionOperand ||
                 SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
                 "Must be last index or loop invariant");
  
@@ -1357,7 +1406,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
        Instruction *Cloned = Instr->clone();
        if (!IsVoidRetTy)
          Cloned->setName(Instr->getName() + ".cloned");
-      // Replace the operands of the cloned instrucions with extracted scalars.
+      // Replace the operands of the cloned instructions with extracted scalars.
        for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
          Value *Op = Params[op][Part];
          // Param is a vector. Need to extract the right lane.
@@ -1391,11 +1440,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
    SmallVector<TrackingVH<Value> , 2> Starts;
    SmallVector<TrackingVH<Value> , 2> Ends;
  
+  LLVMContext &Ctx = Loc->getContext();
    SCEVExpander Exp(*SE, "induction");
  
-  // Use this type for pointer arithmetic.
-  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
-
    for (unsigned i = 0; i < NumPointers; ++i) {
      Value *Ptr = PtrRtCheck->Pointers[i];
      const SCEV *Sc = SE->getSCEV(Ptr);
@@ -1406,7 +1453,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
        Starts.push_back(Ptr);
        Ends.push_back(Ptr);
      } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+      unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+      // Use this type for pointer arithmetic.
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
  
        Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
        Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
@@ -1428,10 +1479,20 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
        if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
         continue;
  
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
  
        Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
        Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
@@ -1446,9 +1507,8 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
    // We have to do this trickery because the IRBuilder might fold the check to a
    // constant expression in which case there is no Instruction anchored in a
    // the block.
-  LLVMContext &Ctx = Loc->getContext();
-  Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                  ConstantInt::getTrue(Ctx));
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
    ChkBuilder.Insert(Check, "memcheck.conflict");
    return Check;
  }
@@ -1500,6 +1560,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
    assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
  
+  // The exit count might have the type of i64 while the phi is i32. This can
+  // happen if we have an induction variable that is sign extended before the
+  // compare. The only way that we get a backedge taken count is that the
+  // induction variable was signed and as such will not overflow. In such a case
+  // truncation is legal.
+  if (ExitCount->getType()->getPrimitiveSizeInBits() >
+      IdxTy->getPrimitiveSizeInBits())
+    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
+
+  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
    // Get the total trip count from the count by adding 1.
    ExitCount = SE->getAddExpr(ExitCount,
                               SE->getConstant(ExitCount->getType(), 1));
@@ -1780,6 +1850,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    LoopExitBlock = ExitBlock;
    LoopVectorBody = VecBody;
    LoopScalarBody = OldBasicBlock;
+
+  LoopVectorizeHints Hints(Lp, true);
+  Hints.setAlreadyVectorized(Lp);
  }
  
  /// This function returns the identity element (or neutral element) for
@@ -2017,6 +2090,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
    return Select;
  }
  
+namespace {
+struct CSEDenseMapInfo {
+  static bool canHandle(Instruction *I) {
+    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+  }
+  static inline Instruction *getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline Instruction *getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(Instruction *I) {
+    assert(canHandle(I) && "Unknown instruction!");
+    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+                                                           I->value_op_end()));
+  }
+  static bool isEqual(Instruction *LHS, Instruction *RHS) {
+    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+        LHS == getTombstoneKey() || RHS == getTombstoneKey())
+      return LHS == RHS;
+    return LHS->isIdenticalTo(RHS);
+  }
+};
+}
+
+/// \brief Perform CSE of induction variable instructions.
+static void cse(BasicBlock *BB) {
+  // Perform simple cse.
+  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *In = I++;
+
+    if (!CSEDenseMapInfo::canHandle(In))
+      continue;
+
+    // Check if we can replace this instruction with any of the
+    // visited instructions.
+    if (Instruction *V = CSEMap.lookup(In)) {
+      In->replaceAllUsesWith(V);
+      In->eraseFromParent();
+      continue;
+    }
+
+    CSEMap[In] = In;
+  }
+}
+
  void
  InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
    //===------------------------------------------------===//
@@ -2234,8 +2355,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
      (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
      (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
    }// end of for each redux variable.
- 
+
    fixLCSSAPHIs();
+
+  // Remove redundant induction instructions.
+  cse(LoopVectorBody);
  }
  
  void InnerLoopVectorizer::fixLCSSAPHIs() {
@@ -2329,7 +2453,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
    setDebugLocFromInst(Builder, P);
    // Check for PHI nodes that are lowered to vector selects.
    if (P->getParent() != OrigLoop->getHeader()) {
-    // We know that all PHIs in non header blocks are converted into
+    // We know that all PHIs in non-header blocks are converted into
      // selects, so we don't have to worry about the insertion order and we
      // can just use the builder.
      // At this point we generate the predication tree. There may be
@@ -2680,19 +2804,36 @@ void InnerLoopVectorizer::updateAnalysis() {
    DEBUG(DT->verifyAnalysis());
  }
  
+/// \brief Check whether it is safe to if-convert the phi nodes of this block.
+///
+/// Phi nodes with constant expressions that can trap are not safe to
+/// if-convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (!Phi)
+      return true;
+    for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+      if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+        if (C->canTrap())
+          return false;
+  }
+  return true;
+}
+
  bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
    if (!EnableIfConversion)
      return false;
  
    assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
  
    // A list of pointers that we can safely read and write to.
    SmallPtrSet<Value *, 8> SafePointes;
  
    // Collect safe addresses.
-  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
-    BasicBlock *BB = LoopBlocks[i];
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
  
      if (blockNeedsPredication(BB))
        continue;
@@ -2706,16 +2847,22 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
    }
  
    // Collect the blocks that need predication.
-  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
-    BasicBlock *BB = LoopBlocks[i];
+  BasicBlock *Header = TheLoop->getHeader();
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
  
      // We don't support switch statements inside loops.
      if (!isa<BranchInst>(BB->getTerminator()))
        return false;
  
      // We must be able to predicate all blocks that need to be predicated.
-    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes))
+    if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointes))
+        return false;
+    } else if (BB != Header && !canIfConvertPHINodes(BB))
        return false;
+
    }
  
    // We can if-convert this loop.
@@ -2740,19 +2887,17 @@ bool LoopVectorizationLegality::canVectorize() {
    if (!TheLoop->getExitingBlock())
      return false;
  
-  unsigned NumBlocks = TheLoop->getNumBlocks();
+  // We need to have a loop header.
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << '\n');
  
-  // Check if we can if-convert non single-bb loops.
+  // Check if we can if-convert non-single-bb loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
    if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
      DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
      return false;
    }
  
-  // We need to have a loop header.
-  BasicBlock *Latch = TheLoop->getLoopLatch();
-  DEBUG(dbgs() << "LV: Found a loop: " <<
-        TheLoop->getHeader()->getName() << "\n");
-
    // ScalarEvolution needs to be able to find the exit count.
    const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
    if (ExitCount == SE->getCouldNotCompute()) {
@@ -2761,6 +2906,7 @@ bool LoopVectorizationLegality::canVectorize() {
    }
  
    // Do not loop-vectorize loops with a tiny trip count.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
    unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
    if (TC > 0u && TC < TinyTripCountVectorThreshold) {
      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
@@ -2797,6 +2943,11 @@ static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
    if (Ty->isPointerTy())
      return DL.getIntPtrType(Ty);
  
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by changing the type size.
+  if (Ty->getScalarSizeInBits() < 32)
+    return Type::getInt32Ty(Ty->getContext());
+
    return Ty;
  }
  
@@ -2821,7 +2972,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
        Instruction *U = cast<Instruction>(*I);
        // This user may be a reduction exit value.
        if (!TheLoop->contains(U)) {
-        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+        DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
          return true;
        }
      }
@@ -2957,9 +3108,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
        }
  
        // Check that the instruction return type is vectorizable.
-      if (!VectorType::isValidElementType(it->getType()) &&
-          !it->getType()->isVoidTy()) {
-        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      // Also, we can't vectorize extractelement instructions.
+      if ((!VectorType::isValidElementType(it->getType()) &&
+           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
          return false;
        }
  
@@ -3049,7 +3201,7 @@ public:
    /// non-intersection.
    bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                         unsigned &NumComparisons, ScalarEvolution *SE,
-                       Loop *TheLoop);
+                       Loop *TheLoop, bool ShouldCheckStride = false);
  
    /// \brief Goes over all memory accesses, checks whether a RT check is needed
    /// and builds sets of dependent accesses.
@@ -3063,6 +3215,7 @@ public:
    bool isRTCheckNeeded() { return IsRTCheckNeeded; }
  
    bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  void resetDepChecks() { CheckDeps.clear(); }
  
    MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
  
@@ -3117,10 +3270,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
    return AR->isAffine();
  }
  
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+                        const Loop *Lp);
+
  bool AccessAnalysis::canCheckPtrAtRT(
                         LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                          unsigned &NumComparisons, ScalarEvolution *SE,
-                        Loop *TheLoop) {
+                        Loop *TheLoop, bool ShouldCheckStride) {
    // Find pointers with computable bounds. We are going to use this information
    // to place a runtime bound check.
    unsigned NumReadPtrChecks = 0;
@@ -3148,7 +3306,10 @@ bool AccessAnalysis::canCheckPtrAtRT(
      else
        ++NumReadPtrChecks;
  
-    if (hasComputableBounds(SE, Ptr)) {
+    if (hasComputableBounds(SE, Ptr) &&
+        // When we run after a failing dependency check we have to make sure we
+        // don't have wrapping pointers.
+        (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
        // The id of the dependence set.
        unsigned DepId;
  
@@ -3164,7 +3325,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
  
        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
  
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
      } else {
        CanDoRT = false;
      }
@@ -3172,9 +3333,36 @@ bool AccessAnalysis::canCheckPtrAtRT(
  
    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
      NumComparisons = 0; // Only one dependence set.
-  else
+  else {
      NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
                                             NumWritePtrChecks - 1));
+  }
+
+  // If the pointers that we would use for the bounds comparison have different
+  // address spaces, assume the values aren't directly comparable, so we can't
+  // use them for the runtime check. We also have to assume they could
+  // overlap. In the future there should be metadata for whether address spaces
+  // are disjoint.
+  unsigned NumPointers = RtCheck.Pointers.size();
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i + 1; j < NumPointers; ++j) {
+      // Only need to check pointers between two different dependency sets.
+      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+       continue;
+
+      Value *PtrI = RtCheck.Pointers[i];
+      Value *PtrJ = RtCheck.Pointers[j];
+
+      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+      if (ASi != ASj) {
+        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+                       " different address spaces\n");
+        return false;
+      }
+    }
+  }
+
    return CanDoRT;
  }
  
@@ -3229,7 +3417,7 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
                          !isa<Argument>(UnderlyingObj)) &&
             !isIdentifiedObject(UnderlyingObj))) {
          DEBUG(dbgs() << "LV: Found an unidentified " <<
-              (IsWrite ?  "write" : "read" ) << " ptr:" << *UnderlyingObj <<
+              (IsWrite ?  "write" : "read" ) << " ptr: " << *UnderlyingObj <<
                "\n");
          IsRTCheckNeeded = (IsRTCheckNeeded ||
                             !isIdentifiedObject(UnderlyingObj) ||
@@ -3303,8 +3491,9 @@ public:
    typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
    typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
  
-  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
-    SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+        ShouldRetryWithRuntimeCheck(false) {}
  
    /// \brief Register the location (instructions are given increasing numbers)
    /// of a write access.
@@ -3334,6 +3523,10 @@ public:
    /// the accesses safely with.
    unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
  
+  /// \brief In some cases when the dependency check fails we can still
+  /// vectorize the loop with a dynamic array access check.
+  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
  private:
    ScalarEvolution *SE;
    DataLayout *DL;
@@ -3351,6 +3544,10 @@ private:
    // We can access this many bytes in parallel safely.
    unsigned MaxSafeDepDistBytes;
  
+  /// \brief If we see a non-constant dependence distance we can still try to
+  /// vectorize this loop with runtime checks.
+  bool ShouldRetryWithRuntimeCheck;
+
    /// \brief Check whether there is a plausible dependence between the two
    /// accesses.
    ///
@@ -3383,7 +3580,7 @@ static bool isInBoundsGep(Value *Ptr) {
  static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
                          const Loop *Lp) {
    const Type *Ty = Ptr->getType();
-  assert(Ty->isPointerTy() && "Unexpected non ptr");
+  assert(Ty->isPointerTy() && "Unexpected non-ptr");
  
    // Make sure that the pointer does not point to aggregate types.
    const PointerType *PtrTy = cast<PointerType>(Ty);
@@ -3547,7 +3744,8 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
  
    const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
    if (!C) {
-    DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+    DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
+    ShouldRetryWithRuntimeCheck = true;
      return true;
    }
  
@@ -3573,7 +3771,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
    if (Val == 0) {
      if (ATy == BTy)
        return false;
-    DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
      return true;
    }
  
@@ -3582,7 +3780,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
    // Positive distance bigger than max vectorization factor.
    if (ATy != BTy) {
      DEBUG(dbgs() <<
-          "LV: ReadWrite-Write positive dependency with different types");
+          "LV: ReadWrite-Write positive dependency with different types\n");
      return false;
    }
  
@@ -3599,7 +3797,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
        2*TypeByteSize > MaxSafeDepDistBytes ||
        Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
      DEBUG(dbgs() << "LV: Failure because of Positive distance "
-        << Val.getSExtValue() << "\n");
+        << Val.getSExtValue() << '\n');
      return true;
    }
  
@@ -3612,7 +3810,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
       return true;
  
    DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
-        " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
  
    return false;
  }
@@ -3716,8 +3914,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          Stores.push_back(St);
          DepChecker.addAccess(St);
        }
-    } // next instr.
-  } // next block.
+    } // Next instr.
+  } // Next block.
  
    // Now we have two lists that hold the loads and the stores.
    // Next, we find the pointers that they use.
@@ -3764,7 +3962,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
      return true;
    }
  
-  SmallPtrSet<Value *, 16> ReadOnlyPtr;
    for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
      LoadInst *LD = cast<LoadInst>(*I);
      Value* Ptr = LD->getPointerOperand();
@@ -3812,7 +4009,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
    if (NumComparisons == 0 && NeedRTCheck)
      NeedRTCheck = false;
  
-  // Check that we did not collect too many pointers or found a unsizeable
+  // Check that we did not collect too many pointers or find an unsizeable
    // pointer.
    if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
      PtrRtCheck.reset();
@@ -3838,9 +4035,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
      CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
                                         Accesses.getDependenciesToCheck());
      MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+      NeedRTCheck = true;
+
+      // Clear the dependency checks. We assume they are not needed.
+      Accesses.resetDepChecks();
+
+      PtrRtCheck.reset();
+      PtrRtCheck.Need = true;
+
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+                                         TheLoop, true);
+      // Check that we did not collect too many pointers or find an unsizeable
+      // pointer.
+      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+        PtrRtCheck.reset();
+        return false;
+      }
+
+      CanVecMem = true;
+    }
    }
  
-  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
          " need a runtime memory check.\n");
  
    return CanVecMem;
@@ -3965,7 +4185,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
      // Check  whether we found a reduction operator.
      FoundReduxOp |= !IsAPhi;
  
-    // Process users of current instruction. Push non PHI nodes after PHI nodes
+    // Process users of current instruction. Push non-PHI nodes after PHI nodes
      // onto the stack. This way we are going to have seen all inputs to PHI
      // nodes once we get to them.
      SmallVector<Instruction *, 8> NonPHIs;
@@ -3984,6 +4204,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
          if (ExitInstruction != 0 || Cur == Phi)
            return false;
  
+        // The instruction used by an outside user must be the last instruction
+        // before we feed back to the reduction phi. Otherwise, we lose VF-1
+        // operations on the value.
+        if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+         return false;
+
          ExitInstruction = Cur;
          continue;
        }
@@ -4190,6 +4416,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
      if (it->mayWriteToMemory() || it->mayThrow())
        return false;
  
+    // Check that we don't have a constant expression that can trap as operand.
+    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+         OI != OE; ++OI) {
+      if (Constant *C = dyn_cast<Constant>(*OI))
+        if (C->canTrap())
+          return false;
+    }
+
      // The instructions below can trap.
      switch (it->getOpcode()) {
      default: continue;
@@ -4216,7 +4450,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
  
    // Find the trip count.
    unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
-  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  
    unsigned WidestType = getWidestType();
    unsigned WidestRegister = TTI.getRegisterBitWidth(true);
@@ -4227,7 +4461,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                      WidestRegister : MaxSafeDepDist);
    unsigned MaxVectorSize = WidestRegister / WidestType;
    DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
-  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+  DEBUG(dbgs() << "LV: The Widest register is: "
+          << WidestRegister << " bits.\n");
  
    if (MaxVectorSize == 0) {
      DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -4263,7 +4498,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
  
    if (UserVF != 0) {
      assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
  
      Factor.Width = UserVF;
      return Factor;
@@ -4271,13 +4506,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
  
    float Cost = expectedCost(1);
    unsigned Width = 1;
-  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
    for (unsigned i=2; i <= VF; i*=2) {
      // Notice that the vector loop needs to be executed less times, so
      // we need to divide the cost of the vector loops by the width of
      // the vector elements.
      float VectorCost = expectedCost(i) / (float)i;
-    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
            (int)VectorCost << ".\n");
      if (VectorCost < Cost) {
        Cost = VectorCost;
@@ -4414,7 +4649,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    }
  
    if (HasReductions) {
-    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
      return UF;
    }
  
@@ -4422,14 +4657,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and unroll until the cost of the
    // loop overhead is about 5% of the cost of the loop.
-  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
+  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
    if (LoopCost < SmallLoopCost) {
-    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
+    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
      unsigned NewUF = SmallLoopCost / (LoopCost + 1);
      return std::min(NewUF, UF);
    }
  
-  DEBUG(dbgs() << "LV: Not Unrolling. \n");
+  DEBUG(dbgs() << "LV: Not Unrolling.\n");
    return 1;
  }
  
@@ -4530,16 +4765,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
      MaxUsage = std::max(MaxUsage, OpenIntervals.size());
  
      DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
-          OpenIntervals.size() <<"\n");
+          OpenIntervals.size() << '\n');
  
      // Add the current instruction to the list of open intervals.
      OpenIntervals.insert(I);
    }
  
    unsigned Invariant = LoopInvariants.size();
-  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
-  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
-  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
  
    R.LoopInvariantRegs = Invariant;
    R.MaxLocalUsers = MaxUsage;
@@ -4563,8 +4798,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  
        unsigned C = getInstructionCost(it, VF);
        BlockCost += C;
-      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
-            VF << " For instruction: "<< *it << "\n");
+      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+            VF << " For instruction: " << *it << '\n');
      }
  
      // We assume that if-converted blocks have a 50% chance of being executed.
@@ -4826,13 +5061,16 @@ char LoopVectorize::ID = 0;
  static const char lv_name[] = "Loop Vectorization";
  INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
  INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
  INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
  
  namespace llvm {
-  Pass *createLoopVectorizePass(bool NoUnrolling) {
-    return new LoopVectorize(NoUnrolling);
+  Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
+    return new LoopVectorize(NoUnrolling, AlwaysVectorize);
    }
  }
  
@@ -4901,7 +5139,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
      Instruction *Cloned = Instr->clone();
        if (!IsVoidRetTy)
          Cloned->setName(Instr->getName() + ".cloned");
-      // Replace the operands of the cloned instrucions with extracted scalars.
+      // Replace the operands of the cloned instructions with extracted scalars.
        for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
          Value *Op = Params[op][Part];
          Cloned->setOperand(op, Op);