Vectorize: Remove implicit ilist iterator conversions, NFC

[oota-llvm.git] / lib / Transforms / Vectorize / BBVectorize.cpp
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp

index 1773cff3bb76ae4b7d04520f0accf4e65ee486e0..8844d574a79d320f2c2a8e431b6aa473f0be1f69 100644 (file)
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -15,7 +15,6 @@
  //===----------------------------------------------------------------------===//
  
  #define BBV_NAME "bb-vectorize"
-#define DEBUG_TYPE BBV_NAME
  #include "llvm/Transforms/Vectorize.h"
  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/DenseSet.h"
@@ -26,30 +25,36 @@
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/GlobalsModRef.h"
  #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
  #include "llvm/IR/Type.h"
+#include "llvm/IR/ValueHandle.h"
  #include "llvm/Pass.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
  #include "llvm/Support/raw_ostream.h"
  #include "llvm/Transforms/Utils/Local.h"
  #include <algorithm>
  using namespace llvm;
  
+#define DEBUG_TYPE BBV_NAME
+
  static cl::opt<bool>
  IgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
    cl::Hidden, cl::desc("Ignore target information"));
@@ -121,6 +126,10 @@ static cl::opt<bool>
  NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
    cl::desc("Don't try to vectorize floating-point math intrinsics"));
  
+static cl::opt<bool>
+  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
+
  static cl::opt<bool>
  NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
    cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
@@ -196,13 +205,15 @@ namespace {
        initializeBBVectorizePass(*PassRegistry::getPassRegistry());
      }
  
-    BBVectorize(Pass *P, const VectorizeConfig &C)
+    BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
        : BasicBlockPass(ID), Config(C) {
-      AA = &P->getAnalysis<AliasAnalysis>();
-      DT = &P->getAnalysis<DominatorTree>();
-      SE = &P->getAnalysis<ScalarEvolution>();
-      TD = P->getAnalysisIfAvailable<DataLayout>();
-      TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>();
+      AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
+      DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+      TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+      TTI = IgnoreTargetInfo
+                ? nullptr
+                : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
      }
  
      typedef std::pair<Value *, Value *> ValuePair;
@@ -214,7 +225,7 @@ namespace {
      AliasAnalysis *AA;
      DominatorTree *DT;
      ScalarEvolution *SE;
-    DataLayout *TD;
+    const TargetLibraryInfo *TLI;
      const TargetTransformInfo *TTI;
  
      // FIXME: const correct?
@@ -278,7 +289,7 @@ namespace {
      bool trackUsesOfI(DenseSet<Value *> &Users,
                        AliasSetTracker &WriteSet, Instruction *I,
                        Instruction *J, bool UpdateUsers = true,
-                      DenseSet<ValuePair> *LoadMoveSetPairs = 0);
+                      DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
  
    void computePairsConnectedTo(
               DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
@@ -291,8 +302,8 @@ namespace {
      bool pairsConflict(ValuePair P, ValuePair Q,
               DenseSet<ValuePair> &PairableInstUsers,
               DenseMap<ValuePair, std::vector<ValuePair> >
-               *PairableInstUserMap = 0,
-             DenseSet<VPPair> *PairableInstUserPairSet = 0);
+               *PairableInstUserMap = nullptr,
+             DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
  
      bool pairWillFormCycle(ValuePair P,
               DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
@@ -356,7 +367,7 @@ namespace {
                       Instruction *J, unsigned o, bool IBeforeJ);
  
      void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
                       bool IBeforeJ);
  
      void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
@@ -385,9 +396,9 @@ namespace {
                       Instruction *&InsertionPt,
                       Instruction *I, Instruction *J);
  
-    void combineMetadata(Instruction *K, const Instruction *J);
-
      bool vectorizeBB(BasicBlock &BB) {
+      if (skipOptnoneFunction(BB))
+        return false;
        if (!DT->isReachableFromEntry(&BB)) {
          DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
                " in " << BB.getParent()->getName() << "\n");
@@ -428,25 +439,32 @@ namespace {
        return changed;
      }
  
-    virtual bool runOnBasicBlock(BasicBlock &BB) {
-      AA = &getAnalysis<AliasAnalysis>();
-      DT = &getAnalysis<DominatorTree>();
-      SE = &getAnalysis<ScalarEvolution>();
-      TD = getAnalysisIfAvailable<DataLayout>();
-      TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      // OptimizeNone check deferred to vectorizeBB().
+
+      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+      TTI = IgnoreTargetInfo
+                ? nullptr
+                : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                      *BB.getParent());
  
        return vectorizeBB(BB);
      }
  
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
        BasicBlockPass::getAnalysisUsage(AU);
-      AU.addRequired<AliasAnalysis>();
-      AU.addRequired<DominatorTree>();
-      AU.addRequired<ScalarEvolution>();
-      AU.addRequired<TargetTransformInfo>();
-      AU.addPreserved<AliasAnalysis>();
-      AU.addPreserved<DominatorTree>();
-      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<SCEVAAWrapperPass>();
        AU.setPreservesCFG();
      }
  
@@ -528,12 +546,16 @@ namespace {
  
      // Returns the cost of the provided instruction using TTI.
      // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
+                          TargetTransformInfo::OperandValueKind Op1VK = 
+                              TargetTransformInfo::OK_AnyValue,
+                          TargetTransformInfo::OperandValueKind Op2VK =
+                              TargetTransformInfo::OK_AnyValue) {
        switch (Opcode) {
        default: break;
        case Instruction::GetElementPtr:
          // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the intruction addressing mode. At the moment we don't
+        // lowered to the instruction addressing mode. At the moment we don't
          // generate vector GEPs.
          return 0;
        case Instruction::Br:
@@ -558,7 +580,7 @@ namespace {
        case Instruction::And:
        case Instruction::Or:
        case Instruction::Xor:
-        return TTI->getArithmeticInstrCost(Opcode, T1);
+        return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
        case Instruction::Select:
        case Instruction::ICmp:
        case Instruction::FCmp:
@@ -624,19 +646,19 @@ namespace {
              dyn_cast<SCEVConstant>(OffsetSCEV)) {
          ConstantInt *IntOff = ConstOffSCEV->getValue();
          int64_t Offset = IntOff->getSExtValue();
+        const DataLayout &DL = I->getModule()->getDataLayout();
+        Type *VTy = IPtr->getType()->getPointerElementType();
+        int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
  
-        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
-        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
-
-        Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+        Type *VTy2 = JPtr->getType()->getPointerElementType();
          if (VTy != VTy2 && Offset < 0) {
-          int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+          int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
            OffsetInElmts = Offset/VTy2TSS;
-          return (abs64(Offset) % VTy2TSS) == 0;
+          return (std::abs(Offset) % VTy2TSS) == 0;
          }
  
          OffsetInElmts = Offset/VTyTSS;
-        return (abs64(Offset) % VTyTSS) == 0;
+        return (std::abs(Offset) % VTyTSS) == 0;
        }
  
        return false;
@@ -648,7 +670,7 @@ namespace {
        Function *F = I->getCalledFunction();
        if (!F) return false;
  
-      Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
+      Intrinsic::ID IID = F->getIntrinsicID();
        if (!IID) return false;
  
        switch(IID) {
@@ -664,7 +686,22 @@ namespace {
        case Intrinsic::exp:
        case Intrinsic::exp2:
        case Intrinsic::pow:
+      case Intrinsic::round:
+      case Intrinsic::copysign:
+      case Intrinsic::ceil:
+      case Intrinsic::nearbyint:
+      case Intrinsic::rint:
+      case Intrinsic::trunc:
+      case Intrinsic::floor:
+      case Intrinsic::fabs:
+      case Intrinsic::minnum:
+      case Intrinsic::maxnum:
          return Config.VectorizeMath;
+      case Intrinsic::bswap:
+      case Intrinsic::ctpop:
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+        return Config.VectorizeBitManipulations;
        case Intrinsic::fma:
        case Intrinsic::fmuladd:
          return Config.VectorizeFMA;
@@ -813,7 +850,7 @@ namespace {
  
      // It is important to cleanup here so that future iterations of this
      // function have less work to do.
-    (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
+    (void)SimplifyInstructionsInBlock(&BB, TLI);
      return true;
    }
  
@@ -867,10 +904,6 @@ namespace {
        return false;
      }
  
-    // We can't vectorize memory operations without target data
-    if (TD == 0 && IsSimpleLoadStore)
-      return false;
-
      Type *T1, *T2;
      getInstructionTypes(I, T1, T2);
  
@@ -905,9 +938,8 @@ namespace {
      if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
        return false;
  
-    if ((!Config.VectorizePointers || TD == 0) &&
-        (T1->getScalarType()->isPointerTy() ||
-         T2->getScalarType()->isPointerTy()))
+    if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
+                                      T2->getScalarType()->isPointerTy()))
        return false;
  
      if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
@@ -952,8 +984,8 @@ namespace {
        unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
        int64_t OffsetInElmts = 0;
        if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
-            IAddressSpace, JAddressSpace,
-            OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+                         IAddressSpace, JAddressSpace, OffsetInElmts) &&
+          std::abs(OffsetInElmts) == 1) {
          FixedOrder = (int) OffsetInElmts;
          unsigned BottomAlignment = IAlignment;
          if (OffsetInElmts < 0) BottomAlignment = JAlignment;
@@ -968,8 +1000,8 @@ namespace {
            // An aligned load or store is possible only if the instruction
            // with the lower offset has an alignment suitable for the
            // vector type.
-
-          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+          const DataLayout &DL = I->getModule()->getDataLayout();
+          unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
            if (BottomAlignment < VecAlignment)
              return false;
          }
@@ -1009,13 +1041,49 @@ namespace {
        unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
        Type *VT1 = getVecTypeForPair(IT1, JT1),
             *VT2 = getVecTypeForPair(IT2, JT2);
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_AnyValue;
+
+      // On some targets (for example, X86) the cost of a vector shift may vary
+      // depending on whether the second operand is a Uniform or
+      // NonUniform Constant.
+      switch (I->getOpcode()) {
+      default : break;
+      case Instruction::Shl:
+      case Instruction::LShr:
+      case Instruction::AShr:
+
+        // If both I and J are scalar shifts by constant, then the
+        // merged vector shift count would be either a constant splat value
+        // or a non-uniform vector of constants.
+        if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
+          if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
+            Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
+                               TargetTransformInfo::OK_NonUniformConstantValue;
+        } else {
+          // Check for a splat of a constant or for a non-uniform vector
+          // of constants.
+          Value *IOp = I->getOperand(1);
+          Value *JOp = J->getOperand(1);
+          if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
+              (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
+            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+            Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
+            if (SplatValue != nullptr &&
+                SplatValue == cast<Constant>(JOp)->getSplatValue())
+              Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+          }
+        }
+      }
  
        // Note that this procedure is incorrect for insert and extract element
        // instructions (because combining these often results in a shuffle),
        // but this cost is ignored (because insert and extract element
        // instructions are assigned a zero depth factor and are not really
        // fused in general).
-      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
  
        if (VCost > ICost + JCost)
          return false;
@@ -1033,13 +1101,14 @@ namespace {
        CostSavings = ICost + JCost - VCost;
      }
  
-    // The powi intrinsic is special because only the first argument is
-    // vectorized, the second arguments must be equal.
+    // The powi, ctlz, and cttz intrinsics are special because only the first
+    // argument is vectorized; the second arguments must be equal.
      CallInst *CI = dyn_cast<CallInst>(I);
      Function *FI;
      if (CI && (FI = CI->getCalledFunction())) {
-      Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
-      if (IID == Intrinsic::powi) {
+      Intrinsic::ID IID = FI->getIntrinsicID();
+      if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+          IID == Intrinsic::cttz) {
          Value *A1I = CI->getArgOperand(1),
                *A1J = cast<CallInst>(J)->getArgOperand(1);
          const SCEV *A1ISCEV = SE->getSCEV(A1I),
@@ -1063,7 +1132,8 @@ namespace {
          assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
                 "Intrinsic argument counts differ");
          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          if (IID == Intrinsic::powi && i == 1)
+          if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+               IID == Intrinsic::cttz) && i == 1)
              Tys.push_back(CI->getArgOperand(i)->getType());
            else
              Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
@@ -1177,18 +1247,23 @@ namespace {
        if (I == Start) IAfterStart = true;
  
        bool IsSimpleLoadStore;
-      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+      if (!isInstVectorizable(&*I, IsSimpleLoadStore))
+        continue;
  
        // Look for an instruction with which to pair instruction *I...
        DenseSet<Value *> Users;
        AliasSetTracker WriteSet(*AA);
+      if (I->mayWriteToMemory())
+        WriteSet.add(&*I);
+
        bool JAfterStart = IAfterStart;
-      BasicBlock::iterator J = llvm::next(I);
+      BasicBlock::iterator J = std::next(I);
        for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
-        if (J == Start) JAfterStart = true;
+        if (&*J == Start)
+          JAfterStart = true;
  
          // Determine if J uses I, if so, exit the loop.
-        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
+        bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
          if (Config.FastDep) {
            // Note: For this heuristic to be effective, independent operations
            // must tend to be intermixed. This is likely to be true from some
@@ -1205,30 +1280,31 @@ namespace {
          // J does not use I, and comes before the first use of I, so it can be
          // merged with I if the instructions are compatible.
          int CostSavings, FixedOrder;
-        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
-            CostSavings, FixedOrder)) continue;
+        if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
+                                CostSavings, FixedOrder))
+          continue;
  
          // J is a candidate for merging with I.
-        if (!PairableInsts.size() ||
-             PairableInsts[PairableInsts.size()-1] != I) {
-          PairableInsts.push_back(I);
+        if (PairableInsts.empty() ||
+            PairableInsts[PairableInsts.size() - 1] != &*I) {
+          PairableInsts.push_back(&*I);
          }
  
-        CandidatePairs[I].push_back(J);
+        CandidatePairs[&*I].push_back(&*J);
          ++TotalPairs;
          if (TTI)
-          CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
-                                                            CostSavings));
+          CandidatePairCostSavings.insert(
+              ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
  
          if (FixedOrder == 1)
-          FixedOrderPairs.insert(ValuePair(I, J));
+          FixedOrderPairs.insert(ValuePair(&*I, &*J));
          else if (FixedOrder == -1)
-          FixedOrderPairs.insert(ValuePair(J, I));
+          FixedOrderPairs.insert(ValuePair(&*J, &*I));
  
          // The next call to this function must start after the last instruction
          // selected during this invocation.
          if (JAfterStart) {
-          Start = llvm::next(J);
+          Start = std::next(J);
            IAfterStart = JAfterStart = false;
          }
  
@@ -1270,13 +1346,15 @@ namespace {
  
      // For each possible pairing for this variable, look at the uses of
      // the first value...
-    for (Value::use_iterator I = P.first->use_begin(),
-         E = P.first->use_end(); I != E; ++I) {
-      if (isa<LoadInst>(*I)) {
+    for (Value::user_iterator I = P.first->user_begin(),
+                              E = P.first->user_end();
+         I != E; ++I) {
+      User *UI = *I;
+      if (isa<LoadInst>(UI)) {
          // A pair cannot be connected to a load because the load only takes one
          // operand (the address) and it is a scalar even after vectorization.
          continue;
-      } else if ((SI = dyn_cast<StoreInst>(*I)) &&
+      } else if ((SI = dyn_cast<StoreInst>(UI)) &&
                   P.first == SI->getPointerOperand()) {
          // Similarly, a pair cannot be connected to a store through its
          // pointer operand.
@@ -1285,22 +1363,21 @@ namespace {
  
        // For each use of the first variable, look for uses of the second
        // variable...
-      for (Value::use_iterator J = P.second->use_begin(),
-           E2 = P.second->use_end(); J != E2; ++J) {
-        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+      for (User *UJ : P.second->users()) {
+        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
              P.second == SJ->getPointerOperand())
            continue;
  
          // Look for <I, J>:
-        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-          VPPair VP(P, ValuePair(*I, *J));
+        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+          VPPair VP(P, ValuePair(UI, UJ));
            ConnectedPairs[VP.first].push_back(VP.second);
            PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
          }
  
          // Look for <J, I>:
-        if (CandidatePairsSet.count(ValuePair(*J, *I))) {
-          VPPair VP(P, ValuePair(*J, *I));
+        if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
+          VPPair VP(P, ValuePair(UJ, UI));
            ConnectedPairs[VP.first].push_back(VP.second);
            PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
          }
@@ -1309,13 +1386,14 @@ namespace {
        if (Config.SplatBreaksChain) continue;
        // Look for cases where just the first value in the pair is used by
        // both members of another pair (splatting).
-      for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
-        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+      for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
+        User *UJ = *J;
+        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
              P.first == SJ->getPointerOperand())
            continue;
  
-        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-          VPPair VP(P, ValuePair(*I, *J));
+        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+          VPPair VP(P, ValuePair(UI, UJ));
            ConnectedPairs[VP.first].push_back(VP.second);
            PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
          }
@@ -1325,21 +1403,24 @@ namespace {
      if (Config.SplatBreaksChain) return;
      // Look for cases where just the second value in the pair is used by
      // both members of another pair (splatting).
-    for (Value::use_iterator I = P.second->use_begin(),
-         E = P.second->use_end(); I != E; ++I) {
-      if (isa<LoadInst>(*I))
+    for (Value::user_iterator I = P.second->user_begin(),
+                              E = P.second->user_end();
+         I != E; ++I) {
+      User *UI = *I;
+      if (isa<LoadInst>(UI))
          continue;
-      else if ((SI = dyn_cast<StoreInst>(*I)) &&
+      else if ((SI = dyn_cast<StoreInst>(UI)) &&
                 P.second == SI->getPointerOperand())
          continue;
  
-      for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
-        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+      for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
+        User *UJ = *J;
+        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
              P.second == SJ->getPointerOperand())
            continue;
  
-        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
-          VPPair VP(P, ValuePair(*I, *J));
+        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
+          VPPair VP(P, ValuePair(UI, UJ));
            ConnectedPairs[VP.first].push_back(VP.second);
            PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
          }
@@ -1399,12 +1480,16 @@ namespace {
      BasicBlock::iterator E = BB.end(), EL =
        BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
      for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
-      if (IsInPair.find(I) == IsInPair.end()) continue;
+      if (IsInPair.find(&*I) == IsInPair.end())
+        continue;
  
        DenseSet<Value *> Users;
        AliasSetTracker WriteSet(*AA);
-      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
-        (void) trackUsesOfI(Users, WriteSet, I, J);
+      if (I->mayWriteToMemory())
+        WriteSet.add(&*I);
+
+      for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
+        (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
  
          if (J == EL)
            break;
@@ -1413,7 +1498,7 @@ namespace {
        for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
             U != E; ++U) {
          if (IsInPair.find(*U) == IsInPair.end()) continue;
-        PairableInstUsers.insert(ValuePair(I, *U));
+        PairableInstUsers.insert(ValuePair(&*I, *U));
        }
  
        if (I == EL)
@@ -1602,7 +1687,7 @@ namespace {
          DenseSet<ValuePair> CurrentPairs;
  
          bool CanAdd = true;
-        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
                = BestChildren.begin(), E2 = BestChildren.end();
               C2 != E2; ++C2) {
            if (C2->first.first == C->first.first ||
@@ -1610,8 +1695,9 @@ namespace {
                C2->first.second == C->first.first ||
                C2->first.second == C->first.second ||
                pairsConflict(C2->first, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : 0,
-                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                            UseCycleCheck ? &PairableInstUserMap : nullptr,
+                            UseCycleCheck ? &PairableInstUserPairSet
+                                          : nullptr)) {
              if (C2->second >= C->second) {
                CanAdd = false;
                break;
@@ -1631,8 +1717,9 @@ namespace {
                T->second == C->first.first ||
                T->second == C->first.second ||
                pairsConflict(*T, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : 0,
-                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                            UseCycleCheck ? &PairableInstUserMap : nullptr,
+                            UseCycleCheck ? &PairableInstUserPairSet
+                                          : nullptr)) {
              CanAdd = false;
              break;
            }
@@ -1642,15 +1729,16 @@ namespace {
          if (!CanAdd) continue;
  
          // And check the queue too...
-        for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
               E2 = Q.end(); C2 != E2; ++C2) {
            if (C2->first.first == C->first.first ||
                C2->first.first == C->first.second ||
                C2->first.second == C->first.first ||
                C2->first.second == C->first.second ||
                pairsConflict(C2->first, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : 0,
-                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                            UseCycleCheck ? &PairableInstUserMap : nullptr,
+                            UseCycleCheck ? &PairableInstUserPairSet
+                                          : nullptr)) {
              CanAdd = false;
              break;
            }
@@ -1665,8 +1753,9 @@ namespace {
                ChosenPairs.begin(), E2 = ChosenPairs.end();
               C2 != E2; ++C2) {
            if (pairsConflict(*C2, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : 0,
-                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                            UseCycleCheck ? &PairableInstUserMap : nullptr,
+                            UseCycleCheck ? &PairableInstUserPairSet
+                                          : nullptr)) {
              CanAdd = false;
              break;
            }
@@ -1691,7 +1780,7 @@ namespace {
          // to an already-selected child. Check for this here, and if a
          // conflict is found, then remove the previously-selected child
          // before adding this one in its place.
-        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
                = BestChildren.begin(); C2 != BestChildren.end();) {
            if (C2->first.first == C->first.first ||
                C2->first.first == C->first.second ||
@@ -1706,7 +1795,7 @@ namespace {
          BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
        }
  
-      for (SmallVector<ValuePairWithDepth, 8>::iterator C
+      for (SmallVectorImpl<ValuePairWithDepth>::iterator C
              = BestChildren.begin(), E2 = BestChildren.end();
             C != E2; ++C) {
          size_t DepthF = getDepthFactor(C->first.first);
@@ -1747,8 +1836,8 @@ namespace {
        for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
             E = ChosenPairs.end(); C != E; ++C) {
          if (pairsConflict(*C, IJ, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : 0,
-                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+                          UseCycleCheck ? &PairableInstUserMap : nullptr,
+                          UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
            DoesConflict = true;
            break;
          }
@@ -1771,7 +1860,7 @@ namespace {
        size_t MaxDepth = DAG.lookup(IJ);
  
        DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
-                   << IJ.first << " <-> " << IJ.second << "} of depth " <<
+                   << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
                     MaxDepth << " and size " << DAG.size() << "\n");
  
        // At this point the DAG has been constructed, but, may contain
@@ -1897,16 +1986,15 @@ namespace {
              Type *VTy = getVecTypeForPair(Ty1, Ty2);
  
              bool NeedsExtraction = false;
-            for (Value::use_iterator I = S->first->use_begin(),
-                 IE = S->first->use_end(); I != IE; ++I) {
-              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+            for (User *U : S->first->users()) {
+              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
                  // Shuffle can be folded if it has no other input
                  if (isa<UndefValue>(SI->getOperand(1)))
                    continue;
                }
-              if (isa<ExtractElementInst>(*I))
+              if (isa<ExtractElementInst>(U))
                  continue;
-              if (PrunedDAGInstrs.count(*I))
+              if (PrunedDAGInstrs.count(U))
                  continue;
                NeedsExtraction = true;
                break;
@@ -1929,16 +2017,15 @@ namespace {
              }
  
              NeedsExtraction = false;
-            for (Value::use_iterator I = S->second->use_begin(),
-                 IE = S->second->use_end(); I != IE; ++I) {
-              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+            for (User *U : S->second->users()) {
+              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
                  // Shuffle can be folded if it has no other input
                  if (isa<UndefValue>(SI->getOperand(1)))
                    continue;
                }
-              if (isa<ExtractElementInst>(*I))
+              if (isa<ExtractElementInst>(U))
                  continue;
-              if (PrunedDAGInstrs.count(*I))
+              if (PrunedDAGInstrs.count(U))
                  continue;
                NeedsExtraction = true;
                break;
@@ -2086,7 +2173,7 @@ namespace {
  
        DEBUG(if (DebugPairSelection)
               dbgs() << "BBV: found pruned DAG for pair {"
-             << IJ.first << " <-> " << IJ.second << "} of depth " <<
+             << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
               MaxDepth << " and size " << PrunedDAG.size() <<
              " (effective size: " << EffSize << ")\n");
        if (((TTI && !UseChainDepthWithTI) ||
@@ -2164,10 +2251,7 @@ namespace {
                 *S->second << "\n");
  
          // Remove all candidate pairs that have values in the chosen dag.
-        std::vector<Value *> &KK = CandidatePairs[S->first],
-                             &LL = CandidatePairs2[S->second],
-                             &MM = CandidatePairs[S->second],
-                             &NN = CandidatePairs2[S->first];
+        std::vector<Value *> &KK = CandidatePairs[S->first];
          for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
               K != KE; ++K) {
            if (*K == S->second)
@@ -2175,6 +2259,8 @@ namespace {
  
            CandidatePairsSet.erase(ValuePair(S->first, *K));
          }
+
+        std::vector<Value *> &LL = CandidatePairs2[S->second];
          for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
               L != LE; ++L) {
            if (*L == S->first)
@@ -2182,11 +2268,15 @@ namespace {
  
            CandidatePairsSet.erase(ValuePair(*L, S->second));
          }
+
+        std::vector<Value *> &MM = CandidatePairs[S->second];
          for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
               M != ME; ++M) {
            assert(*M != S->first && "Flipped pair in candidate list?");
            CandidatePairsSet.erase(ValuePair(S->second, *M));
          }
+
+        std::vector<Value *> &NN = CandidatePairs2[S->first];
          for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
               N != NE; ++N) {
            assert(*N != S->second && "Flipped pair in candidate list?");
@@ -2224,11 +2314,12 @@ namespace {
      // The pointer value is taken to be the one with the lowest offset.
      Value *VPtr = IPtr;
  
-    Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
-    Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
+    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
      Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-    Type *VArgPtrType = PointerType::get(VArgType,
-      cast<PointerType>(IPtr->getType())->getAddressSpace());
+    Type *VArgPtrType
+      = PointerType::get(VArgType,
+                         IPtr->getType()->getPointerAddressSpace());
      return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
                          /* insert before */ I);
    }
@@ -2237,7 +2328,7 @@ namespace {
                       unsigned MaskOffset, unsigned NumInElem,
                       unsigned NumInElem1, unsigned IdxOffset,
                       std::vector<Constant*> &Mask) {
-    unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+    unsigned NumElem1 = J->getType()->getVectorNumElements();
      for (unsigned v = 0; v < NumElem1; ++v) {
        int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
        if (m < 0) {
@@ -2264,18 +2355,18 @@ namespace {
      Type *ArgTypeJ = J->getType();
      Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
  
-    unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
+    unsigned NumElemI = ArgTypeI->getVectorNumElements();
  
      // Get the total number of elements in the fused vector type.
      // By definition, this must equal the number of elements in
      // the final mask.
-    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+    unsigned NumElem = VArgType->getVectorNumElements();
      std::vector<Constant*> Mask(NumElem);
  
      Type *OpTypeI = I->getOperand(0)->getType();
-    unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+    unsigned NumInElemI = OpTypeI->getVectorNumElements();
      Type *OpTypeJ = J->getOperand(0)->getType();
-    unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
  
      // The fused vector will be:
      // -----------------------------------------------------
@@ -2316,7 +2407,7 @@ namespace {
          } while ((LIENext =
                     dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
  
-        LIENext = 0;
+        LIENext = nullptr;
          Value *LIEPrev = UndefValue::get(ArgTypeH);
          for (unsigned i = 0; i < numElemL; ++i) {
            if (isa<UndefValue>(VectElemts[i])) continue;
@@ -2337,6 +2428,12 @@ namespace {
      return ExpandedIEChain;
    }
  
+  static unsigned getNumScalarElements(Type *Ty) {
+    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+      return VecTy->getNumElements();
+    return 1;
+  }
+
    // Returns the value to be used as the specified operand of the vector
    // instruction that fuses I with J.
    Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@@ -2352,17 +2449,8 @@ namespace {
      Instruction *L = I, *H = J;
      Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
  
-    unsigned numElemL;
-    if (ArgTypeL->isVectorTy())
-      numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
-    else
-      numElemL = 1;
-
-    unsigned numElemH;
-    if (ArgTypeH->isVectorTy())
-      numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
-    else
-      numElemH = 1;
+    unsigned numElemL = getNumScalarElements(ArgTypeL);
+    unsigned numElemH = getNumScalarElements(ArgTypeH);
  
      Value *LOp = L->getOperand(o);
      Value *HOp = H->getOperand(o);
@@ -2387,14 +2475,14 @@ namespace {
      if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
        // We can have at most two unique vector inputs.
        bool CanUseInputs = true;
-      Value *I1, *I2 = 0;
+      Value *I1, *I2 = nullptr;
        if (LEE) {
          I1 = LEE->getOperand(0);
        } else {
          I1 = LSV->getOperand(0);
          I2 = LSV->getOperand(1);
          if (I2 == I1 || isa<UndefValue>(I2))
-          I2 = 0;
+          I2 = nullptr;
        }
    
        if (HEE) {
@@ -2423,11 +2511,12 @@ namespace {
  
        if (CanUseInputs) {
          unsigned LOpElem =
-          cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(LOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
+
          unsigned HOpElem =
-          cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(HOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
  
          // We have one or two input vectors. We need to map each index of the
          // operands to the index of the original vector.
@@ -2530,7 +2619,6 @@ namespace {
                                                       true, o, 1));
            NewI1->insertBefore(IBeforeJ ? J : I);
            I1 = NewI1;
-          I1T = I2T;
            I1Elem = I2Elem;
          } else if (I1Elem > I2Elem) {
            std::vector<Constant *> Mask(I1Elem);
@@ -2547,8 +2635,6 @@ namespace {
                                                       true, o, 1));
            NewI2->insertBefore(IBeforeJ ? J : I);
            I2 = NewI2;
-          I2T = I1T;
-          I2Elem = I1Elem;
          }
  
          // Now that both I1 and I2 are the same length we can shuffle them
@@ -2643,14 +2729,14 @@ namespace {
                                             getReplacementName(IBeforeJ ? I : J,
                                                                true, o, 1));
          }
-  
+
          NHOp->insertBefore(IBeforeJ ? J : I);
          HOp = NHOp;
        }
      }
  
      if (ArgType->isVectorTy()) {
-      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      unsigned numElem = VArgType->getVectorNumElements();
        std::vector<Constant*> Mask(numElem);
        for (unsigned v = 0; v < numElem; ++v) {
          unsigned Idx = v;
@@ -2684,7 +2770,7 @@ namespace {
    // to the vector instruction that fuses I with J.
    void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
                       Instruction *I, Instruction *J,
-                     SmallVector<Value *, 3> &ReplacedOperands,
+                     SmallVectorImpl<Value *> &ReplacedOperands,
                       bool IBeforeJ) {
      unsigned NumOperands = I->getNumOperands();
  
@@ -2698,7 +2784,7 @@ namespace {
          continue;
        } else if (isa<CallInst>(I)) {
          Function *F = cast<CallInst>(I)->getCalledFunction();
-        Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
+        Intrinsic::ID IID = F->getIntrinsicID();
          if (o == NumOperands-1) {
            BasicBlock &BB = *I->getParent();
  
@@ -2709,10 +2795,11 @@ namespace {
  
            ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
            continue;
-        } else if (IID == Intrinsic::powi && o == 1) {
-          // The second argument of powi is a single integer and we've already
-          // checked that both arguments are equal. As a result, we just keep
-          // I's second argument.
+        } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
+                    IID == Intrinsic::cttz) && o == 1) {
+          // The second argument of powi/ctlz/cttz is a single integer/constant
+          // and we've already checked that both arguments are equal.
+          // As a result, we just keep I's second argument.
            ReplacedOperands[o] = I->getOperand(o);
            continue;
          }
@@ -2733,63 +2820,51 @@ namespace {
                       Instruction *J, Instruction *K,
                       Instruction *&InsertionPt,
                       Instruction *&K1, Instruction *&K2) {
-    if (isa<StoreInst>(I)) {
-      AA->replaceWithNewValue(I, K);
-      AA->replaceWithNewValue(J, K);
-    } else {
-      Type *IType = I->getType();
-      Type *JType = J->getType();
+    if (isa<StoreInst>(I))
+      return;
  
-      VectorType *VType = getVecTypeForPair(IType, JType);
-      unsigned numElem = VType->getNumElements();
+    Type *IType = I->getType();
+    Type *JType = J->getType();
  
-      unsigned numElemI, numElemJ;
-      if (IType->isVectorTy())
-        numElemI = cast<VectorType>(IType)->getNumElements();
-      else
-        numElemI = 1;
-
-      if (JType->isVectorTy())
-        numElemJ = cast<VectorType>(JType)->getNumElements();
-      else
-        numElemJ = 1;
+    VectorType *VType = getVecTypeForPair(IType, JType);
+    unsigned numElem = VType->getNumElements();
  
-      if (IType->isVectorTy()) {
-        std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
-        for (unsigned v = 0; v < numElemI; ++v) {
-          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
-        }
+    unsigned numElemI = getNumScalarElements(IType);
+    unsigned numElemJ = getNumScalarElements(JType);
  
-        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                   ConstantVector::get( Mask1),
-                                   getReplacementName(K, false, 1));
-      } else {
-        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
-        K1 = ExtractElementInst::Create(K, CV0,
-                                          getReplacementName(K, false, 1));
+    if (IType->isVectorTy()) {
+      std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
+      for (unsigned v = 0; v < numElemI; ++v) {
+        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
        }
  
-      if (JType->isVectorTy()) {
-        std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
-        for (unsigned v = 0; v < numElemJ; ++v) {
-          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
-        }
+      K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                 ConstantVector::get(Mask1),
+                                 getReplacementName(K, false, 1));
+    } else {
+      Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+      K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
+    }
  
-        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                   ConstantVector::get( Mask2),
-                                   getReplacementName(K, false, 2));
-      } else {
-        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
-        K2 = ExtractElementInst::Create(K, CV1,
-                                          getReplacementName(K, false, 2));
+    if (JType->isVectorTy()) {
+      std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
+      for (unsigned v = 0; v < numElemJ; ++v) {
+        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
        }
  
-      K1->insertAfter(K);
-      K2->insertAfter(K1);
-      InsertionPt = K2;
+      K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                 ConstantVector::get(Mask2),
+                                 getReplacementName(K, false, 2));
+    } else {
+      Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
+      K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
      }
+
+    K1->insertAfter(K);
+    K2->insertAfter(K1);
+    InsertionPt = K2;
    }
  
    // Move all uses of the function I (including pairing-induced uses) after J.
@@ -2797,12 +2872,14 @@ namespace {
                       DenseSet<ValuePair> &LoadMoveSetPairs,
                       Instruction *I, Instruction *J) {
      // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
  
      DenseSet<Value *> Users;
      AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
      for (; cast<Instruction>(L) != J; ++L)
-      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
+      (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
  
      assert(cast<Instruction>(L) == J &&
        "Tracking has not proceeded far enough to check for dependencies");
@@ -2817,14 +2894,16 @@ namespace {
                       Instruction *&InsertionPt,
                       Instruction *I, Instruction *J) {
      // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
  
      DenseSet<Value *> Users;
      AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
      for (; cast<Instruction>(L) != J;) {
-      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
+      if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
          // Move this instruction
-        Instruction *InstToMove = L; ++L;
+        Instruction *InstToMove = &*L++;
  
          DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
                          " to after " << *InsertionPt << "\n");
@@ -2846,19 +2925,20 @@ namespace {
                       DenseSet<ValuePair> &LoadMoveSetPairs,
                       Instruction *I) {
      // Skip to the first instruction past I.
-    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
  
      DenseSet<Value *> Users;
      AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
  
      // Note: We cannot end the loop when we reach J because J could be moved
      // farther down the use chain by another instruction pairing. Also, J
      // could be before I if this is an inverted input.
-    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
-      if (trackUsesOfI(Users, WriteSet, I, L)) {
+    for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
+      if (trackUsesOfI(Users, WriteSet, I, &*L)) {
          if (L->mayReadFromMemory()) {
-          LoadMoveSet[L].push_back(I);
-          LoadMoveSetPairs.insert(ValuePair(L, I));
+          LoadMoveSet[&*L].push_back(I);
+          LoadMoveSetPairs.insert(ValuePair(&*L, I));
          }
        }
      }
@@ -2887,31 +2967,6 @@ namespace {
      }
    }
  
-  // When the first instruction in each pair is cloned, it will inherit its
-  // parent's metadata. This metadata must be combined with that of the other
-  // instruction in a safe way.
-  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
-    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
-    K->getAllMetadataOtherThanDebugLoc(Metadata);
-    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
-      unsigned Kind = Metadata[i].first;
-      MDNode *JMD = J->getMetadata(Kind);
-      MDNode *KMD = Metadata[i].second;
-
-      switch (Kind) {
-      default:
-        K->setMetadata(Kind, 0); // Remove unknown metadata
-        break;
-      case LLVMContext::MD_tbaa:
-        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
-        break;
-      case LLVMContext::MD_fpmath:
-        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
-        break;
-      }
-    }
-  }
-
    // This function fuses the chosen instruction pairs into vector instructions,
    // taking care preserve any needed scalar outputs and, then, it reorders the
    // remaining instructions as needed (users of the first member of the pair
@@ -2946,7 +3001,7 @@ namespace {
      DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
  
      for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
-      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
        if (P == ChosenPairs.end()) {
          ++PI;
          continue;
@@ -3058,10 +3113,23 @@ namespace {
        else if (H->hasName())
          K->takeName(H);
  
-      if (!isa<StoreInst>(K))
+      if (auto CS = CallSite(K)) {
+        SmallVector<Type *, 3> Tys;
+        FunctionType *Old = CS.getFunctionType();
+        unsigned NumOld = Old->getNumParams();
+        assert(NumOld <= ReplacedOperands.size());
+        for (unsigned i = 0; i != NumOld; ++i)
+          Tys.push_back(ReplacedOperands[i]->getType());
+        CS.mutateFunctionType(
+            FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
+                              Tys, Old->isVarArg()));
+      } else if (!isa<StoreInst>(K))
          K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
  
-      combineMetadata(K, H);
+      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
+                             LLVMContext::MD_invariant_group};
+      combineMetadata(K, H, KnownIDs);
        K->intersectOptionalDataWith(H);
  
        for (unsigned o = 0; o < NumOperands; ++o)
@@ -3071,7 +3139,7 @@ namespace {
  
        // Instruction insertion point:
        Instruction *InsertionPt = K;
-      Instruction *K1 = 0, *K2 = 0;
+      Instruction *K1 = nullptr, *K2 = nullptr;
        replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
  
        // The use dag of the first original instruction must be moved to after
@@ -3084,8 +3152,6 @@ namespace {
        if (!isa<StoreInst>(I)) {
          L->replaceAllUsesWith(K1);
          H->replaceAllUsesWith(K2);
-        AA->replaceWithNewValue(L, K1);
-        AA->replaceWithNewValue(H, K2);
        }
  
        // Instructions that may read from memory may be in the load move set.
@@ -3116,7 +3182,7 @@ namespace {
        }
  
        // Before removing I, set the iterator to the next instruction.
-      PI = llvm::next(BasicBlock::iterator(I));
+      PI = std::next(BasicBlock::iterator(I));
        if (cast<Instruction>(PI) == J)
          ++PI;
  
@@ -3136,10 +3202,14 @@ namespace {
  char BBVectorize::ID = 0;
  static const char bb_vectorize_name[] = "Basic-Block Vectorization";
  INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
  INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
  
  BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
@@ -3148,7 +3218,7 @@ BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
  
  bool
  llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
-  BBVectorize BBVectorizer(P, C);
+  BBVectorize BBVectorizer(P, *BB.getParent(), C);
    return BBVectorizer.vectorizeBB(BB);
  }
  
@@ -3161,6 +3231,7 @@ VectorizeConfig::VectorizeConfig() {
    VectorizePointers = !::NoPointers;
    VectorizeCasts = !::NoCasts;
    VectorizeMath = !::NoMath;
+  VectorizeBitManipulations = !::NoBitManipulation;
    VectorizeFMA = !::NoFMA;
    VectorizeSelect = !::NoSelect;
    VectorizeCmp = !::NoCmp;