static const unsigned RecursionMaxDepth = 12;
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the LLVM benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
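+// Both limits are applied in the memory-dependency calculation below; see the
+// comments there for how they interact.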
+
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important things to filter here are types which are invalid in
+/// LLVM vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// inevitably be scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
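+// Illustrative only: f32 is a valid element type while x86_fp80 is rejected.
+//   isValidElementType(Type::getFloatTy(Ctx));    // true
+//   isValidElementType(Type::getX86_FP80Ty(Ctx)); // false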
+
/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
MD = MDNode::getMostGenericTBAA(MD, IMD);
break;
case LLVMContext::MD_alias_scope:
+ MD = MDNode::getMostGenericAliasScope(MD, IMD);
+ break;
case LLVMContext::MD_noalias:
MD = MDNode::intersect(MD, IMD);
break;
return true;
}
-static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
-
- SmallVector<Value *, 16> OrigLeft, OrigRight;
-
- bool AllSameOpcodeLeft = true;
- bool AllSameOpcodeRight = true;
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
-
- OrigLeft.push_back(V0);
- OrigRight.push_back(V1);
-
- Instruction *I0 = dyn_cast<Instruction>(V0);
- Instruction *I1 = dyn_cast<Instruction>(V1);
-
- // Check whether all operands on one side have the same opcode. In this case
- // we want to preserve the original order and not make things worse by
- // reordering.
- AllSameOpcodeLeft = I0;
- AllSameOpcodeRight = I1;
-
- if (i && AllSameOpcodeLeft) {
- if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
- if(P0->getOpcode() != I0->getOpcode())
- AllSameOpcodeLeft = false;
- } else
- AllSameOpcodeLeft = false;
- }
- if (i && AllSameOpcodeRight) {
- if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
- if(P1->getOpcode() != I1->getOpcode())
- AllSameOpcodeRight = false;
- } else
- AllSameOpcodeRight = false;
- }
-
- // Sort two opcodes. In the code below we try to preserve the ability to use
- // broadcast of values instead of individual inserts.
- // vl1 = load
- // vl2 = phi
- // vr1 = load
- // vr2 = vr2
- // = vl1 x vr1
- // = vl2 x vr2
- // If we just sorted according to opcode we would leave the first line in
- // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
- // = vl1 x vr1
- // = vr2 x vl2
- // Because vr2 and vr1 are from the same load we loose the opportunity of a
- // broadcast for the packed right side in the backend: we have [vr1, vl2]
- // instead of [vr1, vr2=vr1].
- if (I0 && I1) {
- if(!i && I0->getOpcode() > I1->getOpcode()) {
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
- // Try not to destroy a broad cast for no apparent benefit.
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
- // Try preserve broadcasts.
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
- // Try preserve broadcasts.
- Left.push_back(I1);
- Right.push_back(I0);
- } else {
- Left.push_back(I0);
- Right.push_back(I1);
- }
- continue;
- }
- // One opcode, put the instruction on the right.
- if (I0) {
- Left.push_back(V1);
- Right.push_back(I0);
- continue;
- }
- Left.push_back(V0);
- Right.push_back(V1);
- }
-
- bool LeftBroadcast = isSplat(Left);
- bool RightBroadcast = isSplat(Right);
-
- // Don't reorder if the operands where good to begin with.
- if (!(LeftBroadcast || RightBroadcast) &&
- (AllSameOpcodeRight || AllSameOpcodeLeft)) {
- Left = OrigLeft;
- Right = OrigRight;
- }
-}
-
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
return AliasAnalysis::Location();
}
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
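+// Note that isAliased() below only queries AliasAnalysis when both
+// instructions are simple; non-simple accesses are conservatively treated
+// as aliasing.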
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree();
+ /// \brief Reorder commutative operands in an alt shuffle if doing so
+ /// results in vectorized code.
+ void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ /// \brief Reorder commutative operands to get a better probability of
+ /// generating vectorized code.
+ void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
}
AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
bool aliased = true;
- if (Loc1.Ptr && Loc2.Ptr) {
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
/// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I) {
+ I->removeFromParent();
+ I->dropAllReferences();
+ DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ }
+
+ /// Temporary store for deleted instructions. The instructions are deleted
+ /// when the BoUpSLP object is destructed.
+ SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
UserList ExternalUses;
};
/// Attaches the BlockScheduling structures to basic blocks.
- DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
- if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderAltShuffleOperands(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
// We only vectorize tiny trees if it is fully vectorizable.
if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
- if (!VectorizableTree.size()) {
+ if (VectorizableTree.empty()) {
assert(!ExternalUses.size() && "We should not have any external users");
}
return INT_MAX;
return X == PtrSCEVB;
}
+// Reorder commutative operations in an alternate shuffle if the resulting
+// vectors are consecutive loads. This would allow us to vectorize the tree.
+// If we have something like:
+// load a[0] - load b[0]
+// load b[1] + load a[1]
+// load a[2] - load b[2]
+// load a[3] + load b[3]
+// Reordering the second row (load b[1], load a[1]) allows us to vectorize
+// this code.
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ // Push left and right operands of binary operation into Left and Right
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
+ Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ }
+
+ // Reorder if we have a commutative operation and the consecutive accesses
+ // are on either side of the alternate instructions.
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
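+ // Symmetric case: a load on the right side of row j paired with a load on
+ // the left side of row j + 1.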
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ }
+}
+
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+ bool AllSameOpcodeLeft = true;
+ bool AllSameOpcodeRight = true;
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ Value *VLeft = I->getOperand(0);
+ Value *VRight = I->getOperand(1);
+
+ OrigLeft.push_back(VLeft);
+ OrigRight.push_back(VRight);
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // Check whether all operands on one side have the same opcode. In this case
+ // we want to preserve the original order and not make things worse by
+ // reordering.
+ if (i && AllSameOpcodeLeft && ILeft) {
+ if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
+ if (PLeft->getOpcode() != ILeft->getOpcode())
+ AllSameOpcodeLeft = false;
+ } else
+ AllSameOpcodeLeft = false;
+ }
+ if (i && AllSameOpcodeRight && IRight) {
+ if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
+ if (PRight->getOpcode() != IRight->getOpcode())
+ AllSameOpcodeRight = false;
+ } else
+ AllSameOpcodeRight = false;
+ }
+
+ // Sort two opcodes. In the code below we try to preserve the ability to use
+ // broadcast of values instead of individual inserts.
+ // vl1 = load
+ // vl2 = phi
+ // vr1 = load
+ // vr2 = vr1
+ // = vl1 x vr1
+ // = vl2 x vr2
+ // If we just sorted according to opcode we would leave the first line
+ // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+ // = vl1 x vr1
+ // = vr2 x vl2
+ // Because vr2 and vr1 are from the same load we lose the opportunity of a
+ // broadcast for the packed right side in the backend: we have [vr1, vl2]
+ // instead of [vr1, vr2=vr1].
+ if (ILeft && IRight) {
+ if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
+ Right[i - 1] != IRight) {
+ // Try not to destroy a broadcast for no apparent benefit.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+ Right[i - 1] == ILeft) {
+ // Try to preserve broadcasts.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+ Left[i - 1] == IRight) {
+ // Try to preserve broadcasts.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else {
+ Left.push_back(ILeft);
+ Right.push_back(IRight);
+ }
+ continue;
+ }
+ // Only one operand is an instruction: put it on the right.
+ if (ILeft) {
+ Left.push_back(VRight);
+ Right.push_back(ILeft);
+ continue;
+ }
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
+ }
+
+ bool LeftBroadcast = isSplat(Left);
+ bool RightBroadcast = isSplat(Right);
+
+ // If the operands end up being broadcast, return with this operand order.
+ if (LeftBroadcast || RightBroadcast)
+ return;
+
+ // Don't reorder if the operands were good to begin with.
+ if (AllSameOpcodeRight || AllSameOpcodeLeft) {
+ Left = OrigLeft;
+ Right = OrigRight;
+ }
+
+ // Finally check if we can get a longer vectorizable chain by reordering
+ // without breaking the good operand order detected above.
+ // E.g. if we have something like:
+ // load a[0]  load b[0]
+ // load b[1]  load a[1]
+ // load a[2]  load b[2]
+ // load a[3]  load b[3]
+ // Reordering the second row (load b[1], load a[1]) allows us to vectorize
+ // this code and we still retain the AllSameOpcode property.
+ // FIXME: This load reordering might break AllSameOpcode in some rare cases
+ // such as:
+ // add a[0],c[0]  load b[0]
+ // add a[1],c[2]  load b[1]
+ // b[2]           load b[2]
+ // add a[3],c[3]  load b[3]
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ if (isConsecutiveAccess(L, L1)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
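+ // Symmetric case: a load on the right side of row j paired with a load on
+ // the left side of row j + 1.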
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ if (isConsecutiveAccess(L, L1)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ // else unchanged
+ }
+}
+
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
Instruction *VL0 = cast<Instruction>(VL[0]);
BasicBlock::iterator NextInst = VL0;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- }
+ assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Scalar->replaceAllUsesWith(Undef);
}
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- cast<Instruction>(Scalar)->eraseFromParent();
+ eraseInstruction(cast<Instruction>(Scalar));
}
}
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- In->eraseFromParent();
+ eraseInstruction(In);
In = nullptr;
break;
}
Instruction *SrcInst = BundleMember->Inst;
AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
while (DepDest) {
assert(isInSchedulingRegion(DepDest));
- if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
- if (SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)) {
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
}
}
DepDest = DepDest->NextLoadStore;
+
+ // Example, explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
}
}
}
SE = &getAnalysis<ScalarEvolution>();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- TTI = &getAnalysis<TargetTransformInfo>();
- TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AliasAnalysis>();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// store instructions.
BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
// Scan the blocks in the function in post order.
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
e = po_end(&F.getEntryBlock()); it != e; ++it) {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<AliasAnalysis>();
- AU.addRequired<TargetTransformInfo>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfo>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
// Check that the pointer points to scalars.
Type *Ty = SI->getValueOperand()->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ if (!isValidElementType(Ty))
continue;
// Find the base pointer.
for (int i = 0, e = VL.size(); i < e; ++i) {
Type *Ty = VL[i]->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ if (!isValidElementType(Ty))
return false;
Instruction *Inst = dyn_cast<Instruction>(VL[i]);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
Type *Ty = B->getType();
- if (Ty->isVectorTy())
+ if (!isValidElementType(Ty))
return false;
ReductionOpcode = B->getOpcode();
// and the iterator may become invalid value.
it = BB->begin();
e = BB->end();
+ break;
}
}
}
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)