[LAA] Merge memchecks for accesses separated by a constant offset

[oota-llvm.git] / lib / Analysis / LoopAccessAnalysis.cpp
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp

index 3e86cafd14fc141aa88cdfa61a986c48ab141dad..65a258698e47294f304b7626f31f749c786d2fbc 100644 (file)
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -15,12 +15,14 @@
  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DiagnosticInfo.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/VectorUtils.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;
  
  #define DEBUG_TYPE "loop-accesses"
@@ -29,7 +31,7 @@ static cl::opt<unsigned, true>
  VectorizationFactor("force-vector-width", cl::Hidden,
                      cl::desc("Sets the SIMD width. Zero is autoselect."),
                      cl::location(VectorizerParams::VectorizationFactor));
-unsigned VectorizerParams::VectorizationFactor = 0;
+unsigned VectorizerParams::VectorizationFactor;
  
  static cl::opt<unsigned, true>
  VectorizationInterleave("force-vector-interleave", cl::Hidden,
@@ -37,15 +39,32 @@ VectorizationInterleave("force-vector-interleave", cl::Hidden,
                                   "Zero is autoselect."),
                          cl::location(
                              VectorizerParams::VectorizationInterleave));
-unsigned VectorizerParams::VectorizationInterleave = 0;
-
-/// When performing memory disambiguation checks at runtime do not make more
-/// than this number of comparisons.
-const unsigned VectorizerParams::RuntimeMemoryCheckThreshold = 8;
+unsigned VectorizerParams::VectorizationInterleave;
+
+static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
+    "runtime-memory-check-threshold", cl::Hidden,
+    cl::desc("When performing memory disambiguation checks at runtime do not "
+             "generate more than this number of comparisons (default = 8)."),
+    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
+unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
+
+/// \brief The maximum number of comparisons made when merging memory checks.
+static cl::opt<unsigned> MemoryCheckMergeThreshold(
+    "memory-check-merge-threshold", cl::Hidden,
+    cl::desc("Maximum number of comparisons done when trying to merge "
+             "runtime memory checks. (default = 100)"),
+    cl::init(100));
  
  /// Maximum SIMD width.
  const unsigned VectorizerParams::MaxVectorWidth = 64;
  
+/// \brief We collect interesting dependences up to this threshold.
+static cl::opt<unsigned> MaxInterestingDependence(
+    "max-interesting-dependences", cl::Hidden,
+    cl::desc("Maximum number of interesting dependences collected by "
+             "loop-access analysis (default = 100)"),
+    cl::init(100));
+
  bool VectorizerParams::isInterleaveForced() {
    return ::VectorizationInterleave.getNumOccurrences() > 0;
  }
@@ -69,14 +88,15 @@ Value *llvm::stripIntegerCast(Value *V) {
  }
  
  const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
-                                            ValueToValueMap &PtrToStride,
+                                            const ValueToValueMap &PtrToStride,
                                              Value *Ptr, Value *OrigPtr) {
  
    const SCEV *OrigSCEV = SE->getSCEV(Ptr);
  
    // If there is an entry in the map return the SCEV of the pointer with the
    // symbolic stride replaced by one.
-  ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
+  ValueToValueMap::const_iterator SI =
+      PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
    if (SI != PtrToStride.end()) {
      Value *StrideVal = SI->second;
  
@@ -99,11 +119,9 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
    return SE->getSCEV(Ptr);
  }
  
-void LoopAccessInfo::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp,
-                                                 Value *Ptr, bool WritePtr,
-                                                 unsigned DepSetId,
-                                                 unsigned ASId,
-                                                 ValueToValueMap &Strides) {
+void LoopAccessInfo::RuntimePointerCheck::insert(
+    Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId,
+    const ValueToValueMap &Strides) {
    // Get the stride replaced scev.
    const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
@@ -116,10 +134,140 @@ void LoopAccessInfo::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp,
    IsWritePtr.push_back(WritePtr);
    DependencySetId.push_back(DepSetId);
    AliasSetId.push_back(ASId);
+  Exprs.push_back(Sc);
+}
+
+bool LoopAccessInfo::RuntimePointerCheck::needsChecking(
+    const CheckingPtrGroup &M, const CheckingPtrGroup &N,
+    const SmallVectorImpl<int> *PtrPartition) const {
+  for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
+    for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
+      if (needsChecking(M.Members[I], N.Members[J], PtrPartition))
+        return true;
+  return false;
+}
+
+/// Return the smaller of \p I and \p J (\p I when they are equal).
+/// Return nullptr if the difference J - I is not a SCEV constant.
+static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
+                                   ScalarEvolution *SE) {
+  const SCEV *Diff = SE->getMinusSCEV(J, I);
+  const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
+
+  if (!C)
+    return nullptr;
+  if (C->getValue()->isNegative())
+    return J;
+  return I;
+}
+
+bool LoopAccessInfo::RuntimePointerCheck::CheckingPtrGroup::addPointer(
+    unsigned Index) {
+  // Compare the start and end of this pointer against the current Low and
+  // High bounds of the group. Both comparisons must be decidable before we
+  // change anything, since the bounds are what the memchecks are built on.
+  const SCEV *Min0 = getMinFromExprs(RtCheck.Starts[Index], Low, RtCheck.SE);
+  if (!Min0)
+    return false;
+
+  const SCEV *Min1 = getMinFromExprs(RtCheck.Ends[Index], High, RtCheck.SE);
+  if (!Min1)
+    return false;
+
+  // Update the low bound expression if we've found a new min value.
+  if (Min0 == RtCheck.Starts[Index])
+    Low = RtCheck.Starts[Index];
+
+  // Update the high bound expression if the old High was the smaller one.
+  if (Min1 != RtCheck.Ends[Index])
+    High = RtCheck.Ends[Index];
+
+  Members.push_back(Index);
+  return true;
+}
+
+void LoopAccessInfo::RuntimePointerCheck::groupChecks(
+    MemoryDepChecker::DepCandidates &DepCands,
+    bool UseDependencies) {
+  // We build the groups from dependency candidates equivalence classes
+  // because:
+  //    - We know that pointers in the same equivalence class share
+  //      the same underlying object and therefore there is a chance
+  //      that we can compare pointers
+  //    - We wouldn't be able to merge two pointers for which we need
+  //      to emit a memcheck. The classes in DepCands are already
+  //      conveniently built such that no two pointers in the same
+  //      class need checking against each other.
+
+  // We use the following (greedy) algorithm to construct the groups:
+  // For every pointer in the equivalence class, try each existing group:
+  //   - if the difference between this pointer and the min/max bounds
+  //     of the group is a constant, make the pointer part of that group
+  //     and update the group's min/max bounds as required.
+  // A pointer that joins no group starts a new group of its own.
+
+  CheckingGroups.clear();
+
+  // If we don't have the dependency partitions, construct a new
+  // checking pointer group for each pointer.
+  if (!UseDependencies) {
+    for (unsigned I = 0; I < Pointers.size(); ++I)
+      CheckingGroups.push_back(CheckingPtrGroup(I, *this));
+    return;
+  }
+
+  unsigned TotalComparisons = 0;
+
+  DenseMap<Value *, unsigned> PositionMap;
+  for (unsigned Pointer = 0; Pointer < Pointers.size(); ++Pointer)
+    PositionMap[Pointers[Pointer]] = Pointer;
+
+  // Go through all equivalence classes, get the "pointer check groups"
+  // and add them to the overall solution.
+  for (auto DI = DepCands.begin(), DE = DepCands.end(); DI != DE; ++DI) {
+    if (!DI->isLeader())
+      continue;
+
+    SmallVector<CheckingPtrGroup, 2> Groups;
+
+    for (auto MI = DepCands.member_begin(DI), ME = DepCands.member_end();
+         MI != ME; ++MI) {
+      unsigned Pointer = PositionMap[MI->getPointer()];
+      bool Merged = false;
+
+      // Go through all the existing groups and see if we can find one
+      // which can include this pointer.
+      for (CheckingPtrGroup &Group : Groups) {
+        // Don't perform more than a certain amount of comparisons.
+        // This should limit the cost of grouping the pointers to something
+        // reasonable.  If we do end up hitting this threshold, the algorithm
+        // will create separate groups for all remaining pointers.
+        if (TotalComparisons > MemoryCheckMergeThreshold)
+          break;
+
+        TotalComparisons++;
+
+        if (Group.addPointer(Pointer)) {
+          Merged = true;
+          break;
+        }
+      }
+
+      if (!Merged)
+        // We couldn't add this pointer to any existing group or the threshold
+        // for the number of comparisons has been reached. Create a new group
+        // to hold the current pointer.
+        Groups.push_back(CheckingPtrGroup(Pointer, *this));
+    }
+
+    // We've computed the grouped checks for this equivalence class.
+    // Save the results and continue with the next one.
+    std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups));
+  }
  }
  
-bool LoopAccessInfo::RuntimePointerCheck::needsChecking(unsigned I,
-                                                        unsigned J) const {
+bool LoopAccessInfo::RuntimePointerCheck::needsChecking(
+    unsigned I, unsigned J, const SmallVectorImpl<int> *PtrPartition) const {
    // No need to check if two readonly pointers intersect.
    if (!IsWritePtr[I] && !IsWritePtr[J])
      return false;
@@ -132,9 +280,86 @@ bool LoopAccessInfo::RuntimePointerCheck::needsChecking(unsigned I,
    if (AliasSetId[I] != AliasSetId[J])
      return false;
  
+  // If PtrPartition is set omit checks between pointers of the same partition.
+  // Partition number -1 means that the pointer is used in multiple partitions.
+  // In this case we can't omit the check.
+  if (PtrPartition && (*PtrPartition)[I] != -1 &&
+      (*PtrPartition)[I] == (*PtrPartition)[J])
+    return false;
+
    return true;
  }
  
+void LoopAccessInfo::RuntimePointerCheck::print(
+    raw_ostream &OS, unsigned Depth,
+    const SmallVectorImpl<int> *PtrPartition) const {
+
+  OS.indent(Depth) << "Run-time memory checks:\n";
+
+  unsigned N = 0;
+  for (unsigned I = 0; I < CheckingGroups.size(); ++I)
+    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J)
+      if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) {
+        OS.indent(Depth) << "Check " << N++ << ":\n";
+        OS.indent(Depth + 2) << "Comparing group " << I << ":\n";
+
+        for (unsigned K = 0; K < CheckingGroups[I].Members.size(); ++K) {
+          OS.indent(Depth + 2) << *Pointers[CheckingGroups[I].Members[K]]
+                               << "\n";
+          if (PtrPartition)
+            OS << " (Partition: "
+               << (*PtrPartition)[CheckingGroups[I].Members[K]] << ")"
+               << "\n";
+        }
+
+        OS.indent(Depth + 2) << "Against group " << J << ":\n";
+
+        for (unsigned K = 0; K < CheckingGroups[J].Members.size(); ++K) {
+          OS.indent(Depth + 2) << *Pointers[CheckingGroups[J].Members[K]]
+                               << "\n";
+          if (PtrPartition)
+            OS << " (Partition: "
+               << (*PtrPartition)[CheckingGroups[J].Members[K]] << ")"
+               << "\n";
+        }
+      }
+
+  OS.indent(Depth) << "Grouped accesses:\n";
+  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
+    OS.indent(Depth + 2) << "Group " << I << ":\n";
+    OS.indent(Depth + 4) << "(Low: " << *CheckingGroups[I].Low
+                         << " High: " << *CheckingGroups[I].High << ")\n";
+    for (unsigned J = 0; J < CheckingGroups[I].Members.size(); ++J) {
+      OS.indent(Depth + 6) << "Member: " << *Exprs[CheckingGroups[I].Members[J]]
+                           << "\n";
+    }
+  }
+}
+
+unsigned LoopAccessInfo::RuntimePointerCheck::getNumberOfChecks(
+    const SmallVectorImpl<int> *PtrPartition) const {
+
+  unsigned NumPartitions = CheckingGroups.size();
+  unsigned CheckCount = 0;
+
+  for (unsigned I = 0; I < NumPartitions; ++I)
+    for (unsigned J = I + 1; J < NumPartitions; ++J)
+      if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition))
+        CheckCount++;
+  return CheckCount;
+}
+
+bool LoopAccessInfo::RuntimePointerCheck::needsAnyChecking(
+    const SmallVectorImpl<int> *PtrPartition) const {
+  unsigned NumPointers = Pointers.size();
+
+  for (unsigned I = 0; I < NumPointers; ++I)
+    for (unsigned J = I + 1; J < NumPointers; ++J)
+      if (needsChecking(I, J, PtrPartition))
+        return true;
+  return false;
+}
+
  namespace {
  /// \brief Analyses memory accesses in a loop.
  ///
@@ -146,34 +371,32 @@ public:
    typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
    typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
  
-  /// \brief Set of potential dependent memory accesses.
-  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
-
-  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
-    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
+  AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
+                 MemoryDepChecker::DepCandidates &DA)
+      : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckNeeded(false) {}
  
    /// \brief Register a load  and whether it is only read from.
-  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+  void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
      Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
      Accesses.insert(MemAccessInfo(Ptr, false));
      if (IsReadOnly)
        ReadOnlyPtr.insert(Ptr);
    }
  
    /// \brief Register a store.
-  void addStore(AliasAnalysis::Location &Loc) {
+  void addStore(MemoryLocation &Loc) {
      Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
      Accesses.insert(MemAccessInfo(Ptr, true));
    }
  
    /// \brief Check whether we can check the pointers at runtime for
-  /// non-intersection.
+  /// non-intersection. Returns true when we have 0 pointers
+  /// (a check on 0 pointers for non-intersection will always return true).
    bool canCheckPtrAtRT(LoopAccessInfo::RuntimePointerCheck &RtCheck,
-                       unsigned &NumComparisons,
-                       ScalarEvolution *SE, Loop *TheLoop,
-                       ValueToValueMap &Strides,
+                       bool &NeedRTCheck, ScalarEvolution *SE, Loop *TheLoop,
+                       const ValueToValueMap &Strides,
                         bool ShouldCheckStride = false);
  
    /// \brief Goes over all memory accesses, checks whether a RT check is needed
@@ -185,7 +408,12 @@ public:
    bool isRTCheckNeeded() { return IsRTCheckNeeded; }
  
    bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
-  void resetDepChecks() { CheckDeps.clear(); }
+
+  /// We decided that no dependence analysis would be used.  Reset the state.
+  void resetDepChecks(MemoryDepChecker &DepChecker) {
+    CheckDeps.clear();
+    DepChecker.clearInterestingDependences();
+  }
  
    MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
  
@@ -199,22 +427,24 @@ private:
    /// Set of all accesses.
    PtrAccessSet Accesses;
  
+  const DataLayout &DL;
+
    /// Set of accesses that need a further dependence check.
    MemAccessInfoSet CheckDeps;
  
    /// Set of pointers that are read only.
    SmallPtrSet<Value*, 16> ReadOnlyPtr;
  
-  const DataLayout *DL;
-
    /// An alias set tracker to partition the access set by underlying object and
    //intrinsic property (such as TBAA metadata).
    AliasSetTracker AST;
  
+  LoopInfo *LI;
+
    /// Sets of potentially dependent accesses - members of one set share an
    /// underlying pointer. The set "CheckDeps" identfies which sets really need a
    /// dependence check.
-  DepCandidates &DepCands;
+  MemoryDepChecker::DepCandidates &DepCands;
  
    bool IsRTCheckNeeded;
  };
@@ -222,8 +452,8 @@ private:
  } // end anonymous namespace
  
  /// \brief Check whether a pointer can participate in a runtime bounds check.
-static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
-                                Value *Ptr) {
+static bool hasComputableBounds(ScalarEvolution *SE,
+                                const ValueToValueMap &Strides, Value *Ptr) {
    const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
    if (!AR)
@@ -232,29 +462,23 @@ static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
    return AR->isAffine();
  }
  
-/// \brief Check the stride of the pointer and ensure that it does not wrap in
-/// the address space.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap);
-
  bool AccessAnalysis::canCheckPtrAtRT(
-    LoopAccessInfo::RuntimePointerCheck &RtCheck,
-    unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
-    ValueToValueMap &StridesMap, bool ShouldCheckStride) {
+    LoopAccessInfo::RuntimePointerCheck &RtCheck, bool &NeedRTCheck,
+    ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap,
+    bool ShouldCheckStride) {
    // Find pointers with computable bounds. We are going to use this information
    // to place a runtime bound check.
    bool CanDoRT = true;
  
+  NeedRTCheck = false;
+  if (!IsRTCheckNeeded) return true;
+
    bool IsDepCheckNeeded = isDependencyCheckNeeded();
-  NumComparisons = 0;
  
    // We assign a consecutive id to access from different alias sets.
    // Accesses between different groups doesn't need to be checked.
    unsigned ASId = 1;
    for (auto &AS : AST) {
-    unsigned NumReadPtrChecks = 0;
-    unsigned NumWritePtrChecks = 0;
-
      // We assign consecutive id to access from different dependence sets.
      // Accesses within the same set don't need a runtime check.
      unsigned RunningDepId = 1;
@@ -265,16 +489,11 @@ bool AccessAnalysis::canCheckPtrAtRT(
        bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
        MemAccessInfo Access(Ptr, IsWrite);
  
-      if (IsWrite)
-        ++NumWritePtrChecks;
-      else
-        ++NumReadPtrChecks;
-
        if (hasComputableBounds(SE, StridesMap, Ptr) &&
-          // When we run after a failing dependency check we have to make sure we
-          // don't have wrapping pointers.
+          // When we run after a failing dependency check we have to make sure
+          // we don't have wrapping pointers.
            (!ShouldCheckStride ||
-           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
+           isStridedPtr(SE, Ptr, TheLoop, StridesMap) == 1)) {
          // The id of the dependence set.
          unsigned DepId;
  
@@ -288,24 +507,24 @@ bool AccessAnalysis::canCheckPtrAtRT(
            // Each access has its own dependence set.
            DepId = RunningDepId++;
  
-        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+        RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
  
          DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
        } else {
+        DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
          CanDoRT = false;
        }
      }
  
-    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
-      NumComparisons += 0; // Only one dependence set.
-    else {
-      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
-                                              NumWritePtrChecks - 1));
-    }
-
      ++ASId;
    }
  
+  // We need a runtime check if there are any accesses that need checking.
+  // However, some accesses cannot be checked (for example because we
+  // can't determine their bounds). In these cases we would need a check
+  // but wouldn't be able to add it.
+  NeedRTCheck = !CanDoRT || RtCheck.needsAnyChecking(nullptr);
+
    // If the pointers that we would use for the bounds comparison have different
    // address spaces, assume the values aren't directly comparable, so we can't
    // use them for the runtime check. We also have to assume they could
@@ -334,6 +553,9 @@ bool AccessAnalysis::canCheckPtrAtRT(
      }
    }
  
+  if (NeedRTCheck && CanDoRT)
+    RtCheck.groupChecks(DepCands, IsDepCheckNeeded);
+
    return CanDoRT;
  }
  
@@ -344,7 +566,7 @@ void AccessAnalysis::processMemAccesses() {
  
    DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
    DEBUG(dbgs() << "  AST: "; AST.dump());
-  DEBUG(dbgs() << "LAA:   Accesses:\n");
+  DEBUG(dbgs() << "LAA:   Accesses(" << Accesses.size() << "):\n");
    DEBUG({
      for (auto A : Accesses)
        dbgs() << "\t" << *A.getPointer() << " (" <<
@@ -427,7 +649,9 @@ void AccessAnalysis::processMemAccesses() {
            // underlying object.
            typedef SmallVector<Value *, 16> ValueVector;
            ValueVector TempObjects;
-          GetUnderlyingObjects(Ptr, TempObjects, DL);
+
+          GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
+          DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n");
            for (Value *UnderlyingObj : TempObjects) {
              UnderlyingObjToAccessMap::iterator Prev =
                  ObjToLastAccess.find(UnderlyingObj);
@@ -435,6 +659,7 @@ void AccessAnalysis::processMemAccesses() {
                DepCands.unionSets(Access, Prev->second);
  
              ObjToLastAccess[UnderlyingObj] = Access;
+            DEBUG(dbgs() << "  " << *UnderlyingObj << "\n");
            }
          }
        }
@@ -442,133 +667,63 @@ void AccessAnalysis::processMemAccesses() {
    }
  }
  
-namespace {
-/// \brief Checks memory dependences among accesses to the same underlying
-/// object to determine whether there vectorization is legal or not (and at
-/// which vectorization factor).
-///
-/// This class works under the assumption that we already checked that memory
-/// locations with different underlying pointers are "must-not alias".
-/// We use the ScalarEvolution framework to symbolically evalutate access
-/// functions pairs. Since we currently don't restructure the loop we can rely
-/// on the program order of memory accesses to determine their safety.
-/// At the moment we will only deem accesses as safe for:
-///  * A negative constant distance assuming program order.
-///
-///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
-///            a[i] = tmp;                y = a[i];
-///
-///   The latter case is safe because later checks guarantuee that there can't
-///   be a cycle through a phi node (that is, we check that "x" and "y" is not
-///   the same variable: a header phi can only be an induction or a reduction, a
-///   reduction can't have a memory sink, an induction can't have a memory
-///   source). This is important and must not be violated (or we have to
-///   resort to checking for cycles through memory).
-///
-///  * A positive constant distance assuming program order that is bigger
-///    than the biggest memory access.
-///
-///     tmp = a[i]        OR              b[i] = x
-///     a[i+2] = tmp                      y = b[i+2];
-///
-///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
-///
-///  * Zero distances and all accesses have the same size.
-///
-class MemoryDepChecker {
-public:
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
-      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
-        ShouldRetryWithRuntimeCheck(false) {}
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(StoreInst *SI) {
-    Value *Ptr = SI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
-    InstMap.push_back(SI);
-    ++AccessIdx;
-  }
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(LoadInst *LI) {
-    Value *Ptr = LI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
-    InstMap.push_back(LI);
-    ++AccessIdx;
-  }
+static bool isInBoundsGep(Value *Ptr) {
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+    return GEP->isInBounds();
+  return false;
+}
  
-  /// \brief Check whether the dependencies between the accesses are safe.
-  ///
-  /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
+/// \brief Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
+/// i.e. monotonically increasing/decreasing.
+static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
+                           ScalarEvolution *SE, const Loop *L) {
+  // FIXME: This should probably only return true for NUW.
+  if (AR->getNoWrapFlags(SCEV::NoWrapMask))
+    return true;
  
-  /// \brief The maximum number of bytes of a vector register we can vectorize
-  /// the accesses safely with.
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+  // Scalar evolution does not propagate the non-wrapping flags to values that
+  // are derived from a non-wrapping induction variable because non-wrapping
+  // could be flow-sensitive.
+  //
+  // Look through the potentially overflowing instruction to try to prove
+  // non-wrapping for the *specific* value of Ptr.
  
-  /// \brief In same cases when the dependency check fails we can still
-  /// vectorize the loop with a dynamic array access check.
-  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+  // The arithmetic implied by an inbounds GEP can't overflow.
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP || !GEP->isInBounds())
+    return false;
  
-private:
-  ScalarEvolution *SE;
-  const DataLayout *DL;
-  const Loop *InnermostLoop;
-
-  /// \brief Maps access locations (ptr, read/write) to program order.
-  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
-
-  /// \brief Memory access instructions in program order.
-  SmallVector<Instruction *, 16> InstMap;
-
-  /// \brief The program order index to be used for the next instruction.
-  unsigned AccessIdx;
-
-  // We can access this many bytes in parallel safely.
-  unsigned MaxSafeDepDistBytes;
-
-  /// \brief If we see a non-constant dependence distance we can still try to
-  /// vectorize this loop with runtime checks.
-  bool ShouldRetryWithRuntimeCheck;
-
-  /// \brief Check whether there is a plausible dependence between the two
-  /// accesses.
-  ///
-  /// Access \p A must happen before \p B in program order. The two indices
-  /// identify the index into the program order map.
-  ///
-  /// This function checks  whether there is a plausible dependence (or the
-  /// absence of such can't be proved) between the two accesses. If there is a
-  /// plausible dependence but the dependence distance is bigger than one
-  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
-  /// distance is smaller than any other distance encountered so far).
-  /// Otherwise, this function returns true signaling a possible dependence.
-  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
-                   const MemAccessInfo &B, unsigned BIdx,
-                   ValueToValueMap &Strides);
-
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
-};
+  // Make sure there is only one non-const index and analyze that.
+  Value *NonConstIndex = nullptr;
+  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+    if (!isa<ConstantInt>(*Index)) {
+      if (NonConstIndex)
+        return false;
+      NonConstIndex = *Index;
+    }
+  if (!NonConstIndex)
+    // The recurrence is on the pointer, ignore for now.
+    return false;
  
-} // end anonymous namespace
+  // The index in GEP is signed.  It is non-wrapping if it's derived from a NSW
+  // AddRec using a NSW operation.
+  if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
+    if (OBO->hasNoSignedWrap() &&
+        // Assume the other operand is constant so that the AddRec can be
+        // easily found.
+        isa<ConstantInt>(OBO->getOperand(1))) {
+      auto *OpScev = SE->getSCEV(OBO->getOperand(0));
+
+      if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
+        return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
+    }
  
-static bool isInBoundsGep(Value *Ptr) {
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-    return GEP->isInBounds();
    return false;
  }
  
  /// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap) {
+int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
+                       const ValueToValueMap &StridesMap) {
    const Type *Ty = Ptr->getType();
    assert(Ty->isPointerTy() && "Unexpected non-ptr");
  
@@ -603,7 +758,7 @@ static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
    // to access the pointer value "0" which is undefined behavior in address
    // space 0, therefore we can also vectorize this case.
    bool IsInBoundsGEP = isInBoundsGep(Ptr);
-  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+  bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, SE, Lp);
    bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
    if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
      DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
@@ -622,7 +777,8 @@ static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
      return 0;
    }
  
-  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
+  auto &DL = Lp->getHeader()->getModule()->getDataLayout();
+  int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
    const APInt &APStepVal = C->getValue()->getValue();
  
    // Huge step value - give up.
@@ -647,6 +803,54 @@ static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
    return Stride;
  }
  
+bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
+  switch (Type) {
+  case Unknown:
+  case ForwardButPreventsForwarding:
+  case Backward:
+  case BackwardVectorizableButPreventsForwarding:
+    return false;
+  // NoDep, Forward and BackwardVectorizable are the only kinds reported safe.
+  case NoDep:
+  case Forward:
+  case BackwardVectorizable:
+    return true;
+  }
+  llvm_unreachable("unexpected DepType!");
+}
+
+bool MemoryDepChecker::Dependence::isInterestingDependence(DepType Type) {
+  switch (Type) {
+  case BackwardVectorizable:
+  case Unknown:
+  case ForwardButPreventsForwarding:
+  case Backward:
+  case BackwardVectorizableButPreventsForwarding:
+    return true;
+  // NoDep and Forward are the only kinds reported as not interesting.
+  case NoDep:
+  case Forward:
+    return false;
+  }
+  llvm_unreachable("unexpected DepType!");
+}
+
+bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
+  switch (Type) {
+  case Unknown:
+  case BackwardVectorizable:
+  case Backward:
+  case BackwardVectorizableButPreventsForwarding:
+    return true;
+  // NoDep and the two forward kinds are the only ones reported not backward.
+  case NoDep:
+  case Forward:
+  case ForwardButPreventsForwarding:
+    return false;
+  }
+  llvm_unreachable("unexpected DepType!");
+}
+
  bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
                                                      unsigned TypeByteSize) {
    // If loads occur at a distance that is not a multiple of a feasible vector
@@ -686,9 +890,46 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
    return false;
  }
  
-bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
-                                   const MemAccessInfo &B, unsigned BIdx,
-                                   ValueToValueMap &Strides) {
+/// \brief Test whether two accesses sharing the stride \p Stride can overlap.
+/// \p Distance is the positive distance between them and \p TypeByteSize is
+/// the size of the accessed type in bytes.
+///
+/// \returns true if they are independent.
+static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
+                                          unsigned TypeByteSize) {
+  assert(Stride > 1 && "The stride must be greater than 1");
+  assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
+  assert(Distance > 0 && "The distance must be non-zero");
+
+  // A distance that is not a whole number of elements is never reported as
+  // independent.
+  if (Distance % TypeByteSize != 0)
+    return false;
+
+  // Independent exactly when the scaled distance (in elements) is not a
+  // multiple of the stride.  E.g.
+  //      for (i = 0; i < 1024 ; i += 4)
+  //        A[i+2] = A[i] + 1;
+  //
+  // Two accesses in memory (scaled distance is 2, stride is 4):
+  //     | A[0] |      |      |      | A[4] |      |      |      |
+  //     |      |      | A[2] |      |      |      | A[6] |      |
+  //
+  // E.g.
+  //      for (i = 0; i < 1024 ; i += 3)
+  //        A[i+4] = A[i] + 1;
+  //
+  // Two accesses in memory (scaled distance is 4, stride is 3):
+  //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
+  //     |      |      |      |      | A[4] |      |      | A[7] |      |
+  const unsigned DistInElements = Distance / TypeByteSize;
+  return (DistInElements % Stride) != 0;
+}
+
+MemoryDepChecker::Dependence::DepType
+MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+                              const MemAccessInfo &B, unsigned BIdx,
+                              const ValueToValueMap &Strides) {
    assert (AIdx < BIdx && "Must pass arguments in program order");
  
    Value *APtr = A.getPointer();
@@ -698,18 +939,18 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
  
    // Two reads are independent.
    if (!AIsWrite && !BIsWrite)
-    return false;
+    return Dependence::NoDep;
  
    // We cannot check pointers in different address spaces.
    if (APtr->getType()->getPointerAddressSpace() !=
        BPtr->getType()->getPointerAddressSpace())
-    return true;
+    return Dependence::Unknown;
  
    const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
    const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
  
-  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
-  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
+  int StrideAPtr = isStridedPtr(SE, APtr, InnermostLoop, Strides);
+  int StrideBPtr = isStridedPtr(SE, BPtr, InnermostLoop, Strides);
  
    const SCEV *Src = AScev;
    const SCEV *Sink = BScev;
@@ -738,19 +979,20 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
    // the address space.
    if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
      DEBUG(dbgs() << "Non-consecutive pointer access\n");
-    return true;
+    return Dependence::Unknown;
    }
  
    const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
    if (!C) {
      DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
      ShouldRetryWithRuntimeCheck = true;
-    return true;
+    return Dependence::Unknown;
    }
  
    Type *ATy = APtr->getType()->getPointerElementType();
    Type *BTy = BPtr->getType()->getPointerElementType();
-  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+  auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+  unsigned TypeByteSize = DL.getTypeAllocSize(ATy);
  
    // Negative distances are not plausible dependencies.
    const APInt &Val = C->getValue()->getValue();
@@ -759,66 +1001,119 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
      if (IsTrueDataDependence &&
          (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
           ATy != BTy))
-      return true;
+      return Dependence::ForwardButPreventsForwarding;
  
      DEBUG(dbgs() << "LAA: Dependence is negative: NoDep\n");
-    return false;
+    return Dependence::Forward;
    }
  
    // Write to the same location with the same size.
    // Could be improved to assert type sizes are the same (i32 == float, etc).
    if (Val == 0) {
      if (ATy == BTy)
-      return false;
+      return Dependence::NoDep;
      DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n");
-    return true;
+    return Dependence::Unknown;
    }
  
    assert(Val.isStrictlyPositive() && "Expect a positive value");
  
-  // Positive distance bigger than max vectorization factor.
    if (ATy != BTy) {
      DEBUG(dbgs() <<
            "LAA: ReadWrite-Write positive dependency with different types\n");
-    return false;
+    return Dependence::Unknown;
    }
  
    unsigned Distance = (unsigned) Val.getZExtValue();
  
+  unsigned Stride = std::abs(StrideAPtr);
+  if (Stride > 1 &&
+      areStridedAccessesIndependent(Distance, Stride, TypeByteSize))
+    return Dependence::NoDep;
+
    // Bail out early if passed-in parameters make vectorization not feasible.
    unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                             VectorizerParams::VectorizationFactor : 1);
    unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
                             VectorizerParams::VectorizationInterleave : 1);
+  // The minimum number of iterations for a vectorized/unrolled version.
+  unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
+
+  // It's not vectorizable if the distance is smaller than the minimum distance
+  // needed for a vectorized/unrolled version. Vectorizing one iteration in
+  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
+  // TypeByteSize (no need to add the last gap distance).
+  //
+  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+  //      foo(int *A) {
+  //        int *B = (int *)((char *)A + 14);
+  //        for (i = 0 ; i < 1024 ; i += 2)
+  //          B[i] = A[i] + 1;
+  //      }
+  //
+  // Two accesses in memory (stride is 2):
+  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
+  //                              | B[0] |      | B[2] |      | B[4] |
+  //
+  // Distance needed for vectorizing iterations except the last iteration:
+  // 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
+  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
+  //
+  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
+  // 12, which is less than distance.
+  //
+  // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
+  // the minimum distance needed is 28, which is greater than distance. It is
+  // not safe to do vectorization.
+  unsigned MinDistanceNeeded =
+      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
+  if (MinDistanceNeeded > Distance) {
+    DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
+                 << '\n');
+    return Dependence::Backward;
+  }
  
-  // The distance must be bigger than the size needed for a vectorized version
-  // of the operation and the size of the vectorized operation must not be
-  // bigger than the currrent maximum size.
-  if (Distance < 2*TypeByteSize ||
-      2*TypeByteSize > MaxSafeDepDistBytes ||
-      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
-    DEBUG(dbgs() << "LAA: Failure because of Positive distance "
-        << Val.getSExtValue() << '\n');
-    return true;
+  // Unsafe if the minimum distance needed is greater than max safe distance.
+  if (MinDistanceNeeded > MaxSafeDepDistBytes) {
+    DEBUG(dbgs() << "LAA: Failure because it needs at least "
+                 << MinDistanceNeeded << " size in bytes");
+    return Dependence::Backward;
    }
  
-  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
-    Distance : MaxSafeDepDistBytes;
+  // Positive distance bigger than max vectorization factor.
+  // FIXME: Should use max factor instead of max distance in bytes, which could
+  // not handle different types.
+  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+  //      void foo (int *A, char *B) {
+  //        for (unsigned i = 0; i < 1024; i++) {
+  //          A[i+2] = A[i] + 1;
+  //          B[i+2] = B[i] + 1;
+  //        }
+  //      }
+  //
+  // This case is currently unsafe according to the max safe distance. If we
+  // analyze the two accesses on array B, the max safe dependence distance
+  // is 2. Then we analyze the accesses on array A, the minimum distance needed
+  // is 8, which is greater than 2 and forbids vectorization. But actually
+  // both A and B could be vectorized by 2 iterations.
+  MaxSafeDepDistBytes =
+      Distance < MaxSafeDepDistBytes ? Distance : MaxSafeDepDistBytes;
  
    bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
    if (IsTrueDataDependence &&
        couldPreventStoreLoadForward(Distance, TypeByteSize))
-     return true;
+    return Dependence::BackwardVectorizableButPreventsForwarding;
  
-  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() <<
-        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
+  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
+               << " with max VF = "
+               << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n');
  
-  return false;
+  return Dependence::BackwardVectorizable;
  }
  
-bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
                                     MemAccessInfoSet &CheckDeps,
-                                   ValueToValueMap &Strides) {
+                                   const ValueToValueMap &Strides) {
  
    MaxSafeDepDistBytes = -1U;
    while (!CheckDeps.empty()) {
@@ -842,9 +1137,33 @@ bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
               I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
            for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
                 I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
-            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
-              return false;
-            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
+            auto A = std::make_pair(&*AI, *I1);
+            auto B = std::make_pair(&*OI, *I2);
+
+            assert(*I1 != *I2);
+            if (*I1 > *I2)
+              std::swap(A, B);
+
+            Dependence::DepType Type =
+                isDependent(*A.first, A.second, *B.first, B.second, Strides);
+            SafeForVectorization &= Dependence::isSafeForVectorization(Type);
+
+            // Gather dependences unless we accumulated MaxInterestingDependence
+            // dependences.  In that case return as soon as we find the first
+            // unsafe dependence.  This puts a limit on this quadratic
+            // algorithm.
+            if (RecordInterestingDependences) {
+              if (Dependence::isInterestingDependence(Type))
+                InterestingDependences.push_back(
+                    Dependence(A.second, B.second, Type));
+
+              if (InterestingDependences.size() >= MaxInterestingDependence) {
+                RecordInterestingDependences = false;
+                InterestingDependences.clear();
+                DEBUG(dbgs() << "Too many dependences, stopped recording\n");
+              }
+            }
+            if (!RecordInterestingDependences && !SafeForVectorization)
                return false;
            }
          ++OI;
@@ -852,18 +1171,51 @@ bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
        AI++;
      }
    }
-  return true;
+
+  DEBUG(dbgs() << "Total Interesting Dependences: "
+               << InterestingDependences.size() << "\n");
+  return SafeForVectorization;
+}
+
+SmallVector<Instruction *, 4>
+MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
+  // Check the lookup first: dereferencing a failed find() is undefined.
+  auto AccessIt = Accesses.find(MemAccessInfo(Ptr, isWrite));
+  assert(AccessIt != Accesses.end() && "Access was never recorded");
+
+  SmallVector<Instruction *, 4> Insts;
+  for (unsigned Idx : AccessIt->second)
+    Insts.push_back(InstMap[Idx]);
+  return Insts;
+}
+
+const char *MemoryDepChecker::Dependence::DepName[] = { // Indexed by Type.
+    "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
+    "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
+
+void MemoryDepChecker::Dependence::print(
+    raw_ostream &OS, unsigned Depth,
+    const SmallVectorImpl<Instruction *> &Instrs) const {
+  OS.indent(Depth) << DepName[Type] << ":\n"; // Dependence kind as header.
+  OS.indent(Depth + 2) << *Instrs[Source] << " -> \n"; // Source instruction.
+  OS.indent(Depth + 2) << *Instrs[Destination] << "\n"; // Its destination.
  }
  
  bool LoopAccessInfo::canAnalyzeLoop() {
+  // We need to have a loop header.
+  DEBUG(dbgs() << "LAA: Found a loop: " <<
+        TheLoop->getHeader()->getName() << '\n');
+
      // We can only analyze innermost loops.
    if (!TheLoop->empty()) {
+    DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
      emitAnalysis(LoopAccessReport() << "loop is not the innermost loop");
      return false;
    }
  
    // We must have a single backedge.
    if (TheLoop->getNumBackEdges() != 1) {
+    DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
      emitAnalysis(
          LoopAccessReport() <<
          "loop control flow is not understood by analyzer");
@@ -872,6 +1224,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
  
    // We must have a single exiting block.
    if (!TheLoop->getExitingBlock()) {
+    DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
      emitAnalysis(
          LoopAccessReport() <<
          "loop control flow is not understood by analyzer");
@@ -882,16 +1235,13 @@ bool LoopAccessInfo::canAnalyzeLoop() {
    // checked at the end of each iteration. With that we can assume that all
    // instructions in the loop are executed the same number of times.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
      emitAnalysis(
          LoopAccessReport() <<
          "loop control flow is not understood by analyzer");
      return false;
    }
  
-  // We need to have a loop header.
-  DEBUG(dbgs() << "LAA: Found a loop: " <<
-        TheLoop->getHeader()->getName() << '\n');
-
    // ScalarEvolution needs to be able to find the exit count.
    const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
    if (ExitCount == SE->getCouldNotCompute()) {
@@ -904,7 +1254,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
    return true;
  }
  
-void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
+void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
  
    typedef SmallVector<Value*, 16> ValueVector;
    typedef SmallPtrSet<Value*, 16> ValueSet;
@@ -921,7 +1271,6 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
    PtrRtCheck.Need = false;
  
    const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-  MemoryDepChecker DepChecker(SE, DL, TheLoop);
  
    // For each block.
    for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -942,6 +1291,12 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
          if (Call && getIntrinsicIDForCall(Call, TLI))
            continue;
  
+        // If the function has an explicit vectorized counterpart, we can safely
+        // assume that it can be vectorized.
+        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
+            TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
+          continue;
+
          LoadInst *Ld = dyn_cast<LoadInst>(it);
          if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
            emitAnalysis(LoopAccessReport(Ld)
@@ -990,8 +1345,9 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
      return;
    }
  
-  AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, AA, DependentAccesses);
+  MemoryDepChecker::DepCandidates DependentAccesses;
+  AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
+                          AA, LI, DependentAccesses);
  
    // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
    // multiple times on the same object. If the ptr is accessed twice, once
@@ -1004,22 +1360,14 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
    for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
      StoreInst *ST = cast<StoreInst>(*I);
      Value* Ptr = ST->getPointerOperand();
-
-    if (isUniform(Ptr)) {
-      emitAnalysis(
-          LoopAccessReport(ST)
-          << "write to a loop invariant address could not be vectorized");
-      DEBUG(dbgs() << "LAA: We don't allow storing to uniform addresses\n");
-      CanVecMem = false;
-      return;
-    }
-
+    // Check for store to loop invariant address.
+    StoreToLoopInvariantAddress |= isUniform(Ptr);
      // If we did *not* see this pointer before, insert it to  the read-write
      // list. At this phase it is only a 'write' list.
      if (Seen.insert(Ptr).second) {
        ++NumReadWrites;
  
-      AliasAnalysis::Location Loc = AA->getLocation(ST);
+      MemoryLocation Loc = MemoryLocation::get(ST);
        // The TBAA metadata could have a control dependency on the predication
        // condition, so we cannot rely on it when determining whether or not we
        // need runtime pointer checks.
@@ -1050,13 +1398,12 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
      // read a few words, modify, and write a few words, and some of the
      // words may be written to the same address.
      bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr).second ||
-        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+    if (Seen.insert(Ptr).second || !isStridedPtr(SE, Ptr, TheLoop, Strides)) {
        ++NumReads;
        IsReadOnlyPtr = true;
      }
  
-    AliasAnalysis::Location Loc = AA->getLocation(LD);
+    MemoryLocation Loc = MemoryLocation::get(LD);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
@@ -1077,37 +1424,22 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
    // Build dependence sets and check whether we need a runtime pointer bounds
    // check.
    Accesses.buildDependenceSets();
-  bool NeedRTCheck = Accesses.isRTCheckNeeded();
  
    // Find pointers with computable bounds. We are going to use this information
    // to place a runtime bound check.
-  unsigned NumComparisons = 0;
-  bool CanDoRT = false;
-  if (NeedRTCheck)
-    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
-                                       Strides);
-
-  DEBUG(dbgs() << "LAA: We need to do " << NumComparisons <<
-        " pointer comparisons.\n");
-
-  // If we only have one set of dependences to check pointers among we don't
-  // need a runtime check.
-  if (NumComparisons == 0 && NeedRTCheck)
-    NeedRTCheck = false;
-
-  // Check that we did not collect too many pointers or found an unsizeable
-  // pointer.
-  if (!CanDoRT ||
-      NumComparisons > VectorizerParams::RuntimeMemoryCheckThreshold) {
-    PtrRtCheck.reset();
-    CanDoRT = false;
-  }
+  bool NeedRTCheck;
+  bool CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck,
+                                          NeedRTCheck, SE,
+                                          TheLoop, Strides);
  
-  if (CanDoRT) {
-    DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
-  }
+  DEBUG(dbgs() << "LAA: We need to do "
+               << PtrRtCheck.getNumberOfChecks(nullptr)
+               << " pointer comparisons.\n");
  
-  if (NeedRTCheck && !CanDoRT) {
+  // Check that we found the bounds for the pointer.
+  if (CanDoRT)
+    DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+  else if (NeedRTCheck) {
      emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
      DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " <<
            "the array bounds.\n");
@@ -1130,25 +1462,18 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
        NeedRTCheck = true;
  
        // Clear the dependency checks. We assume they are not needed.
-      Accesses.resetDepChecks();
+      Accesses.resetDepChecks(DepChecker);
  
        PtrRtCheck.reset();
        PtrRtCheck.Need = true;
  
-      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NeedRTCheck, SE,
                                           TheLoop, Strides, true);
-      // Check that we did not collect too many pointers or found an unsizeable
-      // pointer.
-      if (!CanDoRT ||
-          NumComparisons > VectorizerParams::RuntimeMemoryCheckThreshold) {
-        if (!CanDoRT && NumComparisons > 0)
-          emitAnalysis(LoopAccessReport()
-                       << "cannot check memory dependencies at runtime");
-        else
-          emitAnalysis(LoopAccessReport()
-                       << NumComparisons << " exceeds limit of "
-                       << VectorizerParams::RuntimeMemoryCheckThreshold
-                       << " dependent memory operations checked at runtime");
+
+      // Check that we found the bounds for the pointer.
+      if (NeedRTCheck && !CanDoRT) {
+        emitAnalysis(LoopAccessReport()
+                     << "cannot check memory dependencies at runtime");
          DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
          PtrRtCheck.reset();
          CanVecMem = false;
@@ -1159,12 +1484,15 @@ void LoopAccessInfo::analyzeLoop(ValueToValueMap &Strides) {
      }
    }
  
-  if (!CanVecMem)
+  if (CanVecMem)
+    DEBUG(dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
+                 << (NeedRTCheck ? "" : " don't")
+                 << " need a runtime memory check.\n");
+  else {
      emitAnalysis(LoopAccessReport() <<
                   "unsafe dependent memory operations in loop");
-
-  DEBUG(dbgs() << "LAA: We" << (NeedRTCheck ? "" : " don't") <<
-        " need a runtime memory check.\n");
+    DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
+  }
  }
  
  bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
@@ -1181,7 +1509,7 @@ void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) {
    Report = Message;
  }
  
-bool LoopAccessInfo::isUniform(Value *V) {
+bool LoopAccessInfo::isUniform(Value *V) const {
    return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
  }
  
@@ -1196,38 +1524,40 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
    return nullptr;
  }
  
-std::pair<Instruction *, Instruction *>
-LoopAccessInfo::addRuntimeCheck(Instruction *Loc) {
-  Instruction *tnullptr = nullptr;
+std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck(
+    Instruction *Loc, const SmallVectorImpl<int> *PtrPartition) const {
    if (!PtrRtCheck.Need)
-    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
+    return std::make_pair(nullptr, nullptr);
  
-  unsigned NumPointers = PtrRtCheck.Pointers.size();
-  SmallVector<TrackingVH<Value> , 2> Starts;
-  SmallVector<TrackingVH<Value> , 2> Ends;
+  SmallVector<TrackingVH<Value>, 2> Starts;
+  SmallVector<TrackingVH<Value>, 2> Ends;
  
    LLVMContext &Ctx = Loc->getContext();
-  SCEVExpander Exp(*SE, "induction");
+  SCEVExpander Exp(*SE, DL, "induction");
    Instruction *FirstInst = nullptr;
  
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    Value *Ptr = PtrRtCheck.Pointers[i];
+  for (unsigned i = 0; i < PtrRtCheck.CheckingGroups.size(); ++i) {
+    const RuntimePointerCheck::CheckingPtrGroup &CG =
+        PtrRtCheck.CheckingGroups[i];
+    Value *Ptr = PtrRtCheck.Pointers[CG.Members[0]];
      const SCEV *Sc = SE->getSCEV(Ptr);
  
      if (SE->isLoopInvariant(Sc, TheLoop)) {
-      DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" <<
-            *Ptr <<"\n");
+      DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr
+                   << "\n");
        Starts.push_back(Ptr);
        Ends.push_back(Ptr);
      } else {
-      DEBUG(dbgs() << "LAA: Adding RT check for range:" << *Ptr << '\n');
        unsigned AS = Ptr->getType()->getPointerAddressSpace();
  
        // Use this type for pointer arithmetic.
        Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+      Value *Start = nullptr, *End = nullptr;
  
-      Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc);
-      Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc);
+      DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+      Start = Exp.expandCodeFor(CG.Low, PtrArithTy, Loc);
+      End = Exp.expandCodeFor(CG.High, PtrArithTy, Loc);
+      DEBUG(dbgs() << "Start: " << *CG.Low << " End: " << *CG.High << "\n");
        Starts.push_back(Start);
        Ends.push_back(End);
      }
@@ -1236,9 +1566,14 @@ LoopAccessInfo::addRuntimeCheck(Instruction *Loc) {
    IRBuilder<> ChkBuilder(Loc);
    // Our instructions might fold to a constant.
    Value *MemoryRuntimeCheck = nullptr;
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i+1; j < NumPointers; ++j) {
-      if (!PtrRtCheck.needsChecking(i, j))
+  for (unsigned i = 0; i < PtrRtCheck.CheckingGroups.size(); ++i) {
+    for (unsigned j = i + 1; j < PtrRtCheck.CheckingGroups.size(); ++j) {
+      const RuntimePointerCheck::CheckingPtrGroup &CGI =
+          PtrRtCheck.CheckingGroups[i];
+      const RuntimePointerCheck::CheckingPtrGroup &CGJ =
+          PtrRtCheck.CheckingGroups[j];
+
+      if (!PtrRtCheck.needsChecking(CGI, CGJ, PtrPartition))
          continue;
  
        unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
@@ -1271,6 +1606,9 @@ LoopAccessInfo::addRuntimeCheck(Instruction *Loc) {
      }
    }
  
+  if (!MemoryRuntimeCheck)
+    return std::make_pair(nullptr, nullptr);
+
    // We have to do this trickery because the IRBuilder might fold the check to a
    // constant expression in which case there is no Instruction anchored in a
    // the block.
@@ -1282,16 +1620,49 @@ LoopAccessInfo::addRuntimeCheck(Instruction *Loc) {
  }
  
  LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
-                               const DataLayout *DL,
+                               const DataLayout &DL,
                                 const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                               DominatorTree *DT, ValueToValueMap &Strides)
-    : TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), NumLoads(0),
-      NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false) {
+                               DominatorTree *DT, LoopInfo *LI,
+                               const ValueToValueMap &Strides)
+    : PtrRtCheck(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), TLI(TLI),
+      AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
+      MaxSafeDepDistBytes(-1U), CanVecMem(false),
+      StoreToLoopInvariantAddress(false) {
    if (canAnalyzeLoop())
      analyzeLoop(Strides);
  }
  
-LoopAccessInfo &LoopAccessAnalysis::getInfo(Loop *L, ValueToValueMap &Strides) {
+void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
+  if (CanVecMem && PtrRtCheck.Need)
+    OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
+  else if (CanVecMem)
+    OS.indent(Depth) << "Memory dependences are safe\n";
+
+  if (Report)
+    OS.indent(Depth) << "Report: " << Report->str() << "\n";
+
+  // A null list means too many dependences were seen to keep recording them.
+  auto *Deps = DepChecker.getInterestingDependences();
+  if (!Deps)
+    OS.indent(Depth) << "Too many interesting dependences, not recorded\n";
+  else {
+    OS.indent(Depth) << "Interesting Dependences:\n";
+    for (auto &Dep : *Deps) {
+      Dep.print(OS, Depth + 2, DepChecker.getMemoryInstructions());
+      OS << "\n";
+    }
+  }
+
+  // List the pairs of accesses needing run-time checks to prove independence.
+  PtrRtCheck.print(OS, Depth);
+  OS << "\n";
+
+  OS.indent(Depth) << "Store to invariant address was "
+      << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n";
+}
+
+const LoopAccessInfo &
+LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) {
    auto &LAI = LoopAccessInfoMap[L];
  
  #ifndef NDEBUG
@@ -1300,7 +1671,9 @@ LoopAccessInfo &LoopAccessAnalysis::getInfo(Loop *L, ValueToValueMap &Strides) {
  #endif
  
    if (!LAI) {
-    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, Strides);
+    const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI,
+                                            Strides);
  #ifndef NDEBUG
      LAI->NumSymbolicStrides = Strides.size();
  #endif
@@ -1308,13 +1681,26 @@ LoopAccessInfo &LoopAccessAnalysis::getInfo(Loop *L, ValueToValueMap &Strides) {
    return *LAI.get();
  }
  
+void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const {
+  // getInfo() is non-const because it fills the per-loop cache on demand.
+  auto *MutableThis = const_cast<LoopAccessAnalysis *>(this);
+  ValueToValueMap EmptyStrides;
+
+  // Visit every loop of every nest and print its info under the header name.
+  for (Loop *Outer : *LI)
+    for (Loop *CurLoop : depth_first(Outer)) {
+      OS.indent(2) << CurLoop->getHeader()->getName() << ":\n";
+      MutableThis->getInfo(CurLoop, EmptyStrides).print(OS, 4);
+    }
+}
+
  bool LoopAccessAnalysis::runOnFunction(Function &F) {
    SE = &getAnalysis<ScalarEvolution>();
-  DL = F.getParent()->getDataLayout();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AliasAnalysis>();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  
    return false;
  }
@@ -1323,6 +1709,7 @@ void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<ScalarEvolution>();
      AU.addRequired<AliasAnalysis>();
      AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
  
      AU.setPreservesAll();
  }
@@ -1335,6 +1722,7 @@ INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
  INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
  INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
  
  namespace llvm {