From bd67564ff236b5213324980894c36eda6e3dd317 Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet@apple.com>
Date: Tue, 3 Nov 2015 23:50:08 +0000
Subject: [PATCH] LLE 6/6: Add LoopLoadElimination pass

Summary:
The goal of this pass is to perform store-to-load forwarding across the
backedge of a loop.  E.g.:

  for (i)
     A[i + 1] = A[i] + B[i]

  =>

  T = A[0]
  for (i)
     T = T + B[i]
     A[i + 1] = T

The pass relies on loop dependence analysis via LoopAccessAnalisys to
find opportunities of loop-carried dependences with a distance of one
between a store and a load.  Since it's using LoopAccessAnalysis, it was
easy to also add support for versioning away may-aliasing intervening
stores that would otherwise prevent this transformation.

This optimization is also performed by Load-PRE in GVN without the
option of multi-versioning.  As was discussed with Daniel Berlin in
http://reviews.llvm.org/D9548, this is inferior to a more loop-aware
solution applied here.  Hopefully, we will be able to remove some
complexity from GVN/MemorySSA as a consequence.

In the long run, we may want to extend this pass (or create a new one if
there is little overlap) to also eliminate loop-indepedent redundant
loads and store that *require* versioning due to may-aliasing
intervening stores/loads.  I have some motivating cases for store
elimination. My plan right now is to wait for MemorySSA to come online
first rather than using memdep for this.

The main motiviation for this pass is the 456.hmmer loop in SPECint2006
where after distributing the original loop and vectorizing the top part,
we are left with the critical path exposed in the bottom loop.  Being
able to promote the memory dependence into a register depedence (even
though the HW does perform store-to-load fowarding as well) results in a
major gain (~20%).  This gain also transfers over to x86: it's
around 8-10%.

Right now the pass is off by default and can be enabled
with -enable-loop-load-elim.  On the LNT testsuite, there are two
performance changes (negative number -> improvement):

  1. -28% in Polybench/linear-algebra/solvers/dynprog: the length of the
     critical paths is reduced
  2. +2% in Polybench/stencils/adi: Unfortunately, I couldn't reproduce this
     outside of LNT

The pass is scheduled after the loop vectorizer (which is after loop
distribution).  The rational is to try to reuse LAA state, rather than
recomputing it.  The order between LV and LLE is not critical because
normally LV does not touch scalar st->ld forwarding cases where
vectorizing would inhibit the CPU's st->ld forwarding to kick in.

LoopLoadElimination requires LAA to provide the full set of dependences
(including forward dependences).  LAA is known to omit loop-independent
dependences in certain situations.  The big comment before
removeDependencesFromMultipleStores explains why this should not occur
for the cases that we're interested in.

Reviewers: dberlin, hfinkel

Subscribers: junbuml, dberlin, mssimpso, rengolin, sanjoy, llvm-commits

Differential Revision: http://reviews.llvm.org/D13259

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252017 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/LoopAccessAnalysis.h    |  16 +
 include/llvm/InitializePasses.h               |   1 +
 include/llvm/Transforms/Scalar.h              |   6 +
 lib/Transforms/IPO/PassManagerBuilder.cpp     |  10 +
 lib/Transforms/Scalar/CMakeLists.txt          |   1 +
 lib/Transforms/Scalar/LoopLoadElimination.cpp | 554 ++++++++++++++++++
 lib/Transforms/Scalar/Scalar.cpp              |   1 +
 test/Transforms/LoopLoadElim/backward.ll      |  32 +
 .../LoopLoadElim/def-store-before-load.ll     |  35 ++
 test/Transforms/LoopLoadElim/forward.ll       |  47 ++
 test/Transforms/LoopLoadElim/memcheck.ll      |  52 ++
 .../multiple-stores-same-block.ll             |  48 ++
 test/Transforms/LoopLoadElim/unknown-dep.ll   |  54 ++
 13 files changed, 857 insertions(+)
 create mode 100644 lib/Transforms/Scalar/LoopLoadElimination.cpp
 create mode 100644 test/Transforms/LoopLoadElim/backward.ll
 create mode 100644 test/Transforms/LoopLoadElim/def-store-before-load.ll
 create mode 100644 test/Transforms/LoopLoadElim/forward.ll
 create mode 100644 test/Transforms/LoopLoadElim/memcheck.ll
 create mode 100644 test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
 create mode 100644 test/Transforms/LoopLoadElim/unknown-dep.ll

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index f864fd1de30..909429415ab 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -250,6 +250,17 @@ public:
     return InstMap;
   }
 
+  /// \brief Generate a mapping between the memory instructions and their
+  /// indices according to program order.
+  DenseMap<Instruction *, unsigned> generateInstructionOrderMap() const {
+    DenseMap<Instruction *, unsigned> OrderMap;
+
+    for (unsigned I = 0; I < InstMap.size(); ++I)
+      OrderMap[InstMap[I]] = I;
+
+    return OrderMap;
+  }
+
   /// \brief Find the set of instructions that read or write via \p Ptr.
   SmallVector<Instruction *, 4> getInstructionsForAccess(Value *Ptr,
                                                          bool isWrite) const;
@@ -453,6 +464,11 @@ public:
   /// index \p I and \p J to prove their independence.
   bool needsChecking(unsigned I, unsigned J) const;
 
+  /// \brief Return PointerInfo for pointer at index \p PtrIdx.
+  const PointerInfo &getPointerInfo(unsigned PtrIdx) const {
+    return Pointers[PtrIdx];
+  }
+
 private:
   /// \brief Groups pointers such that a single memcheck is required
   /// between two different groups. This will clear the CheckingGroups vector
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 92127fa3301..cdd2ba235e4 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -301,6 +301,7 @@ void initializeLoopDistributePass(PassRegistry&);
 void initializeSjLjEHPreparePass(PassRegistry&);
 void initializeDemandedBitsPass(PassRegistry&);
 void initializeFuncletLayoutPass(PassRegistry &);
+void initializeLoopLoadEliminationPass(PassRegistry&);
 }
 
 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 5c703966eb1..9173de1112f 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -480,6 +480,12 @@ FunctionPass *createNaryReassociatePass();
 //
 FunctionPass *createLoopDistributePass();
 
+//===----------------------------------------------------------------------===//
+//
+// LoopLoadElimination - Perform loop-aware load elimination.
+//
+FunctionPass *createLoopLoadEliminationPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index f7b8ef38223..6e0d23961ca 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -99,6 +99,10 @@ static cl::opt<bool> EnableNonLTOGlobalsModRef(
     cl::desc(
         "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline."));
 
+static cl::opt<bool> EnableLoopLoadElim(
+    "enable-loop-load-elim", cl::init(false), cl::Hidden,
+    cl::desc("Enable the new, experimental LoopLoadElimination Pass"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -378,6 +382,12 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLoopDistributePass());
 
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
+
+  // Eliminate loads by forwarding stores from the previous iteration to loads
+  // of the current iteration.
+  if (EnableLoopLoadElim)
+    MPM.add(createLoopLoadEliminationPass());
+
   // FIXME: Because of #pragma vectorize enable, the passes below are always
   // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
   // on -O1 and no #pragma is found). Would be good to have these two passes
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 1bbbb0317de..a0ddbd08520 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_library(LLVMScalarOpts
   LoopIdiomRecognize.cpp
   LoopInstSimplify.cpp
   LoopInterchange.cpp
+  LoopLoadElimination.cpp
   LoopRerollPass.cpp
   LoopRotation.cpp
   LoopStrengthReduce.cpp
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 00000000000..3dc9f84387e
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,554 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implement a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads.  These form the candidates for the
+// transformation.  The source value of each store then propagated to the user
+// of the corresponding load.  This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <forward_list>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+    "runtime-check-per-loop-load-elim", cl::Hidden,
+    cl::desc("Max number of memchecks allowed per eliminated load on average"),
+    cl::init(1));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+  LoadInst *Load;
+  StoreInst *Store;
+
+  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+      : Load(Load), Store(Store) {}
+
+  /// \brief Return true if the dependence from the store to the load has a
+  /// distance of one.  E.g. A[i+1] = A[i]
+  bool isDependenceDistanceOfOne(ScalarEvolution *SE) const {
+    Value *LoadPtr = Load->getPointerOperand();
+    Value *StorePtr = Store->getPointerOperand();
+    Type *LoadPtrType = LoadPtr->getType();
+    Type *StorePtrType = StorePtr->getType();
+    Type *LoadType = LoadPtrType->getPointerElementType();
+
+    assert(LoadPtrType->getPointerAddressSpace() ==
+               StorePtrType->getPointerAddressSpace() &&
+           LoadType == StorePtrType->getPointerElementType() &&
+           "Should be a known dependence");
+
+    auto &DL = Load->getParent()->getModule()->getDataLayout();
+    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
+    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+
+    // We don't need to check non-wrapping here because forward/backward
+    // dependence wouldn't be valid if these weren't monotonic accesses.
+    auto *Dist =
+        cast<SCEVConstant>(SE->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    const APInt &Val = Dist->getValue()->getValue();
+    return Val.abs() == TypeByteSize;
+  }
+
+  Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const StoreToLoadForwardingCandidate &Cand) {
+    OS << *Cand.Store << " -->\n";
+    OS.indent(2) << *Cand.Load << "\n";
+    return OS;
+  }
+#endif
+};
+
+/// \brief Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+                                  DominatorTree *DT) {
+  SmallVector<BasicBlock *, 8> Latches;
+  L->getLoopLatches(Latches);
+  return std::all_of(Latches.begin(), Latches.end(),
+                     [&](const BasicBlock *Latch) {
+                       return DT->dominates(StoreBlock, Latch);
+                     });
+}
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+  LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+                         DominatorTree *DT, ScalarEvolution *SE)
+      : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {}
+
+  /// \brief Look through the loop-carried and loop-independent dependences in
+  /// this loop and find store->load dependences.
+  ///
+  /// Note that no candidate is returned if LAA has failed to analyze the loop
+  /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+  std::forward_list<StoreToLoadForwardingCandidate>
+  findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+    std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+    const auto *Deps = LAI.getDepChecker().getDependences();
+    if (!Deps)
+      return Candidates;
+
+    // Find store->load dependences (consequently true dep).  Both lexically
+    // forward and backward dependences qualify.  Disqualify loads that have
+    // other unknown dependences.
+
+    SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+    for (const auto &Dep : *Deps) {
+      Instruction *Source = Dep.getSource(LAI);
+      Instruction *Destination = Dep.getDestination(LAI);
+
+      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+        if (isa<LoadInst>(Source))
+          LoadsWithUnknownDepedence.insert(Source);
+        if (isa<LoadInst>(Destination))
+          LoadsWithUnknownDepedence.insert(Destination);
+        continue;
+      }
+
+      if (Dep.isBackward())
+        // Note that the designations source and destination follow the program
+        // order, i.e. source is always first.  (The direction is given by the
+        // DepType.)
+        std::swap(Source, Destination);
+      else
+        assert(Dep.isForward() && "Needs to be a forward dependence");
+
+      auto *Store = dyn_cast<StoreInst>(Source);
+      if (!Store)
+        continue;
+      auto *Load = dyn_cast<LoadInst>(Destination);
+      if (!Load)
+        continue;
+      Candidates.emplace_front(Load, Store);
+    }
+
+    if (!LoadsWithUnknownDepedence.empty())
+      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+        return LoadsWithUnknownDepedence.count(C.Load);
+      });
+
+    return Candidates;
+  }
+
+  /// \brief Return the index of the instruction according to program order.
+  unsigned getInstrIndex(Instruction *Inst) {
+    auto I = InstOrder.find(Inst);
+    assert(I != InstOrder.end() && "No index for instruction");
+    return I->second;
+  }
+
+  /// \brief If a load has multiple candidates associated (i.e. different
+  /// stores), it means that it could be forwarding from multiple stores
+  /// depending on control flow.  Remove these candidates.
+  ///
+  /// Here, we rely on LAA to include the relevant loop-independent dependences.
+  /// LAA is known to omit these in the very simple case when the read and the
+  /// write within an alias set always takes place using the *same* pointer.
+  ///
+  /// However, we know that this is not the case here, i.e. we can rely on LAA
+  /// to provide us with loop-independent dependences for the cases we're
+  /// interested.  Consider the case for example where a loop-independent
+  /// dependece S1->S2 invalidates the forwarding S3->S2.
+  ///
+  ///         A[i]   = ...   (S1)
+  ///         ...    = A[i]  (S2)
+  ///         A[i+1] = ...   (S3)
+  ///
+  /// LAA will perform dependence analysis here because there are two
+  /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+  void removeDependencesFromMultipleStores(
+      std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+    // If Store is nullptr it means that we have multiple stores forwarding to
+    // this store.
+    typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+        LoadToSingleCandT;
+    LoadToSingleCandT LoadToSingleCand;
+
+    for (const auto &Cand : Candidates) {
+      bool NewElt;
+      LoadToSingleCandT::iterator Iter;
+
+      std::tie(Iter, NewElt) =
+          LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+      if (!NewElt) {
+        const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+        // Already multiple stores forward to this load.
+        if (OtherCand == nullptr)
+          continue;
+
+        // Handle the very basic of case when the two stores are in the same
+        // block so deciding which one forwards is easy.  The later one forwards
+        // as long as they both have a dependence distance of one to the load.
+        if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+            Cand.isDependenceDistanceOfOne(SE) &&
+            OtherCand->isDependenceDistanceOfOne(SE)) {
+          // They are in the same block, the later one will forward to the load.
+          if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+            OtherCand = &Cand;
+        } else
+          OtherCand = nullptr;
+      }
+    }
+
+    Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+      if (LoadToSingleCand[Cand.Load] != &Cand) {
+        DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+                     << "  The load may have multiple stores forwarding to "
+                     << "it\n");
+        return true;
+      }
+      return false;
+    });
+  }
+
+  /// \brief Given two pointers operations by their RuntimePointerChecking
+  /// indices, return true if they require an alias check.
+  ///
+  /// We need a check if one is a pointer for a candidate load and the other is
+  /// a pointer for a possibly intervening store.
+  bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+                     const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+                     const std::set<Value *> &CandLoadPtrs) {
+    Value *Ptr1 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+    Value *Ptr2 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+    return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+            (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+  }
+
+  /// \brief Return pointers that are possibly written to on the path from a
+  /// forwarding store to a load.
+  ///
+  /// These pointers need to be alias-checked against the forwarding candidates.
+  SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+    // From FirstStore to LastLoad neither of the elimination candidate loads
+    // should overlap with any of the stores.
+    //
+    // E.g.:
+    //
+    // st1 C[i]
+    // ld1 B[i] <-------,
+    // ld0 A[i] <----,  |              * LastLoad
+    // ...           |  |
+    // st2 E[i]      |  |
+    // st3 B[i+1] -- | -'              * FirstStore
+    // st0 A[i+1] ---'
+    // st4 D[i]
+    //
+    // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+    // ld0.
+
+    LoadInst *LastLoad =
+        std::max_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+                         })
+            ->Load;
+    StoreInst *FirstStore =
+        std::min_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Store) <
+                                  getInstrIndex(B.Store);
+                         })
+            ->Store;
+
+    // We're looking for stores after the first forwarding store until the end
+    // of the loop, then from the beginning of the loop until the last
+    // forwarded-to load.  Collect the pointer for the stores.
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+    auto InsertStorePtr = [&](Instruction *I) {
+      if (auto *S = dyn_cast<StoreInst>(I))
+        PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+    };
+    const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+    std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+                  MemInstrs.end(), InsertStorePtr);
+    std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+                  InsertStorePtr);
+
+    return PtrsWrittenOnFwdingPath;
+  }
+
+  /// \brief Determine the pointer alias checks to prove that there are no
+  /// intervening stores.
+  SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+        findPointersWrittenOnForwardingPath(Candidates);
+
+    // Collect the pointers of the candidate loads.
+    // FIXME: SmallSet does not work with std::inserter.
+    std::set<Value *> CandLoadPtrs;
+    std::transform(Candidates.begin(), Candidates.end(),
+                   std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+                   std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+    const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+                 [&](const RuntimePointerChecking::PointerCheck &Check) {
+                   for (auto PtrIdx1 : Check.first->Members)
+                     for (auto PtrIdx2 : Check.second->Members)
+                       if (needsChecking(PtrIdx1, PtrIdx2,
+                                         PtrsWrittenOnFwdingPath, CandLoadPtrs))
+                         return true;
+                   return false;
+                 });
+
+    DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+    DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+    return Checks;
+  }
+
+  /// \brief Perform the transformation for a candidate.
+  void
+  propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+                                  SCEVExpander &SEE) {
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    Value *Ptr = Cand.Load->getPointerOperand();
+    auto *PtrSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+    auto *PH = L->getLoopPreheader();
+    Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+                                          PH->getTerminator());
+    Value *Initial =
+        new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+    PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+                                   L->getHeader()->begin());
+    PHI->addIncoming(Initial, PH);
+    PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+    Cand.Load->replaceAllUsesWith(PHI);
+  }
+
+  /// \brief Top-level driver for each loop: find store->load forwarding
+  /// candidates, add run-time checks and perform transformation.
+  bool processLoop() {
+    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+    // Look for store-to-load forwarding cases across the
+    // backedge. E.g.:
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    // First start with store->load dependences.
+    auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Generate an index for each load and store according to the original
+    // program order.  This will be used later.
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+    // To keep things simple for now, remove those where the load is potentially
+    // fed by multiple stores.
+    removeDependencesFromMultipleStores(StoreToLoadDependences);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Filter the candidates further.
+    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+    unsigned NumForwarding = 0;
+    for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
+      DEBUG(dbgs() << "Candidate " << Cand);
+      // Make sure that the stored values is available everywhere in the loop in
+      // the next iteration.
+      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+        continue;
+
+      // Check whether the SCEV difference is the same as the induction step,
+      // thus we load the value in the next iteration.
+      if (!Cand.isDependenceDistanceOfOne(SE))
+        continue;
+
+      ++NumForwarding;
+      DEBUG(dbgs()
+            << NumForwarding
+            << ". Valid store-to-load forwarding across the loop backedge\n");
+      Candidates.push_back(Cand);
+    }
+    if (Candidates.empty())
+      return false;
+
+    // Check intervening may-alias stores.  These need runtime checks for alias
+    // disambiguation.
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+        collectMemchecks(Candidates);
+
+    // Too many checks are likely to outweigh the benefits of forwarding.
+    if (Checks.size() > Candidates.size() * CheckPerElim) {
+      DEBUG(dbgs() << "Too many run-time checks needed.\n");
+      return false;
+    }
+
+    // Point of no-return, start the transformation.  First, version the loop if
+    // necessary.
+    if (!Checks.empty()) {
+      LoopVersioning LV(std::move(Checks), LAI, L, LI, DT);
+      LV.versionLoop();
+    }
+
+    // Next, propagate the value stored by the store to the users of the load.
+    // Also for the first iteration, generate the initial value of the load.
+    SCEVExpander SEE(*SE, L->getHeader()->getModule()->getDataLayout(),
+                     "storeforward");
+    for (const auto &Cand : Candidates)
+      propagateStoredValueToLoadUsers(Cand, SEE);
+    NumLoopLoadEliminted += NumForwarding;
+
+    return true;
+  }
+
+private:
+  Loop *L;
+
+  /// \brief Maps the load/store instructions to their index according to
+  /// program order.
+  DenseMap<Instruction *, unsigned> InstOrder;
+
+  // Analyses used.
+  LoopInfo *LI;
+  const LoopAccessInfo &LAI;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+};
+
+/// \brief The pass.  Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+  LoopLoadElimination() : FunctionPass(ID) {
+    initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+    // Build up a worklist of inner-loops to vectorize. This is necessary as the
+    // act of distributing a loop creates new loops and can invalidate iterators
+    // across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist) {
+      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+      // The actual work is performed by LoadEliminationForLoop.
+      LoadEliminationForLoop LEL(L, LI, LAI, DT, SE);
+      Changed |= LEL.processLoop();
+    }
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  static char ID;
+};
+}
+
+char LoopLoadElimination::ID;
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopLoadEliminationPass() {
+  return new LoopLoadElimination();
+}
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index fd74fa84226..52d477cc957 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -83,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializePlaceSafepointsPass(Registry);
   initializeFloat2IntPass(Registry);
   initializeLoopDistributePass(Registry);
+  initializeLoopLoadEliminationPass(Registry);
 }
 
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
diff --git a/test/Transforms/LoopLoadElim/backward.ll b/test/Transforms/LoopLoadElim/backward.ll
new file mode 100644
index 00000000000..7c750a51a2a
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/backward.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+; Simple st->ld forwarding derived from a lexical backward dep.
+;
+;   for (unsigned i = 0; i < 100; i++)
+;     A[i+1] = A[i] + B[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N) {
+entry:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+; CHECK: %add = add i32 %load_1, %store_forwarded
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopLoadElim/def-store-before-load.ll b/test/Transforms/LoopLoadElim/def-store-before-load.ll
new file mode 100644
index 00000000000..3dc93f6786e
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/def-store-before-load.ll
@@ -0,0 +1,35 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+;  No loop-carried forwarding: The intervening store to A[i] kills the stored
+;  value from the previous iteration.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i] = 1;
+;     A[i+1] = A[i] + B[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 1, i32* %arrayidx, align 4
+  %a = load i32, i32* %arrayidx, align 4
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %b = load i32, i32* %arrayidxB, align 4
+; CHECK: %add = add i32 %b, %a
+  %add = add i32 %b, %a
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopLoadElim/forward.ll b/test/Transforms/LoopLoadElim/forward.ll
new file mode 100644
index 00000000000..1a77297a064
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/forward.ll
@@ -0,0 +1,47 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+
+; Simple st->ld forwarding derived from a lexical forwrad dep.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     C[i] = A[i] * 2;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32* %C, i64 %N) {
+
+; CHECK:   for.body.lver.memcheck:
+; CHECK:     %found.conflict{{.*}} =
+; CHECK-NOT: %found.conflict{{.*}} =
+
+entry:
+; for.body.ph:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %for.body.ph ], [ %a_p1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK: %c = mul i32 %store_forwarded, 2
+  %c = mul i32 %a, 2
+  store i32 %c, i32* %Cidx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopLoadElim/memcheck.ll b/test/Transforms/LoopLoadElim/memcheck.ll
new file mode 100644
index 00000000000..ebb52825754
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/memcheck.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-load-elim -S < %s | FileCheck %s
+; RUN: opt -loop-load-elim -S -runtime-check-per-loop-load-elim=2 < %s | FileCheck %s --check-prefix=AGGRESSIVE
+
+; This needs two pairs of memchecks (A * { C, D }) for a single load
+; elimination which is considered to expansive by default.
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     C[i] = A[i] * 2;
+;     D[i] = 2;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32*  %A, i32*  %B, i32*  %C, i64 %N, i32* %D) {
+entry:
+  br label %for.body
+
+; AGGRESSIVE: for.body.lver.memcheck:
+; AGGRESSIVE: %found.conflict{{.*}} =
+; AGGRESSIVE: %found.conflict{{.*}} =
+; AGGRESSIVE-NOT: %found.conflict{{.*}} =
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded =
+; AGGRESSIVE: %store_forwarded =
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %Didx = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK: %c = mul i32 %a, 2
+; AGGRESSIVE: %c = mul i32 %store_forwarded, 2
+  %c = mul i32 %a, 2
+  store i32 %c, i32* %Cidx, align 4
+  store i32 2, i32* %Didx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll b/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
new file mode 100644
index 00000000000..b0c0f3dee86
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/multiple-stores-same-block.ll
@@ -0,0 +1,48 @@
+; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
+
+; In this case the later store forward to the load:
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     B[i] = A[i] + 1;
+;     A[i+1] = C[i] + 2;
+;     A[i+1] = D[i] + 3;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B,
+               i32* noalias nocapture %C, i32* noalias nocapture readonly %D,
+               i64 %N) {
+entry:
+; CHECK: %load_initial = load i32, i32* %A
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %addD, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %loadA = load i32, i32* %arrayidxA, align 4
+; CHECK: %addA = add i32 %store_forwarded, 1
+  %addA = add i32 %loadA, 1
+
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %addA, i32* %arrayidxB, align 4
+
+  %arrayidxC = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %loadC = load i32, i32* %arrayidxC, align 4
+  %addC = add i32 %loadC, 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidxA_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %addC, i32* %arrayidxA_next, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+  %loadD = load i32, i32* %arrayidxD, align 4
+  %addD = add i32 %loadD, 3
+  store i32 %addD, i32* %arrayidxA_next, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopLoadElim/unknown-dep.ll b/test/Transforms/LoopLoadElim/unknown-dep.ll
new file mode 100644
index 00000000000..d2df718ca4c
--- /dev/null
+++ b/test/Transforms/LoopLoadElim/unknown-dep.ll
@@ -0,0 +1,54 @@
+; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Give up in the presence of unknown deps. Here, the different strides result
+; in unknown dependence:
+;
+;   for (unsigned i = 0; i < 100; i++) {
+;     A[i+1] = B[i] + 2;
+;     A[2*i] = C[i] + 2;
+;     D[i] = A[i] + 2;
+;   }
+
+define void @f(i32* noalias %A, i32* noalias %B, i32* noalias %C,
+               i32* noalias %D, i64 %N) {
+
+entry:
+; for.body.ph:
+; CHECK-NOT: %load_initial =
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK-NOT: %store_forwarded =
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Didx = getelementptr inbounds i32, i32* %D, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %indvars.m2 = mul nuw nsw i64 %indvars.iv, 2
+  %A2idx = getelementptr inbounds i32, i32* %A, i64 %indvars.m2
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+  %c = load i32, i32* %Cidx, align 4
+  %a_m2 = add i32 %c, 2
+  store i32 %a_m2, i32* %A2idx, align 4
+
+  %a = load i32, i32* %Aidx, align 4
+; CHECK-NOT: %d = add i32 %store_forwarded, 2
+; CHECK: %d = add i32 %a, 2
+  %d = add i32 %a, 2
+  store i32 %d, i32* %Didx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
-- 
2.34.1