From 6e3744f374463c523762668824fee930842c05ee Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 3 Aug 2015 20:32:27 +0000
Subject: [PATCH] [Unroll] Improve the brute force loop unroll estimate by
 propagating through PHI nodes across iterations.

This patch teaches the new advanced loop unrolling heuristics to propagate
constants into the loop from the preheader and around the backedge after
simulating each iteration. This lets us brute force solve simple recurrances
that aren't modeled effectively by SCEV. It also makes it more clear why we
need to process the loop in-order rather than bottom-up which might otherwise
make much more sense (for example, for DCE).

This came out of an attempt I'm making to develop a principled way to account
for dead code in the unroll estimation. When I implemented
a forward-propagating version of that it produced incorrect results due to
failing to propagate *cost* between loop iterations through the PHI nodes, and
it occured to me we really should at least propagate simplifications across
those edges, and it is quite easy thanks to the loop being in canonical and
LCSSA form.

Differential Revision: http://reviews.llvm.org/D11706

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243900 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopUnrollPass.cpp      | 46 +++++++++++++++++--
 .../full-unroll-heuristics-phi-prop.ll        | 23 ++++++++++
 2 files changed, 65 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 6b64d4e6ecf..f8aa64733db 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -137,6 +137,7 @@ namespace {
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfoWrapperPass>();
       AU.addPreserved<LoopInfoWrapperPass>();
       AU.addRequiredID(LoopSimplifyID);
@@ -235,6 +236,7 @@ char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -516,8 +518,8 @@ struct EstimatedUnrollCost {
 /// the analysis failed (no benefits expected from the unrolling, or the loop is
 /// too big to analyze), the returned value is None.
 Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
-                      const TargetTransformInfo &TTI,
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                       unsigned MaxUnrolledLoopSize) {
   // We want to be able to scale offsets by the trip count and add more offsets
   // to them without checking for overflows, and we already don't want to
@@ -532,6 +534,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
 
   SmallSetVector<BasicBlock *, 16> BBWorklist;
   DenseMap<Value *, Constant *> SimplifiedValues;
+  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
 
   // The estimated cost of the unrolled form of the loop. We try to estimate
   // this by simplifying as much as we can while computing the estimate.
@@ -543,6 +546,12 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
   // unrolling.
   unsigned RolledDynamicCost = 0;
 
+  // Ensure that we don't violate the loop structure invariants relied on by
+  // this analysis.
+  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+  assert(L->isLCSSAForm(DT) &&
+         "Must have loops in LCSSA form to track live-out values.");
+
   DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
 
   // Simulate execution of each iteration of the loop counting instructions,
@@ -551,7 +560,34 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
   // we literally have to go through all loop's iterations.
   for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
     DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+    // Prepare for the iteration by collecting any simplified entry or backedge
+    // inputs.
+    for (Instruction &I : *L->getHeader()) {
+      auto *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI)
+        break;
+
+      // The loop header PHI nodes must have exactly two input: one from the
+      // loop preheader and one from the loop latch.
+      assert(
+          PHI->getNumIncomingValues() == 2 &&
+          "Must have an incoming value only for the preheader and the latch.");
+
+      Value *V = PHI->getIncomingValueForBlock(
+          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+      Constant *C = dyn_cast<Constant>(V);
+      if (Iteration != 0 && !C)
+        C = SimplifiedValues.lookup(V);
+      if (C)
+        SimplifiedInputValues.push_back({PHI, C});
+    }
+
+    // Now clear and re-populate the map for the next iteration.
     SimplifiedValues.clear();
+    while (!SimplifiedInputValues.empty())
+      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
     UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE);
 
     BBWorklist.clear();
@@ -849,6 +885,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   Function &F = *L->getHeader()->getParent();
 
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
   const TargetTransformInfo &TTI =
@@ -930,8 +967,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
       // The loop isn't that small, but we still can fully unroll it if that
       // helps to remove a significant number of instructions.
       // To check that, run additional analysis on the loop.
-      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-              L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount))
+      if (Optional<EstimatedUnrollCost> Cost =
+              analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI,
+                                    Threshold + DynamicCostSavingsDiscount))
         if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold,
                                 DynamicCostSavingsDiscount, Cost->UnrolledCost,
                                 Cost->RolledDynamicCost)) {
diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll
new file mode 100644
index 00000000000..dd8582e6877
--- /dev/null
+++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define i64 @propagate_loop_phis() {
+; CHECK-LABEL: @propagate_loop_phis(
+; CHECK-NOT: br i1
+; CHECK: ret i64 3
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]
+  %x0 = phi i64 [ 0, %entry ], [ %x2, %loop ]
+  %x1 = or i64 %x0, 1
+  %x2 = or i64 %x1, 2
+  %inc = add nuw nsw i64 %iv, 1
+  %cond = icmp sge i64 %inc, 10
+  br i1 %cond, label %loop.end, label %loop
+
+loop.end:
+  %x.lcssa = phi i64 [ %x2, %loop ]
+  ret i64 %x.lcssa
+}
-- 
2.34.1