DebugInfo: Drop rest of DIDescriptor subclasses
[oota-llvm.git] / lib / Transforms / Scalar / LoopUnrollPass.cpp
index a575c457553f63b0ca6b2202790d01f0a0ad1556..3de3b3d980b96461719b1c6fb15d9f676409dde7 100644
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include <climits>
 
 using namespace llvm;
@@ -41,7 +42,7 @@ UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
   cl::desc("The cut-off point for automatic loop unrolling"));
 
 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
-    "unroll-max-iteration-count-to-analyze", cl::init(1000), cl::Hidden,
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
     cl::desc("Don't allow loop unrolling to simulate more than this number of"
              "iterations when checking full unroll profitability"));
 
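With the default for -unroll-max-iteration-count-to-analyze dropping from 1000 to 0, the iteration-by-iteration profitability simulation below runs for zero iterations unless the flag is set explicitly. As an illustrative (not authoritative) way to exercise it on a standalone test, something like `opt -loop-unroll -unroll-max-iteration-count-to-analyze=100 input.ll -S` should re-enable the analysis; the flag name comes from the definition above, the rest of the command line is assumed.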
@@ -164,6 +165,7 @@ namespace {
       UP.MaxCount = UINT_MAX;
       UP.Partial = CurrentAllowPartial;
       UP.Runtime = CurrentRuntime;
+      UP.AllowExpensiveTripCount = false;
       TTI.getUnrollingPreferences(L, UP);
     }
 
@@ -212,9 +214,8 @@ namespace {
 
       PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
       if (!UserThreshold &&
-          L->getHeader()->getParent()->getAttributes().
-              hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::OptimizeForSize)) {
+          L->getHeader()->getParent()->hasFnAttribute(
+              Attribute::OptimizeForSize)) {
         Threshold = UP.OptSizeThreshold;
         PartialThreshold = UP.PartialOptSizeThreshold;
       }
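The hunk above replaces the long-form attribute lookup with Function::hasFnAttribute. A minimal sketch of the equivalence, assuming only a Function reference named F (the helper itself is hypothetical):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Both queries ask whether F carries the optsize function attribute.
static bool isMarkedOptForSize(const Function &F) {
  bool LongForm = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                                 Attribute::OptimizeForSize);
  bool ShortForm = F.hasFnAttribute(Attribute::OptimizeForSize);
  (void)LongForm; // same answer as ShortForm, just more ceremony
  return ShortForm;
}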
@@ -259,6 +260,7 @@ static bool isLoadFromConstantInitializer(Value *V) {
   return false;
 }
 
+namespace {
 struct FindConstantPointers {
   bool LoadCanBeConstantFolded;
   bool IndexIsConstant;
@@ -330,11 +332,13 @@ class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
   unsigned TripCount;
   ScalarEvolution &SE;
   const TargetTransformInfo &TTI;
-  unsigned NumberOfOptimizedInstructions;
 
   DenseMap<Value *, Constant *> SimplifiedValues;
   DenseMap<LoadInst *, Value *> LoadBaseAddresses;
-  SmallPtrSet<Instruction *, 32> CountedInsns;
+  SmallPtrSet<Instruction *, 32> CountedInstructions;
+
+  /// \brief Count the number of optimized instructions.
+  unsigned NumberOfOptimizedInstructions;
 
   // Provide base case for our instruction visit.
   bool visitInstruction(Instruction &I) { return false; };
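UnrollAnalyzer is built on the InstVisitor CRTP: visit(I) dispatches to the handler matching I's instruction class and falls back to visitInstruction, whose return of false above means "could not simplify". A minimal sketch of the pattern, with a hypothetical Demo visitor:

#include "llvm/IR/InstVisitor.h"
using namespace llvm;

// The second template argument is the return type of every visit method.
struct Demo : public InstVisitor<Demo, bool> {
  // Invoked for add, mul, and the other binary operators.
  bool visitBinaryOperator(BinaryOperator &I) { return true; }
  // Fallback for anything without a more specific handler.
  bool visitInstruction(Instruction &I) { return false; }
};
// Demo().visit(SomeInst) then returns true exactly for binary operators.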
@@ -354,13 +358,14 @@ class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
       if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
         RHS = SimpleRHS;
     Value *SimpleV = nullptr;
+    const DataLayout &DL = I.getModule()->getDataLayout();
     if (auto FI = dyn_cast<FPMathOperator>(&I))
       SimpleV =
-          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
     else
-      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
 
-    if (SimpleV && CountedInsns.insert(&I).second)
+    if (SimpleV && CountedInstructions.insert(&I).second)
       NumberOfOptimizedInstructions += TTI.getUserCost(&I);
 
     if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
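Both Simplify* calls above now receive the owning module's DataLayout, fetched through I.getModule()->getDataLayout(), presumably because the InstructionSimplify entry points at this revision take the layout directly. A minimal sketch of the same call shape, with a hypothetical helper:

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Fold a binary operation if InstructionSimplify can; returns null otherwise.
static Value *foldBinOp(Instruction &I, Value *LHS, Value *RHS) {
  const DataLayout &DL = I.getModule()->getDataLayout();
  return SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
}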
@@ -450,29 +455,30 @@ public:
 
   // Given a list of loads that could be constant-folded (LoadBaseAddresses),
   // estimate number of optimized instructions after substituting the concrete
-  // values for the given Iteration.
-  // Fill in SimplifiedInsns map for future use in DCE-estimation.
-  unsigned estimateNumberOfSimplifiedInsns(unsigned Iteration) {
-    SmallVector<Instruction *, 8> Worklist;
+  // values for the given Iteration. Also track how many instructions become
+  // dead through this process.
+  unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {
+    // We keep a set vector for the worklist so that we don't waste space in the
+    // worklist queuing up the same instruction repeatedly. This can happen due
+    // to multiple operands being the same instruction or due to the same
+    // instruction being an operand of lots of things that end up dead or
+    // simplified.
+    SmallSetVector<Instruction *, 8> Worklist;
+
+    // Clear the simplified values and counts for this iteration.
     SimplifiedValues.clear();
-    CountedInsns.clear();
-
+    CountedInstructions.clear();
     NumberOfOptimizedInstructions = 0;
+
     // We start by adding all loads to the worklist.
-    for (auto LoadDescr : LoadBaseAddresses) {
+    for (auto &LoadDescr : LoadBaseAddresses) {
       LoadInst *LI = LoadDescr.first;
       SimplifiedValues[LI] = computeLoadValue(LI, Iteration);
-      if (CountedInsns.insert(LI).second)
+      if (CountedInstructions.insert(LI).second)
         NumberOfOptimizedInstructions += TTI.getUserCost(LI);
 
-      for (auto U : LI->users()) {
-        Instruction *UI = dyn_cast<Instruction>(U);
-        if (!UI)
-          continue;
-        if (!L->contains(UI))
-          continue;
-        Worklist.push_back(UI);
-      }
+      for (User *U : LI->users())
+        Worklist.insert(cast<Instruction>(U));
     }
 
     // And then we try to simplify every user of every instruction from the
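As the comment in this hunk explains, SmallSetVector gives the worklist set semantics on insert while keeping a deterministic pop order. A tiny standalone sketch of that behaviour (demoWorklist and its arguments are illustrative):

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void demoWorklist(Instruction *A, Instruction *B) {
  SmallSetVector<Instruction *, 8> Worklist;
  Worklist.insert(A);   // inserted
  Worklist.insert(B);   // inserted
  Worklist.insert(A);   // duplicate: ignored, no second queue slot
  while (!Worklist.empty())
    (void)Worklist.pop_back_val(); // visits B, then A, each exactly once
}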
@@ -480,65 +486,64 @@ public:
     // its users as well.
     while (!Worklist.empty()) {
       Instruction *I = Worklist.pop_back_val();
+      if (!L->contains(I))
+        continue;
       if (!visit(I))
         continue;
-      for (auto U : I->users()) {
-        Instruction *UI = dyn_cast<Instruction>(U);
-        if (!UI)
-          continue;
-        if (!L->contains(UI))
-          continue;
-        Worklist.push_back(UI);
-      }
+      for (User *U : I->users())
+        Worklist.insert(cast<Instruction>(U));
     }
-    return NumberOfOptimizedInstructions;
-  }
 
-  // Given a list of potentially simplifed instructions, estimate number of
-  // instructions that would become dead if we do perform the simplification.
-  unsigned estimateNumberOfDeadInsns() {
-    NumberOfOptimizedInstructions = 0;
-    SmallVector<Instruction *, 8> Worklist;
-    DenseMap<Instruction *, bool> DeadInstructions;
+    // Now that we know the potentially simplified instructions, estimate the
+    // number of instructions that would become dead if we perform the
+    // simplification.
+
+    // The dead instructions are held in a separate set. This is used to
+    // prevent us from re-examining instructions and to make sure we only
+    // count the benefit once. The worklist's internal set handles insertion
+    // deduplication.
+    SmallPtrSet<Instruction *, 16> DeadInstructions;
+
+    // Lambda to enqueue operands onto the worklist.
+    auto EnqueueOperands = [&](Instruction &I) {
+      for (auto *Op : I.operand_values())
+        if (auto *OpI = dyn_cast<Instruction>(Op))
+          if (!OpI->use_empty())
+            Worklist.insert(OpI);
+    };
+
     // Start by initializing worklist with simplified instructions.
-    for (auto Folded : SimplifiedValues) {
-      if (auto FoldedInsn = dyn_cast<Instruction>(Folded.first)) {
-        Worklist.push_back(FoldedInsn);
-        DeadInstructions[FoldedInsn] = true;
+    for (auto &FoldedKeyValue : SimplifiedValues)
+      if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
+        DeadInstructions.insert(FoldedInst);
+
+        // Add each instruction operand of this dead instruction to the
+        // worklist.
+        EnqueueOperands(*FoldedInst);
       }
-    }
+
     // If a definition of an insn is only used by simplified or dead
     // instructions, it's also dead. Check defs of all instructions from the
     // worklist.
     while (!Worklist.empty()) {
-      Instruction *FoldedInsn = Worklist.pop_back_val();
-      for (Value *Op : FoldedInsn->operands()) {
-        if (auto I = dyn_cast<Instruction>(Op)) {
-          if (!L->contains(I))
-            continue;
-          if (SimplifiedValues[I])
-            continue; // This insn has been counted already.
-          if (I->getNumUses() == 0)
-            continue;
-          bool AllUsersFolded = true;
-          for (auto U : I->users()) {
-            Instruction *UI = dyn_cast<Instruction>(U);
-            if (!SimplifiedValues[UI] && !DeadInstructions[UI]) {
-              AllUsersFolded = false;
-              break;
-            }
-          }
-          if (AllUsersFolded) {
-            NumberOfOptimizedInstructions += TTI.getUserCost(I);
-            Worklist.push_back(I);
-            DeadInstructions[I] = true;
-          }
-        }
+      Instruction *I = Worklist.pop_back_val();
+      if (!L->contains(I))
+        continue;
+      if (DeadInstructions.count(I))
+        continue;
+
+      if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
+            return DeadInstructions.count(cast<Instruction>(U));
+          })) {
+        NumberOfOptimizedInstructions += TTI.getUserCost(I);
+        DeadInstructions.insert(I);
+        EnqueueOperands(*I);
       }
     }
     return NumberOfOptimizedInstructions;
   }
 };
+} // namespace
 
 // Complete loop unrolling can make some loads constant, and we need to know if
 // that would expose any further optimization opportunities.
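To make the dead-code walk above concrete with hypothetical values: if an iteration folds %mul = mul %load, %step to a constant, %mul goes into DeadInstructions and its operands are enqueued; if %mul was the only user of %load, the all_of check then counts %load as dead too and enqueues its operands in turn, so each instruction is charged via TTI.getUserCost at most once.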
@@ -562,10 +567,10 @@ approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
   unsigned IterationsNumberForEstimate =
       std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
   unsigned NumberOfOptimizedInstructions = 0;
-  for (unsigned i = 0; i < IterationsNumberForEstimate; ++i) {
-    NumberOfOptimizedInstructions += UA.estimateNumberOfSimplifiedInsns(i);
-    NumberOfOptimizedInstructions += UA.estimateNumberOfDeadInsns();
-  }
+  for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
+    NumberOfOptimizedInstructions +=
+        UA.estimateNumberOfOptimizedInstructions(i);
+
   NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;
 
   return NumberOfOptimizedInstructions;
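Worked through with illustrative numbers: a TripCount of 100 and an analysis cap of 10 means ten iterations are simulated, their savings summed, and the total scaled by 100 / 10 = 10. The scaling uses integer division, so a cap that does not divide the trip count evenly (a cap of 8 would scale by 12 rather than 12.5) under-extrapolates slightly.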
@@ -618,6 +623,11 @@ static bool HasUnrollDisablePragma(const Loop *L) {
   return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
 }
 
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
 // If loop has an unroll_count pragma return the (necessarily
 // positive) value from the pragma.  Otherwise return 0.
 static unsigned UnrollCountPragmaValue(const Loop *L) {
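The new HasRuntimeUnrollDisablePragma helper keys off the "llvm.loop.unroll.runtime.disable" loop metadata. As a hedged sketch, assuming LLVM's usual self-referential loop-ID convention, a transform could attach that metadata roughly like this (the helper name is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Illustrative helper: mark L so that runtime unrolling is skipped.
static void disableRuntimeUnroll(Loop *L, LLVMContext &Ctx) {
  SmallVector<Metadata *, 2> Ops;
  Ops.push_back(nullptr); // slot 0 is reserved for the self-reference
  Ops.push_back(MDNode::get(
      Ctx, MDString::get(Ctx, "llvm.loop.unroll.runtime.disable")));
  MDNode *LoopID = MDNode::get(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID); // loop IDs reference themselves
  L->setLoopID(LoopID);
}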
@@ -806,6 +816,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   // Reduce count based on the type of unrolling and the threshold values.
   unsigned OriginalCount = Count;
   bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
+  if (HasRuntimeUnrollDisablePragma(L)) {
+    AllowRuntime = false;
+  }
   if (Unrolling == Partial) {
     bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
     if (!AllowPartial && !CountSetExplicitly) {
@@ -874,8 +887,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Unroll the loop.
-  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
-                  &LPM, &AC))
+  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount,
+                  TripMultiple, LI, this, &LPM, &AC))
     return false;
 
   return true;