X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FLoopUnrollPass.cpp;h=0e4462618aaf0bdd9fb0db1cb0a4c1d3e540a855;hp=bd9da14f66931e4835c0f18a9c84a5fb3ed9b68a;hb=41591e7e3004e7ad02a3fdf237866b871288442b;hpb=710ce5d36b44ad5bbb3cd322a6485ec170c5fcc8

diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index bd9da14f669..0e4462618aa 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,8 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -22,14 +25,13 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include <climits>
 
 using namespace llvm;
@@ -37,11 +39,22 @@ using namespace llvm;
 #define DEBUG_TYPE "loop-unroll"
 
 static cl::opt<unsigned>
-UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
-                cl::desc("The cut-off point for automatic loop unrolling"));
+    UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
+                    cl::desc("The baseline cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
+    "unroll-percent-dynamic-cost-saved-threshold", cl::init(20), cl::Hidden,
+    cl::desc("The percentage of estimated dynamic cost which must be saved by "
+             "unrolling to allow unrolling up to the max threshold."));
+
+static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
+    "unroll-dynamic-cost-savings-discount", cl::init(2000), cl::Hidden,
+    cl::desc("This is the amount discounted from the total unroll cost when "
+             "the unrolled form has a high dynamic cost savings (triggered by "
+             "the '-unroll-percent-dynamic-cost-saved-threshold' flag)."));
 
 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
-    "unroll-max-iteration-count-to-analyze", cl::init(1000), cl::Hidden,
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
     cl::desc("Don't allow loop unrolling to simulate more than this number of "
              "iterations when checking full unroll profitability"));
 
@@ -70,11 +83,18 @@ namespace {
     static char ID; // Pass ID, replacement for typeid
     LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
       CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
+      CurrentPercentDynamicCostSavedThreshold =
+          UnrollPercentDynamicCostSavedThreshold;
+      CurrentDynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
      CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
      CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
      CurrentRuntime = (R == -1) ?
UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserPercentDynamicCostSavedThreshold = + (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0); + UserDynamicCostSavingsDiscount = + (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0); UserAllowPartial = (P != -1) || (UnrollAllowPartial.getNumOccurrences() > 0); UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); @@ -98,12 +118,18 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; - bool CurrentAllowPartial; - bool CurrentRuntime; - bool UserCount; // CurrentCount is user-specified. - bool UserThreshold; // CurrentThreshold is user-specified. - bool UserAllowPartial; // CurrentAllowPartial is user-specified. - bool UserRuntime; // CurrentRuntime is user-specified. + unsigned CurrentPercentDynamicCostSavedThreshold; + unsigned CurrentDynamicCostSavingsDiscount; + bool CurrentAllowPartial; + bool CurrentRuntime; + + // Flags for whether the 'current' settings are user-specified. + bool UserCount; + bool UserThreshold; + bool UserPercentDynamicCostSavedThreshold; + bool UserDynamicCostSavingsDiscount; + bool UserAllowPartial; + bool UserRuntime; bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -112,20 +138,22 @@ namespace { /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. AU.addPreserved(); + AU.addPreserved(); } // Fill in the UnrollingPreferences parameter with values from the @@ -133,6 +161,9 @@ namespace { void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, TargetTransformInfo::UnrollingPreferences &UP) { UP.Threshold = CurrentThreshold; + UP.PercentDynamicCostSavedThreshold = + CurrentPercentDynamicCostSavedThreshold; + UP.DynamicCostSavingsDiscount = CurrentDynamicCostSavingsDiscount; UP.OptSizeThreshold = OptSizeUnrollThreshold; UP.PartialThreshold = CurrentThreshold; UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; @@ -140,6 +171,7 @@ namespace { UP.MaxCount = UINT_MAX; UP.Partial = CurrentAllowPartial; UP.Runtime = CurrentRuntime; + UP.AllowExpensiveTripCount = false; TTI.getUnrollingPreferences(L, UP); } @@ -157,10 +189,11 @@ namespace { // total unrolled size. Parameters Threshold and PartialThreshold // are set to the maximum unrolled size for fully and partially // unrolled loops respectively. - void selectThresholds(const Loop *L, bool HasPragma, + void selectThresholds(const Loop *L, bool UsePragmaThreshold, const TargetTransformInfo::UnrollingPreferences &UP, unsigned &Threshold, unsigned &PartialThreshold, - unsigned NumberOfSimplifiedInstructions) { + unsigned &PercentDynamicCostSavedThreshold, + unsigned &DynamicCostSavingsDiscount) { // Determine the current unrolling threshold. While this is // normally set from UnrollThreshold, it is overridden to a // smaller value if the current function is marked as @@ -168,14 +201,22 @@ namespace { // specified. Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; PartialThreshold = UserThreshold ? 
CurrentThreshold : UP.PartialThreshold; + PercentDynamicCostSavedThreshold = + UserPercentDynamicCostSavedThreshold + ? CurrentPercentDynamicCostSavedThreshold + : UP.PercentDynamicCostSavedThreshold; + DynamicCostSavingsDiscount = UserDynamicCostSavingsDiscount + ? CurrentDynamicCostSavingsDiscount + : UP.DynamicCostSavingsDiscount; + if (!UserThreshold && - L->getHeader()->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) { + // FIXME: Use Function::optForSize(). + L->getHeader()->getParent()->hasFnAttribute( + Attribute::OptimizeForSize)) { Threshold = UP.OptSizeThreshold; PartialThreshold = UP.PartialOptSizeThreshold; } - if (HasPragma) { + if (UsePragmaThreshold) { // If the loop has an unrolling pragma, we want to be more // aggressive with unrolling limits. Set thresholds to at // least the PragmaTheshold value which is larger than the @@ -186,8 +227,11 @@ namespace { PartialThreshold = std::max(PartialThreshold, PragmaUnrollThreshold); } - Threshold += NumberOfSimplifiedInstructions; } + bool canUnrollCompletely(Loop *L, unsigned Threshold, + unsigned PercentDynamicCostSavedThreshold, + unsigned DynamicCostSavingsDiscount, + uint64_t UnrolledCost, uint64_t RolledDynamicCost); }; } @@ -195,10 +239,11 @@ char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, @@ -210,60 +255,7 @@ Pass *llvm::createSimpleLoopUnrollPass() { return llvm::createLoopUnrollPass(-1, -1, 0, 0); } -static bool IsLoadFromConstantInitializer(Value *V) { - if (GlobalVariable *GV = dyn_cast(V)) - if (GV->isConstant() && GV->hasDefinitiveInitializer()) - return GV->getInitializer(); - return false; -} - -struct FindConstantPointers { - bool LoadCanBeConstantFolded; - bool IndexIsConstant; - APInt Step; - APInt StartValue; - Value *BaseAddress; - const Loop *L; - ScalarEvolution &SE; - FindConstantPointers(const Loop *loop, ScalarEvolution &SE) - : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {} - - bool follow(const SCEV *S) { - if (const SCEVUnknown *SC = dyn_cast(S)) { - // We've reached the leaf node of SCEV, it's most probably just a - // variable. Now it's time to see if it corresponds to a global constant - // global (in which case we can eliminate the load), or not. - BaseAddress = SC->getValue(); - LoadCanBeConstantFolded = - IndexIsConstant && IsLoadFromConstantInitializer(BaseAddress); - return false; - } - if (isa(S)) - return true; - if (const SCEVAddRecExpr *AR = dyn_cast(S)) { - // If the current SCEV expression is AddRec, and its loop isn't the loop - // we are about to unroll, then we won't get a constant address after - // unrolling, and thus, won't be able to eliminate the load. - if (AR->getLoop() != L) - return IndexIsConstant = false; - // If the step isn't constant, we won't get constant addresses in unrolled - // version. Bail out. 
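How the three knobs wired through selectThresholds() are meant to combine can be summarized with a minimal, self-contained sketch. It mirrors the intent of LoopUnroll::canUnrollCompletely() defined later in this patch; the standalone form and the function name are illustrative only.

  #include <cstdint>

  // Sketch: a loop whose unrolled size exceeds the plain Threshold may still
  // be fully unrolled if the simulated unrolled form saves a large enough
  // percentage of the rolled loop's dynamic cost; in that case the size
  // budget is extended by the dynamic-cost-savings discount.
  static bool sketchCanUnrollCompletely(uint64_t UnrolledCost,
                                        uint64_t RolledDynamicCost,
                                        unsigned Threshold,
                                        unsigned PercentSavedThreshold,
                                        unsigned SavingsDiscount) {
    if (UnrolledCost <= Threshold)
      return true;
    // The caller is expected to guarantee RolledDynamicCost >= UnrolledCost > 0.
    uint64_t PercentSaved =
        (RolledDynamicCost - UnrolledCost) * 100 / RolledDynamicCost;
    return PercentSaved >= PercentSavedThreshold &&
           UnrolledCost <= uint64_t(Threshold) + SavingsDiscount;
  }

With the defaults introduced above, a loop that folds most of its body away after unrolling (well over 20% dynamic savings) earns an extra 2000 units of size budget on top of the baseline threshold of 150.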
- if (const SCEVConstant *StepSE = - dyn_cast(AR->getStepRecurrence(SE))) - Step = StepSE->getValue()->getValue(); - else - return IndexIsConstant = false; - - return IndexIsConstant; - } - // If Result is true, continue traversal. - // Otherwise, we have found something that prevents us from (possible) load - // elimination. - return IndexIsConstant; - } - bool isDone() const { return !IndexIsConstant; } -}; - +namespace { // This class is used to get an estimate of the optimization effects that we // could get from complete loop unrolling. It comes from the fact that some // loads might be replaced with concrete constant values and that could trigger @@ -280,29 +272,112 @@ struct FindConstantPointers { // v = b[0]* 0 + b[1]* 1 + b[2]* 0 // And finally: // v = b[1] -class UnrollAnalyzer : public InstVisitor { - typedef InstVisitor Base; - friend class InstVisitor; +class UnrolledInstAnalyzer : private InstVisitor { + typedef InstVisitor Base; + friend class InstVisitor; + struct SimplifiedAddress { + Value *Base = nullptr; + ConstantInt *Offset = nullptr; + }; + +public: + UnrolledInstAnalyzer(unsigned Iteration, + DenseMap &SimplifiedValues, + const Loop *L, ScalarEvolution &SE) + : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { + IterationNumber = SE.getConstant(APInt(64, Iteration)); + } + + // Allow access to the initial visit method. + using Base::visit; + +private: + /// \brief A cache of pointer bases and constant-folded offsets corresponding + /// to GEP (or derived from GEP) instructions. + /// + /// In order to find the base pointer one needs to perform non-trivial + /// traversal of the corresponding SCEV expression, so it's good to have the + /// results saved. + DenseMap SimplifiedAddresses; + + /// \brief Number of currently simulated iteration. + /// + /// If an expression is ConstAddress+Constant, then the Constant is + /// Start + Iteration*Step, where Start and Step could be obtained from + /// SCEVGEPCache. + unsigned Iteration; + + /// \brief SCEV expression corresponding to number of currently simulated + /// iteration. + const SCEV *IterationNumber; + + /// \brief A Value->Constant map for keeping values that we managed to + /// constant-fold on the given iteration. + /// + /// While we walk the loop instructions, we build up and maintain a mapping + /// of simplified values specific to this iteration. The idea is to propagate + /// any special information we have about loads that can be replaced with + /// constants after complete unrolling, and account for likely simplifications + /// post-unrolling. + DenseMap &SimplifiedValues; const Loop *L; - unsigned TripCount; ScalarEvolution &SE; - const TargetTransformInfo &TTI; - unsigned NumberOfOptimizedInstructions; - DenseMap SimplifiedValues; - DenseMap LoadBaseAddresses; - SmallPtrSet CountedInsns; - - // Provide base case for our instruction visit. - bool visitInstruction(Instruction &I) { return false; }; - // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt, - // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select, - // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue. - // - // Probaly it's worth to hoist the code for estimating the simplifications - // effects to a separate class, since we have a very similar code in - // InlineCost already. + /// \brief Try to simplify instruction \param I using its SCEV expression. 
+ /// + /// The idea is that some AddRec expressions become constants, which then + /// could trigger folding of other instructions. However, that only happens + /// for expressions whose start value is also constant, which isn't always the + /// case. In another common and important case the start value is just some + /// address (i.e. SCEVUnknown) - in this case we compute the offset and save + /// it along with the base address instead. + bool simplifyInstWithSCEV(Instruction *I) { + if (!SE.isSCEVable(I->getType())) + return false; + + const SCEV *S = SE.getSCEV(I); + if (auto *SC = dyn_cast(S)) { + SimplifiedValues[I] = SC->getValue(); + return true; + } + + auto *AR = dyn_cast(S); + if (!AR) + return false; + + const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE); + // Check if the AddRec expression becomes a constant. + if (auto *SC = dyn_cast(ValueAtIteration)) { + SimplifiedValues[I] = SC->getValue(); + return true; + } + + // Check if the offset from the base address becomes a constant. + auto *Base = dyn_cast(SE.getPointerBase(S)); + if (!Base) + return false; + auto *Offset = + dyn_cast(SE.getMinusSCEV(ValueAtIteration, Base)); + if (!Offset) + return false; + SimplifiedAddress Address; + Address.Base = Base->getValue(); + Address.Offset = Offset->getValue(); + SimplifiedAddresses[I] = Address; + return true; + } + + /// Base case for the instruction visitor. + bool visitInstruction(Instruction &I) { + return simplifyInstWithSCEV(&I); + } + + /// Try to simplify binary operator I. + /// + /// TODO: Probably it's worth to hoist the code for estimating the + /// simplifications effects to a separate class, since we have a very similar + /// code in InlineCost already. bool visitBinaryOperator(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); if (!isa(LHS)) @@ -311,217 +386,304 @@ class UnrollAnalyzer : public InstVisitor { if (!isa(RHS)) if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) RHS = SimpleRHS; - Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS); - if (SimpleV && CountedInsns.insert(&I).second) - NumberOfOptimizedInstructions += TTI.getUserCost(&I); + Value *SimpleV = nullptr; + const DataLayout &DL = I.getModule()->getDataLayout(); + if (auto FI = dyn_cast(&I)) + SimpleV = + SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); + else + SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); - if (Constant *C = dyn_cast_or_null(SimpleV)) { + if (Constant *C = dyn_cast_or_null(SimpleV)) SimplifiedValues[&I] = C; + + if (SimpleV) return true; - } - return false; + return Base::visitBinaryOperator(I); } - Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) { - if (!LI) - return nullptr; - Value *BaseAddr = LoadBaseAddresses[LI]; - if (!BaseAddr) - return nullptr; + /// Try to fold load I. + bool visitLoad(LoadInst &I) { + Value *AddrOp = I.getPointerOperand(); - auto GV = dyn_cast(BaseAddr); - if (!GV) - return nullptr; + auto AddressIt = SimplifiedAddresses.find(AddrOp); + if (AddressIt == SimplifiedAddresses.end()) + return false; + ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; + + auto *GV = dyn_cast(AddressIt->second.Base); + // We're only interested in loads that can be completely folded to a + // constant. 
+ if (!GV || !GV->hasInitializer() || !GV->isConstant()) + return false; ConstantDataSequential *CDS = dyn_cast(GV->getInitializer()); if (!CDS) - return nullptr; - - const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr); - const SCEV *S = SE.getSCEV(LI->getPointerOperand()); - const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE); + return false; - APInt StepC, StartC; - const SCEVAddRecExpr *AR = dyn_cast(OffSE); - if (!AR) - return nullptr; + int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && + "Unexpectedly large index value."); + int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; + if (Index >= CDS->getNumElements()) { + // FIXME: For now we conservatively ignore out of bound accesses, but + // we're allowed to perform the optimization in this case. + return false; + } - if (const SCEVConstant *StepSE = - dyn_cast(AR->getStepRecurrence(SE))) - StepC = StepSE->getValue()->getValue(); - else - return nullptr; + Constant *CV = CDS->getElementAsConstant(Index); + assert(CV && "Constant expected."); + SimplifiedValues[&I] = CV; - if (const SCEVConstant *StartSE = dyn_cast(AR->getStart())) - StartC = StartSE->getValue()->getValue(); - else - return nullptr; + return true; + } - unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - unsigned Start = StartC.getLimitedValue(); - unsigned Step = StepC.getLimitedValue(); + bool visitCastInst(CastInst &I) { + // Propagate constants through casts. + Constant *COp = dyn_cast(I.getOperand(0)); + if (!COp) + COp = SimplifiedValues.lookup(I.getOperand(0)); + if (COp) + if (Constant *C = + ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { + SimplifiedValues[&I] = C; + return true; + } - unsigned Index = (Start + Step * Iteration) / ElemSize; - if (Index >= CDS->getNumElements()) - return nullptr; + return Base::visitCastInst(I); + } - Constant *CV = CDS->getElementAsConstant(Index); + bool visitCmpInst(CmpInst &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - return CV; - } + // First try to handle simplified comparisons. + if (!isa(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; -public: - UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE, - const TargetTransformInfo &TTI) - : L(L), TripCount(TripCount), SE(SE), TTI(TTI), - NumberOfOptimizedInstructions(0) {} - - // Visit all loads the loop L, and for those that, after complete loop - // unrolling, would have a constant address and it will point to a known - // constant initializer, record its base address for future use. It is used - // when we estimate number of potentially simplified instructions. 
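The load-folding step in visitLoad() above reduces to indexing a constant global's initializer by a constant byte offset. A minimal standalone sketch of that idea follows, using only the LLVM C++ API; the helper name is invented here and is not part of the patch.

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/GlobalVariable.h"
  #include <cstdint>
  using namespace llvm;

  // Return the element of a constant global array addressed by ByteOffset,
  // or nullptr if the load cannot be folded to a constant.
  static Constant *foldLoadFromConstantGlobal(GlobalVariable *GV,
                                              uint64_t ByteOffset) {
    if (!GV || !GV->isConstant() || !GV->hasInitializer())
      return nullptr;
    auto *CDS = dyn_cast<ConstantDataSequential>(GV->getInitializer());
    if (!CDS)
      return nullptr;
    uint64_t ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8;
    if (!ElemSize)
      return nullptr;
    uint64_t Index = ByteOffset / ElemSize;
    if (Index >= CDS->getNumElements())
      return nullptr; // stay conservative on out-of-bounds offsets
    return CDS->getElementAsConstant(unsigned(Index));
  }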
- void FindConstFoldableLoads() { - for (auto BB : L->getBlocks()) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - if (LoadInst *LI = dyn_cast(I)) { - if (!LI->isSimple()) - continue; - Value *AddrOp = LI->getPointerOperand(); - const SCEV *S = SE.getSCEV(AddrOp); - FindConstantPointers Visitor(L, SE); - SCEVTraversal T(Visitor); - T.visitAll(S); - if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) { - LoadBaseAddresses[LI] = Visitor.BaseAddress; + if (!isa(LHS) && !isa(RHS)) { + auto SimplifiedLHS = SimplifiedAddresses.find(LHS); + if (SimplifiedLHS != SimplifiedAddresses.end()) { + auto SimplifiedRHS = SimplifiedAddresses.find(RHS); + if (SimplifiedRHS != SimplifiedAddresses.end()) { + SimplifiedAddress &LHSAddr = SimplifiedLHS->second; + SimplifiedAddress &RHSAddr = SimplifiedRHS->second; + if (LHSAddr.Base == RHSAddr.Base) { + LHS = LHSAddr.Offset; + RHS = RHSAddr.Offset; } } } } - } - // Given a list of loads that could be constant-folded (LoadBaseAddresses), - // estimate number of optimized instructions after substituting the concrete - // values for the given Iteration. - // Fill in SimplifiedInsns map for future use in DCE-estimation. - unsigned EstimateNumberOfSimplifiedInsns(unsigned Iteration) { - SmallVector Worklist; - SimplifiedValues.clear(); - CountedInsns.clear(); - - NumberOfOptimizedInstructions = 0; - // We start by adding all loads to the worklist. - for (auto LoadDescr : LoadBaseAddresses) { - LoadInst *LI = LoadDescr.first; - SimplifiedValues[LI] = computeLoadValue(LI, Iteration); - if (CountedInsns.insert(LI).second) - NumberOfOptimizedInstructions += TTI.getUserCost(LI); - - for (auto U : LI->users()) { - Instruction *UI = dyn_cast(U); - if (!UI) - continue; - if (!L->contains(UI)) - continue; - Worklist.push_back(UI); + if (Constant *CLHS = dyn_cast(LHS)) { + if (Constant *CRHS = dyn_cast(RHS)) { + if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { + SimplifiedValues[&I] = C; + return true; + } } } - // And then we try to simplify every user of every instruction from the - // worklist. If we do simplify a user, add it to the worklist to process - // its users as well. - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - if (!visit(I)) - continue; - for (auto U : I->users()) { - Instruction *UI = dyn_cast(U); - if (!UI) - continue; - if (!L->contains(UI)) - continue; - Worklist.push_back(UI); - } - } - return NumberOfOptimizedInstructions; + return Base::visitCmpInst(I); } +}; +} // namespace - // Given a list of potentially simplifed instructions, estimate number of - // instructions that would become dead if we do perform the simplification. - unsigned EstimateNumberOfDeadInsns() { - NumberOfOptimizedInstructions = 0; - SmallVector Worklist; - DenseMap DeadInstructions; - // Start by initializing worklist with simplified instructions. - for (auto Folded : SimplifiedValues) { - if (auto FoldedInsn = dyn_cast(Folded.first)) { - Worklist.push_back(FoldedInsn); - DeadInstructions[FoldedInsn] = true; - } + +namespace { +struct EstimatedUnrollCost { + /// \brief The estimated cost after unrolling. + int UnrolledCost; + + /// \brief The estimated dynamic cost of executing the instructions in the + /// rolled form. + int RolledDynamicCost; +}; +} + +/// \brief Figure out if the loop is worth full unrolling. +/// +/// Complete loop unrolling can make some loads constant, and we need to know +/// if that would expose any further optimization opportunities. 
This routine +/// estimates this optimization. It computes cost of unrolled loop +/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By +/// dynamic cost we mean that we won't count costs of blocks that are known not +/// to be executed (i.e. if we have a branch in the loop and we know that at the +/// given iteration its condition would be resolved to true, we won't add up the +/// cost of the 'false'-block). +/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If +/// the analysis failed (no benefits expected from the unrolling, or the loop is +/// too big to analyze), the returned value is None. +static Optional +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + int MaxUnrolledLoopSize) { + // We want to be able to scale offsets by the trip count and add more offsets + // to them without checking for overflows, and we already don't want to + // analyze *massive* trip counts, so we force the max to be reasonably small. + assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && + "The unroll iterations max is too large!"); + + // Don't simulate loops with a big or unknown tripcount + if (!UnrollMaxIterationsCountToAnalyze || !TripCount || + TripCount > UnrollMaxIterationsCountToAnalyze) + return None; + + SmallSetVector BBWorklist; + DenseMap SimplifiedValues; + SmallVector, 4> SimplifiedInputValues; + + // The estimated cost of the unrolled form of the loop. We try to estimate + // this by simplifying as much as we can while computing the estimate. + int UnrolledCost = 0; + // We also track the estimated dynamic (that is, actually executed) cost in + // the rolled form. This helps identify cases when the savings from unrolling + // aren't just exposing dead control flows, but actual reduced dynamic + // instructions due to the simplifications which we expect to occur after + // unrolling. + int RolledDynamicCost = 0; + + // Ensure that we don't violate the loop structure invariants relied on by + // this analysis. + assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); + assert(L->isLCSSAForm(DT) && + "Must have loops in LCSSA form to track live-out values."); + + DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); + + // Simulate execution of each iteration of the loop counting instructions, + // which would be simplified. + // Since the same load will take different values on different iterations, + // we literally have to go through all loop's iterations. + for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + + // Prepare for the iteration by collecting any simplified entry or backedge + // inputs. + for (Instruction &I : *L->getHeader()) { + auto *PHI = dyn_cast(&I); + if (!PHI) + break; + + // The loop header PHI nodes must have exactly two input: one from the + // loop preheader and one from the loop latch. + assert( + PHI->getNumIncomingValues() == 2 && + "Must have an incoming value only for the preheader and the latch."); + + Value *V = PHI->getIncomingValueForBlock( + Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Constant *C = dyn_cast(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); } - // If a definition of an insn is only used by simplified or dead - // instructions, it's also dead. Check defs of all instructions from the - // worklist. 
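The per-iteration seeding of SimplifiedValues from the loop-header PHIs in analyzeLoopUnrollCost() can be pictured with the sketch below. It is a simplification: the real code first stashes the pairs in SimplifiedInputValues and then repopulates a cleared map, so values left over from the previous simulated iteration cannot leak into the new one. The helper name and the output vector are assumptions of this sketch.

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  #include <utility>
  using namespace llvm;

  // On iteration 0 a header PHI takes its preheader input; on later
  // iterations it takes the latch input, which may already have been
  // simplified to a constant during the previous simulated iteration.
  static void seedHeaderPHIs(const Loop *L, unsigned Iteration,
                             const DenseMap<Value *, Constant *> &Simplified,
                             SmallVectorImpl<std::pair<Value *, Constant *>> &Seeds) {
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break; // PHIs are grouped at the top of the block
      Value *In = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(In);
      if (!C && Iteration != 0)
        C = Simplified.lookup(In);
      if (C)
        Seeds.push_back({PHI, C});
    }
  }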
- while (!Worklist.empty()) { - Instruction *FoldedInsn = Worklist.pop_back_val(); - for (Value *Op : FoldedInsn->operands()) { - if (auto I = dyn_cast(Op)) { - if (!L->contains(I)) - continue; - if (SimplifiedValues[I]) - continue; // This insn has been counted already. - if (I->getNumUses() == 0) + + // Now clear and re-populate the map for the next iteration. + SimplifiedValues.clear(); + while (!SimplifiedInputValues.empty()) + SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); + + BBWorklist.clear(); + BBWorklist.insert(L->getHeader()); + // Note that we *must not* cache the size, this loop grows the worklist. + for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { + BasicBlock *BB = BBWorklist[Idx]; + + // Visit all instructions in the given basic block and try to simplify + // it. We don't change the actual IR, just count optimization + // opportunities. + for (Instruction &I : *BB) { + int InstCost = TTI.getUserCost(&I); + + // Visit the instruction to analyze its loop cost after unrolling, + // and if the visitor returns false, include this instruction in the + // unrolled cost. + if (!Analyzer.visit(I)) + UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } + + // Also track this instructions expected cost when executing the rolled + // loop form. + RolledDynamicCost += InstCost; + + // If unrolled body turns out to be too big, bail out. + if (UnrolledCost > MaxUnrolledLoopSize) { + DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); + return None; + } + } + + TerminatorInst *TI = BB->getTerminator(); + + // Add in the live successors by first checking whether we have terminator + // that may be simplified based on the values simplified by this call. + if (BranchInst *BI = dyn_cast(TI)) { + if (BI->isConditional()) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(BI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa(SimpleCond)) + Succ = BI->getSuccessor(0); + else + Succ = BI->getSuccessor( + cast(SimpleCond)->isZero() ? 1 : 0); + if (L->contains(Succ)) + BBWorklist.insert(Succ); continue; - bool AllUsersFolded = true; - for (auto U : I->users()) { - Instruction *UI = dyn_cast(U); - if (!SimplifiedValues[UI] && !DeadInstructions[UI]) { - AllUsersFolded = false; - break; - } - } - if (AllUsersFolded) { - NumberOfOptimizedInstructions += TTI.getUserCost(I); - Worklist.push_back(I); - DeadInstructions[I] = true; } } + } else if (SwitchInst *SI = dyn_cast(TI)) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(SI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa(SimpleCond)) + Succ = SI->getSuccessor(0); + else + Succ = SI->findCaseValue(cast(SimpleCond)) + .getCaseSuccessor(); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } } + + // Add BB's successors to the worklist. + for (BasicBlock *Succ : successors(BB)) + if (L->contains(Succ)) + BBWorklist.insert(Succ); } - return NumberOfOptimizedInstructions; - } -}; -// Complete loop unrolling can make some loads constant, and we need to know if -// that would expose any further optimization opportunities. 
-// This routine estimates this optimization effect and returns the number of -// instructions, that potentially might be optimized away. -static unsigned -ApproximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE, - unsigned TripCount, - const TargetTransformInfo &TTI) { - if (!TripCount) - return 0; - - UnrollAnalyzer UA(L, TripCount, SE, TTI); - UA.FindConstFoldableLoads(); - - // Estimate number of instructions, that could be simplified if we replace a - // load with the corresponding constant. Since the same load will take - // different values on different iterations, we have to go through all loop's - // iterations here. To limit ourselves here, we check only first N - // iterations, and then scale the found number, if necessary. - unsigned IterationsNumberForEstimate = - std::min(UnrollMaxIterationsCountToAnalyze, TripCount); - unsigned NumberOfOptimizedInstructions = 0; - for (unsigned i = 0; i < IterationsNumberForEstimate; ++i) { - NumberOfOptimizedInstructions += UA.EstimateNumberOfSimplifiedInsns(i); - NumberOfOptimizedInstructions += UA.EstimateNumberOfDeadInsns(); + // If we found no optimization opportunities on the first iteration, we + // won't find them on later ones too. + if (UnrolledCost == RolledDynamicCost) { + DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); + return None; + } } - NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate; - - return NumberOfOptimizedInstructions; + DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); + return {{UnrolledCost, RolledDynamicCost}}; } /// ApproximateLoopSize - Approximate the size of the loop. @@ -566,11 +728,22 @@ static bool HasUnrollFullPragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} + // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); } +// Returns true if the loop has an runtime unroll(disable) pragma. +static bool HasRuntimeUnrollDisablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable"); +} + // If loop has an unroll_count pragma return the (necessarily // positive) value from the pragma. Otherwise return 0. 
static unsigned UnrollCountPragmaValue(const Loop *L) { @@ -622,6 +795,59 @@ static void SetLoopAlreadyUnrolled(Loop *L) { L->setLoopID(NewLoopID); } +bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, + unsigned PercentDynamicCostSavedThreshold, + unsigned DynamicCostSavingsDiscount, + uint64_t UnrolledCost, + uint64_t RolledDynamicCost) { + + if (Threshold == NoThreshold) { + DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n"); + return true; + } + + if (UnrolledCost <= Threshold) { + DEBUG(dbgs() << " Can fully unroll, because unrolled cost: " + << UnrolledCost << "<" << Threshold << "\n"); + return true; + } + + assert(UnrolledCost && "UnrolledCost can't be 0 at this point."); + assert(RolledDynamicCost >= UnrolledCost && + "Cannot have a higher unrolled cost than a rolled cost!"); + + // Compute the percentage of the dynamic cost in the rolled form that is + // saved when unrolled. If unrolling dramatically reduces the estimated + // dynamic cost of the loop, we use a higher threshold to allow more + // unrolling. + unsigned PercentDynamicCostSaved = + (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost; + + if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold && + (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= + (int64_t)Threshold) { + DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the " + "expected dynamic cost by " << PercentDynamicCostSaved + << "% (threshold: " << PercentDynamicCostSavedThreshold + << "%)\n" + << " and the unrolled cost (" << UnrolledCost + << ") is less than the max threshold (" + << DynamicCostSavingsDiscount << ").\n"); + return true; + } + + DEBUG(dbgs() << " Too large to fully unroll:\n"); + DEBUG(dbgs() << " Threshold: " << Threshold << "\n"); + DEBUG(dbgs() << " Max threshold: " << DynamicCostSavingsDiscount << "\n"); + DEBUG(dbgs() << " Percent cost saved threshold: " + << PercentDynamicCostSavedThreshold << "%\n"); + DEBUG(dbgs() << " Unrolled cost: " << UnrolledCost << "\n"); + DEBUG(dbgs() << " Rolled dynamic cost: " << RolledDynamicCost << "\n"); + DEBUG(dbgs() << " Percent cost saved: " << PercentDynamicCostSaved + << "\n"); + return false; +} + unsigned LoopUnroll::selectUnrollCount( const Loop *L, unsigned TripCount, bool PragmaFullUnroll, unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, @@ -633,7 +859,7 @@ unsigned LoopUnroll::selectUnrollCount( unsigned Count = UserCount ? CurrentCount : 0; // If there is no user-specified count, unroll pragmas have the next - // highest precendence. + // highest precedence. 
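That precedence order, an explicit -unroll-count flag first, then a count or full-unroll pragma, and only then the heuristic count, can be condensed into a small standalone sketch; the parameter names are invented, and DefaultCount stands in for whatever the caller derives from UP.Count or the trip count.

  // Sketch of the count selection performed by selectUnrollCount().
  static unsigned chooseUnrollCount(unsigned UserCount, unsigned PragmaCount,
                                    bool PragmaFullUnroll, unsigned TripCount,
                                    unsigned DefaultCount) {
    if (UserCount)
      return UserCount;    // -unroll-count=N wins over everything else
    if (PragmaCount)
      return PragmaCount;  // #pragma clang loop unroll_count(N)
    if (PragmaFullUnroll && TripCount)
      return TripCount;    // unroll(full) with a known trip count
    return DefaultCount;
  }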
if (Count == 0) { if (PragmaCount) { Count = PragmaCount; @@ -668,8 +894,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { Function &F = *L->getHeader()->getParent(); + auto &DT = getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis(); + ScalarEvolution *SE = &getAnalysis().getSE(); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); auto &AC = getAnalysis().getAssumptionCache(F); @@ -682,8 +909,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } bool PragmaFullUnroll = HasUnrollFullPragma(L); + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaCount > 0; + bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; getUnrollingPreferences(L, TTI, UP); @@ -728,27 +956,43 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } - unsigned NumberOfOptimizedInstructions = - ApproximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI); - DEBUG(dbgs() << " Complete unrolling could save: " - << NumberOfOptimizedInstructions << "\n"); - unsigned Threshold, PartialThreshold; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, - NumberOfOptimizedInstructions); + unsigned PercentDynamicCostSavedThreshold; + unsigned DynamicCostSavingsDiscount; + // Only use the high pragma threshold when we have a target unroll factor such + // as with "#pragma unroll N" or a pragma indicating full unrolling and the + // trip count is known. Otherwise we rely on the standard threshold to + // heuristically select a reasonable unroll count. + bool UsePragmaThreshold = + PragmaCount > 0 || + ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0); + + selectThresholds(L, UsePragmaThreshold, UP, Threshold, PartialThreshold, + PercentDynamicCostSavedThreshold, + DynamicCostSavingsDiscount); // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. enum { Full = 0, Partial = 1, Runtime = 2 }; int Unrolling; if (TripCount && Count == TripCount) { - if (Threshold != NoThreshold && UnrolledSize > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << UnrolledSize << ">" << Threshold - << "\n"); - Unrolling = Partial; - } else { + Unrolling = Partial; + // If the loop is really small, we don't need to run an expensive analysis. + if (canUnrollCompletely(L, Threshold, 100, DynamicCostSavingsDiscount, + UnrolledSize, UnrolledSize)) { Unrolling = Full; + } else { + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. + if (Optional Cost = + analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI, + Threshold + DynamicCostSavingsDiscount)) + if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, + DynamicCostSavingsDiscount, Cost->UnrolledCost, + Cost->RolledDynamicCost)) { + Unrolling = Full; + } } } else if (TripCount && Count < TripCount) { Unrolling = Partial; @@ -758,9 +1002,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || + (UserRuntime ? 
CurrentRuntime : UP.Runtime); + // Don't unroll a runtime trip count loop with unroll full pragma. + if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { + AllowRuntime = false; + } if (Unrolling == Partial) { - bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + bool AllowPartial = PragmaEnableUnroll || + (UserAllowPartial ? CurrentAllowPartial : UP.Partial); if (!AllowPartial && !CountSetExplicitly) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); @@ -800,23 +1050,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (PragmaFullUnroll && PragmaCount == 0) { - if (TripCount && Count != TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because unrolled size is too large."); - } else if (!TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } - } else if (PragmaCount > 0 && Count != OriginalCount) { + if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " "unroll_count pragma because unrolled size is too large."); + } else if (PragmaFullUnroll && !TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma because " + "unrolled size is too large."); + } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); } } @@ -827,8 +1081,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, - &LPM, &AC)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, + TripMultiple, LI, this, &LPM, &AC)) return false; return true;
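The runtime-unrolling decision near the end of runOnLoop() now has several opt-ins and opt-outs; a compact sketch of the resulting predicate follows, with illustrative parameter names that do not appear in the patch.

  // Enable pragmas and explicit pragma counts opt in to runtime unrolling;
  // the llvm.loop.unroll.runtime.disable metadata and a full-unroll pragma
  // opt back out, matching the flow in the patch above.
  static bool allowRuntimeUnrolling(bool PragmaEnableUnroll, unsigned PragmaCount,
                                    bool UserRuntime, bool CurrentRuntime,
                                    bool TargetRuntime,
                                    bool HasRuntimeDisablePragma,
                                    bool PragmaFullUnroll) {
    bool Allow = PragmaEnableUnroll || PragmaCount > 0 ||
                 (UserRuntime ? CurrentRuntime : TargetRuntime);
    if (HasRuntimeDisablePragma || PragmaFullUnroll)
      Allow = false;
    return Allow;
  }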