X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FLoopStrengthReduce.cpp;h=ec0c9a6abd6ab345c35767a4827855e896ff154b;hb=ee98aa87434d9d49a8e4dab41d873888ac9c4805;hp=0cf7f17228f860050cd529eaa975048e6995d0fd;hpb=402d43529c18d662d1964d70dbf55bc135a8b473;p=oota-llvm.git

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 0cf7f17228f..ec0c9a6abd6 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -63,18 +63,32 @@
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Assembly/Writer.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include <algorithm>
 using namespace llvm;
 
+static cl::opt<bool> EnableNested(
+  "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
+
+static cl::opt<bool> EnableRetry(
+  "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
+
+// Temporary flag to cleanup congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+static cl::opt<bool> EnablePhiElim(
+  "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination"));
+
 namespace {
 
 /// RegSortData - This class holds data which is used to order reuse candidates.
@@ -113,7 +127,7 @@ class RegUseTracker {
 public:
   void CountRegister(const SCEV *Reg, size_t LUIdx);
   void DropRegister(const SCEV *Reg, size_t LUIdx);
-  void DropUse(size_t LUIdx);
+  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
 
   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
 
@@ -152,18 +166,27 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
 }
 
 void
-RegUseTracker::DropUse(size_t LUIdx) {
-  // Remove the use index from every register's use list.
+RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+  assert(LUIdx <= LastLUIdx);
+
+  // Update RegUses. The data structure is not optimized for this purpose;
+  // we must iterate through it and update each of the bit vectors.
   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
-       I != E; ++I)
-    I->second.UsedByIndices.reset(LUIdx);
+       I != E; ++I) {
+    SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+    if (LUIdx < UsedByIndices.size())
+      UsedByIndices[LUIdx] =
+        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
+    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+  }
 }
 
 bool
 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
-  if (!RegUsesMap.count(Reg)) return false;
-  const SmallBitVector &UsedByIndices =
-    RegUsesMap.find(Reg)->second.UsedByIndices;
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  if (I == RegUsesMap.end())
+    return false;
+  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
   int i = UsedByIndices.find_first();
   if (i == -1) return false;
   if ((size_t)i != LUIdx) return true;
@@ -199,13 +222,17 @@ struct Formula {
   /// when AM.Scale is not zero.
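
The new SwapAndDropUse mirrors, inside every register's use list, the swap-and-pop deletion that DeleteUse performs on the Uses array: the last use's bit moves into the vacated slot, then the vector shrinks by one. A minimal standalone sketch of the same update, with std::vector<bool> as a hypothetical stand-in for SmallBitVector:

#include <algorithm>
#include <cstddef>
#include <vector>

// UsedBy[i] is true when use number i references this register.
static void swapAndDropUse(std::vector<bool> &UsedBy,
                           size_t LUIdx, size_t LastLUIdx) {
  // Move the last use's bit into the vacated slot; bits past the end of a
  // short vector are implicitly zero.
  if (LUIdx < UsedBy.size())
    UsedBy[LUIdx] = LastLUIdx < UsedBy.size() ? UsedBy[LastLUIdx] : false;
  // Truncate away the slot that belonged to the (now moved) last use.
  UsedBy.resize(std::min(UsedBy.size(), LastLUIdx));
}
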
const SCEV *ScaledReg; - Formula() : ScaledReg(0) {} + /// UnfoldedOffset - An additional constant offset which added near the + /// use. This requires a temporary register, but the offset itself can + /// live in an add immediate field rather than a register. + int64_t UnfoldedOffset; - void InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + Formula() : ScaledReg(0), UnfoldedOffset(0) {} + + void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); unsigned getNumRegs() const; - const Type *getType() const; + Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -223,9 +250,9 @@ struct Formula { static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl &Good, SmallVectorImpl &Bad, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE) { // Collect expressions which properly dominate the loop header. - if (S->properlyDominates(L->getHeader(), &DT)) { + if (SE.properlyDominates(S, L->getHeader())) { Good.push_back(S); return; } @@ -234,18 +261,19 @@ static void DoInitialMatch(const SCEV *S, Loop *L, if (const SCEVAddExpr *Add = dyn_cast(S)) { for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) - DoInitialMatch(*I, L, Good, Bad, SE, DT); + DoInitialMatch(*I, L, Good, Bad, SE); return; } // Look at addrec operands. if (const SCEVAddRecExpr *AR = dyn_cast(S)) if (!AR->getStart()->isZero()) { - DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); + DoInitialMatch(AR->getStart(), L, Good, Bad, SE); DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), - AR->getLoop()), - L, Good, Bad, SE, DT); + // FIXME: AR->getNoWrapFlags() + AR->getLoop(), SCEV::FlagAnyWrap), + L, Good, Bad, SE); return; } @@ -257,7 +285,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L, SmallVector MyGood; SmallVector MyBad; - DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT); + DoInitialMatch(NewMul, L, MyGood, MyBad, SE); const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( SE.getEffectiveSCEVType(NewMul->getType()))); for (SmallVectorImpl::const_iterator I = MyGood.begin(), @@ -277,11 +305,10 @@ static void DoInitialMatch(const SCEV *S, Loop *L, /// InitialMatch - Incorporate loop-variant parts of S into this Formula, /// attempting to keep all loop-invariant and loop-computable values in a /// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { +void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector Good; SmallVector Bad; - DoInitialMatch(S, L, Good, Bad, SE, DT); + DoInitialMatch(S, L, Good, Bad, SE); if (!Good.empty()) { const SCEV *Sum = SE.getAddExpr(Good); if (!Sum->isZero()) @@ -305,7 +332,7 @@ unsigned Formula::getNumRegs() const { /// getType - Return the type of this formula, if it has one, or null /// otherwise. This type is meaningless except for the bit size. -const Type *Formula::getType() const { +Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : AM.BaseGV ? AM.BaseGV->getType() : @@ -370,6 +397,10 @@ void Formula::print(raw_ostream &OS) const { OS << ""; OS << ')'; } + if (UnfoldedOffset != 0) { + if (!First) OS << " + "; else First = false; + OS << "imm(" << UnfoldedOffset << ')'; + } } void Formula::dump() const { @@ -379,7 +410,7 @@ void Formula::dump() const { /// isAddRecSExtable - Return true if the given addrec can be sign-extended /// without changing its value. 
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
-  const Type *WideTy =
+  Type *WideTy =
     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
 }
 
@@ -387,17 +418,18 @@ static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 /// isAddSExtable - Return true if the given add can be sign-extended
 /// without changing its value.
 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
-  const Type *WideTy =
+  Type *WideTy =
     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 }
 
-/// isMulSExtable - Return true if the given add can be sign-extended
+/// isMulSExtable - Return true if the given mul can be sign-extended
 /// without changing its value.
-static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
-  const Type *WideTy =
-    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
-  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 }
 
 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
@@ -413,33 +445,45 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
   if (LHS == RHS)
     return SE.getConstant(LHS->getType(), 1);
 
-  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
-  // folding.
-  if (RHS->isAllOnesValue())
-    return SE.getMulExpr(LHS, RHS);
+  // Handle a few RHS special cases.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &RA = RC->getValue()->getValue();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (RA.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // Handle x /s 1 as x.
+    if (RA == 1)
+      return LHS;
+  }
 
   // Check for a division of a constant by a constant.
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
-    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
     if (!RC)
       return 0;
-    if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
+    const APInt &LA = C->getValue()->getValue();
+    const APInt &RA = RC->getValue()->getValue();
+    if (LA.srem(RA) != 0)
       return 0;
-    return SE.getConstant(C->getValue()->getValue()
-               .sdiv(RC->getValue()->getValue()));
+    return SE.getConstant(LA.sdiv(RA));
   }
 
   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
-      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
-                                       IgnoreSignificantBits);
-      if (!Start) return 0;
       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                       IgnoreSignificantBits);
       if (!Step) return 0;
-      return SE.getAddRecExpr(Start, Step, AR->getLoop());
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return 0;
+      // FlagNW is independent of the start value, step direction, and is
+      // preserved with smaller magnitude steps.
+      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
     }
+    return 0;
   }
 
   // Distribute the sdiv over add operands, if the add doesn't overflow.
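
getExactSDiv only succeeds when the division leaves no remainder; anything inexact, or any overflowing corner case, is reported as failure rather than approximated. The constant/constant case reduces to the following check, shown here as a hypothetical standalone helper on plain 64-bit integers:

#include <cstdint>

static bool exactSDiv(int64_t LHS, int64_t RHS, int64_t &Result) {
  if (RHS == 0)
    return false;                      // x /s 0: undefined, give up
  if (LHS == INT64_MIN && RHS == -1)
    return false;                      // the one overflowing quotient
  if (LHS % RHS != 0)
    return false;                      // not exact: no result
  Result = LHS / RHS;
  return true;
}
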
@@ -455,10 +499,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } return SE.getAddExpr(Ops); } + return 0; } // Check for a multiply operand that we can pull RHS out of. - if (const SCEVMulExpr *Mul = dyn_cast(LHS)) + if (const SCEVMulExpr *Mul = dyn_cast(LHS)) { if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { SmallVector Ops; bool Found = false; @@ -475,6 +520,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } return Found ? SE.getMulExpr(Ops) : 0; } + return 0; + } // Otherwise we don't know. return 0; @@ -492,12 +539,16 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { } else if (const SCEVAddExpr *Add = dyn_cast(S)) { SmallVector NewOps(Add->op_begin(), Add->op_end()); int64_t Result = ExtractImmediate(NewOps.front(), SE); - S = SE.getAddExpr(NewOps); + if (Result != 0) + S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { SmallVector NewOps(AR->op_begin(), AR->op_end()); int64_t Result = ExtractImmediate(NewOps.front(), SE); - S = SE.getAddRecExpr(NewOps, AR->getLoop()); + if (Result != 0) + S = SE.getAddRecExpr(NewOps, AR->getLoop(), + // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); return Result; } return 0; @@ -515,12 +566,16 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { } else if (const SCEVAddExpr *Add = dyn_cast(S)) { SmallVector NewOps(Add->op_begin(), Add->op_end()); GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); - S = SE.getAddExpr(NewOps); + if (Result) + S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { SmallVector NewOps(AR->op_begin(), AR->op_end()); GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); - S = SE.getAddRecExpr(NewOps, AR->getLoop()); + if (Result) + S = SE.getAddRecExpr(NewOps, AR->getLoop(), + // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); return Result; } return 0; @@ -539,14 +594,11 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::prefetch: - case Intrinsic::x86_sse2_loadu_dq: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse_loadu_ps: case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - if (II->getOperand(1) == OperandVal) + if (II->getArgOperand(0) == OperandVal) isAddress = true; break; } @@ -555,8 +607,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { } /// getAccessType - Return the type of the memory being accessed. -static const Type *getAccessType(const Instruction *Inst) { - const Type *AccessTy = Inst->getType(); +static Type *getAccessType(const Instruction *Inst) { + Type *AccessTy = Inst->getType(); if (const StoreInst *SI = dyn_cast(Inst)) AccessTy = SI->getOperand(0)->getType(); else if (const IntrinsicInst *II = dyn_cast(Inst)) { @@ -568,20 +620,33 @@ static const Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getOperand(1)->getType(); + AccessTy = II->getArgOperand(0)->getType(); break; } } // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. 
-  if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy))
+  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
                                 PTy->getAddressSpace());
 
   return AccessTy;
 }
 
+/// isExistingPhi - Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (SE.isSCEVable(PN->getType()) &&
+        (SE.getEffectiveSCEVType(PN->getType()) ==
+         SE.getEffectiveSCEVType(AR->getType())) &&
+        SE.getSCEV(PN) == AR)
+      return true;
+  }
+  return false;
+}
+
 /// DeleteTriviallyDeadInstructions - If any of the instructions in the
 /// specified set are trivially dead, delete them and see if this makes any of
 /// their operands subsequently dead.
@@ -590,7 +655,7 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
   bool Changed = false;
 
   while (!DeadInsts.empty()) {
-    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+    Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
     if (I == 0 || !isInstructionTriviallyDead(I))
       continue;
 
@@ -627,18 +692,32 @@ public:
     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
       ImmCost(0), SetupCost(0) {}
 
-  unsigned getNumRegs() const { return NumRegs; }
-
   bool operator<(const Cost &Other) const;
 
   void Loose();
 
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+             | ImmCost | SetupCost) != ~0u)
+        || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+             & ImmCost & SetupCost) == ~0u);
+  }
+#endif
+
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return NumRegs == ~0u;
+  }
+
   void RateFormula(const Formula &F,
                    SmallPtrSet<const SCEV *, 16> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
                    const SmallVectorImpl<int64_t> &Offsets,
-                   ScalarEvolution &SE, DominatorTree &DT);
+                   ScalarEvolution &SE, DominatorTree &DT,
+                   SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 
   void print(raw_ostream &OS) const;
   void dump() const;
@@ -651,7 +730,8 @@ private:
   void RatePrimaryRegister(const SCEV *Reg,
                            SmallPtrSet<const SCEV *, 16> &Regs,
                            const Loop *L,
-                           ScalarEvolution &SE, DominatorTree &DT);
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSet<const SCEV *, 16> *LoserRegs);
 };
 
 }
@@ -665,34 +745,43 @@ void Cost::RateRegister(const SCEV *Reg,
     if (AR->getLoop() == L)
       AddRecCost += 1; /// TODO: This should be a function of the stride.
 
-    // If this is an addrec for a loop that's already been visited by LSR,
-    // don't second-guess its addrec phi nodes. LSR isn't currently smart
-    // enough to reason about more than one loop at a time. Consider these
-    // registers free and leave them alone.
-    else if (L->contains(AR->getLoop()) ||
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has either already run on inner loops or will not
+    // run on other loops, and it cannot be expected to change sibling loops.
+    // If the AddRec exists, consider its register free and leave it alone.
+    // Otherwise, do not consider this formula at all.
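
Loose() saturates every metric to ~0u, and isValid() enforces the all-or-nothing convention: either the OR of the metrics still has a zero bit (nothing has lost) or their AND is all-ones (everything has). That invariant is what lets isLoser() classify a whole cost by testing NumRegs alone. A reduced sketch of the same convention, with only two hypothetical metrics:

#include <cassert>

struct MiniCost {
  unsigned NumRegs, ImmCost;
  MiniCost() : NumRegs(0), ImmCost(0) {}

  void Loose() { NumRegs = ~0u; ImmCost = ~0u; } // become an instant loser

  // Once any metric loses, all must; mixed states indicate a bug.
  bool isValid() const {
    return ((NumRegs | ImmCost) != ~0u) || ((NumRegs & ImmCost) == ~0u);
  }

  bool isLoser() const {
    assert(isValid() && "invalid cost");
    return NumRegs == ~0u;             // one compare stands for all metrics
  }
};
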
+ else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { - for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast(I); ++I) - if (SE.isSCEVable(PN->getType()) && - (SE.getEffectiveSCEVType(PN->getType()) == - SE.getEffectiveSCEVType(AR->getType())) && - SE.getSCEV(PN) == AR) - return; + if (isExistingPhi(AR, SE)) + return; + // For !EnableNested, never rewrite IVs in other loops. + if (!EnableNested) { + Loose(); + return; + } // If this isn't one of the addrecs that the loop already has, it // would require a costly new phi and add. TODO: This isn't // precisely modeled right now. ++NumBaseAdds; - if (!Regs.count(AR->getStart())) + if (!Regs.count(AR->getStart())) { RateRegister(AR->getStart(), Regs, L, SE, DT); + if (isLoser()) + return; + } } // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. - if (!AR->isAffine() || !isa(AR->getOperand(1))) - if (!Regs.count(AR->getStart())) + if (!AR->isAffine() || !isa(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } } ++NumRegs; @@ -704,16 +793,28 @@ void Cost::RateRegister(const SCEV *Reg, (isa(cast(Reg)->getStart()) || isa(cast(Reg)->getStart())))) ++SetupCost; + + NumIVMuls += isa(Reg) && + SE.hasComputableLoopEvolution(Reg, L); } /// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. +/// before, rate it. Optional LoserRegs provides a way to declare any formula +/// that refers to one of those regs an instant loser. void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSet &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { - if (Regs.insert(Reg)) + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs) { + if (LoserRegs && LoserRegs->count(Reg)) { + Loose(); + return; + } + if (Regs.insert(Reg)) { RateRegister(Reg, Regs, L, SE, DT); + if (isLoser()) + LoserRegs->insert(Reg); + } } void Cost::RateFormula(const Formula &F, @@ -721,14 +822,17 @@ void Cost::RateFormula(const Formula &F, const DenseSet &VisitedRegs, const Loop *L, const SmallVectorImpl &Offsets, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { Loose(); return; } - RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); + if (isLoser()) + return; } for (SmallVectorImpl::const_iterator I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { @@ -737,14 +841,15 @@ void Cost::RateFormula(const Formula &F, Loose(); return; } - RatePrimaryRegister(BaseReg, Regs, L, SE, DT); - - NumIVMuls += isa(BaseReg) && - BaseReg->hasComputableLoopEvolution(L); + RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); + if (isLoser()) + return; } - if (F.BaseRegs.size() > 1) - NumBaseAdds += F.BaseRegs.size() - 1; + // Determine how many (unfolded) adds we'll need inside the loop. + size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + if (NumBaseParts > 1) + NumBaseAdds += NumBaseParts - 1; // Tally up the non-zero immediates. 
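
The LoserRegs parameter threaded through RatePrimaryRegister above is a memo: once rating a register declares it a loser, every later formula touching the same register is rejected without re-rating it. A standalone sketch of that short-circuit, with a caller-supplied predicate standing in for the expensive RateRegister walk (names hypothetical):

#include <functional>
#include <set>
#include <string>

static bool ratePrimaryRegister(
    const std::string &Reg, std::set<std::string> &LoserRegs,
    const std::function<bool(const std::string &)> &RatesAsLoser) {
  if (LoserRegs.count(Reg))
    return true;                 // known loser: fail fast, skip the rating
  if (RatesAsLoser(Reg)) {       // the expensive rating runs at most once
    LoserRegs.insert(Reg);       // remember the verdict for later formulae
    return true;
  }
  return false;
}
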
for (SmallVectorImpl::const_iterator I = Offsets.begin(), @@ -756,9 +861,10 @@ void Cost::RateFormula(const Formula &F, else if (Offset != 0) ImmCost += APInt(64, Offset, true).getMinSignedBits(); } + assert(isValid() && "invalid cost"); } -/// Loose - Set this cost to a loosing value. +/// Loose - Set this cost to a losing value. void Cost::Loose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -941,7 +1047,7 @@ public: }; KindType Kind; - const Type *AccessTy; + Type *AccessTy; SmallVector Offsets; int64_t MinOffset; @@ -952,6 +1058,12 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// WidestFixupType - This records the widest use type for any fixup using + /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different + /// max fixup widths to be equivalent, because the narrower one may be relying + /// on the implicit truncation to truncate away bogus bits. + Type *WidestFixupType; + /// Formulae - A list of ways to build a value that can satisfy this user. /// After the list is populated, one of these is selected heuristically and /// used to formulate a replacement for OperandValToReplace in UserInst. @@ -960,22 +1072,23 @@ public: /// Regs - The set of register candidates used by all formulae in this LSRUse. SmallPtrSet Regs; - LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T), + LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true) {} + AllFixupsOutsideLoop(true), + WidestFixupType(0) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); void DeleteFormula(Formula &F); void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); - void check() const; - void print(raw_ostream &OS) const; void dump() const; }; +} + /// HasFormula - Test whether this use as a formula which has the same /// registers as the given formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { @@ -1010,7 +1123,6 @@ bool LSRUse::InsertFormula(const Formula &F) { Formulae.push_back(F); // Record registers now being used by this use. - if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); return true; @@ -1021,7 +1133,6 @@ void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); - assert(!Formulae.empty() && "LSRUse has no formulae left!"); } /// RecomputeRegs - Recompute the Regs field, and update RegUses. @@ -1061,13 +1172,16 @@ void LSRUse::print(raw_ostream &OS) const { for (SmallVectorImpl::const_iterator I = Offsets.begin(), E = Offsets.end(); I != E; ++I) { OS << *I; - if (next(I) != E) + if (llvm::next(I) != E) OS << ','; } OS << '}'; if (AllFixupsOutsideLoop) OS << ", all-fixups-outside-loop"; + + if (WidestFixupType) + OS << ", widest fixup type: " << *WidestFixupType; } void LSRUse::dump() const { @@ -1078,7 +1192,7 @@ void LSRUse::dump() const { /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. static bool isLegalUse(const TargetLowering::AddrMode &AM, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { switch (Kind) { case LSRUse::Address: @@ -1107,7 +1221,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. 
if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); return false; } @@ -1127,7 +1241,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, static bool isLegalUse(TargetLowering::AddrMode AM, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != @@ -1149,7 +1263,7 @@ static bool isLegalUse(TargetLowering::AddrMode AM, static bool isAlwaysFoldable(int64_t BaseOffs, GlobalValue *BaseGV, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Fast-path: zero is always foldable. if (BaseOffs == 0 && !BaseGV) return true; @@ -1175,7 +1289,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, static bool isAlwaysFoldable(const SCEV *S, int64_t MinOffset, int64_t MaxOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI, ScalarEvolution &SE) { // Fast-path: zero is always foldable. @@ -1203,29 +1317,29 @@ static bool isAlwaysFoldable(const SCEV *S, return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); } -/// FormulaSorter - This class implements an ordering for formulae which sorts -/// the by their standalone cost. -class FormulaSorter { - /// These two sets are kept empty, so that we compute standalone costs. - DenseSet VisitedRegs; - SmallPtrSet Regs; - Loop *L; - LSRUse *LU; - ScalarEvolution &SE; - DominatorTree &DT; +namespace { -public: - FormulaSorter(Loop *l, LSRUse &lu, ScalarEvolution &se, DominatorTree &dt) - : L(l), LU(&lu), SE(se), DT(dt) {} - - bool operator()(const Formula &A, const Formula &B) { - Cost CostA; - CostA.RateFormula(A, Regs, VisitedRegs, L, LU->Offsets, SE, DT); - Regs.clear(); - Cost CostB; - CostB.RateFormula(B, Regs, VisitedRegs, L, LU->Offsets, SE, DT); - Regs.clear(); - return CostA < CostB; +/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding +/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. +struct UseMapDenseMapInfo { + static std::pair getEmptyKey() { + return std::make_pair(reinterpret_cast(-1), LSRUse::Basic); + } + + static std::pair getTombstoneKey() { + return std::make_pair(reinterpret_cast(-2), LSRUse::Basic); + } + + static unsigned + getHashValue(const std::pair &V) { + unsigned Result = DenseMapInfo::getHashValue(V.first); + Result ^= DenseMapInfo::getHashValue(unsigned(V.second)); + return Result; + } + + static bool isEqual(const std::pair &LHS, + const std::pair &RHS) { + return LHS == RHS; } }; @@ -1250,7 +1364,7 @@ class LSRInstance { SmallSetVector Factors; /// Types - Interesting use types, to facilitate truncation reuse. - SmallSetVector Types; + SmallSetVector Types; /// Fixups - The list of operands which are to be replaced. SmallVector Fixups; @@ -1275,21 +1389,22 @@ class LSRInstance { } // Support for sharing of LSRUses between LSRFixups. 
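
Keying UseMap on the (expression, kind) pair means two fixups with the same base expression but different use kinds no longer collide in one LSRUse. DenseMap requires the explicit empty/tombstone sentinel keys and combined hash that UseMapDenseMapInfo supplies above; the same idea with the standard library, using hypothetical stand-in types for const SCEV * and LSRUse::KindType:

#include <cstddef>
#include <functional>
#include <unordered_map>
#include <utility>

struct PairKeyHash {
  std::size_t operator()(const std::pair<const void *, int> &V) const {
    // Combine the two component hashes, as UseMapDenseMapInfo does.
    return std::hash<const void *>()(V.first) ^ std::hash<int>()(V.second);
  }
};

typedef std::unordered_map<std::pair<const void *, int>, std::size_t,
                           PairKeyHash> UseMap;
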
- typedef DenseMap UseMapTy; + typedef DenseMap, + size_t, + UseMapDenseMapInfo> UseMapTy; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy); + LSRUse::KindType Kind, Type *AccessTy); std::pair getUse(const SCEV *&Expr, LSRUse::KindType Kind, - const Type *AccessTy); + Type *AccessTy); - void DeleteUse(LSRUse &LU); + void DeleteUse(LSRUse &LU, size_t LUIdx); LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); -public: void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void CountRegisters(const Formula &F, size_t LUIdx); @@ -1311,6 +1426,10 @@ public: void FilterOutUndesirableDedicatedRegisters(); size_t EstimateSearchSpaceComplexity() const; + void NarrowSearchSpaceByDetectingSupersets(); + void NarrowSearchSpaceByCollapsingUnrolledCode(); + void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + void NarrowSearchSpaceByPickingWinnerRegs(); void NarrowSearchSpaceUsingHeuristics(); void SolveRecurse(SmallVectorImpl &Solution, @@ -1346,6 +1465,7 @@ public: void ImplementSolution(const SmallVectorImpl &Solution, Pass *P); +public: LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); bool getChanged() const { return Changed; } @@ -1371,7 +1491,8 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - const Type *DestTy = NULL; + Type *DestTy = NULL; + bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV to eliminate this cast. @@ -1385,10 +1506,14 @@ void LSRInstance::OptimizeShadowIV() { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast(CandidateUI->getUser())) + if (UIToFPInst *UCast = dyn_cast(CandidateUI->getUser())) { + IsSigned = false; DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast(CandidateUI->getUser())) + } + else if (SIToFPInst *SCast = dyn_cast(CandidateUI->getUser())) { + IsSigned = true; DestTy = SCast->getDestTy(); + } if (!DestTy) continue; if (TLI) { @@ -1402,7 +1527,7 @@ void LSRInstance::OptimizeShadowIV() { if (!PH) continue; if (PH->getNumIncomingValues() != 2) continue; - const Type *SrcTy = PH->getType(); + Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) @@ -1419,7 +1544,9 @@ void LSRInstance::OptimizeShadowIV() { ConstantInt *Init = dyn_cast(PH->getIncomingValue(Entry)); if (!Init) continue; - Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); BinaryOperator *Incr = dyn_cast(PH->getIncomingValue(Latch)); @@ -1444,7 +1571,7 @@ void LSRInstance::OptimizeShadowIV() { if (!C->getValue().isStrictlyPositive()) continue; /* Add new PHINode. */ - PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH); + PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); /* create new increment. '++d' in above example. */ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); @@ -1544,7 +1671,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); // Add one to the backedge-taken count to get the trip count. 
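
OptimizeShadowIV's transformation, spelled out in C++ for the example given in its comment above: the per-iteration int-to-float cast disappears in favor of a parallel floating-point induction variable. The IsSigned flag added in this diff selects getSExtValue over getZExtValue so a signed initial value converts correctly.

void foo(double);                      // consumer from the source comment

// Before: one int->float conversion on every iteration.
void before(unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    foo((double)i);
}

// After: the "shadow" IV d mirrors i in floating point; no cast in the loop.
void after(unsigned n) {
  double d = 0.0;                      // FP image of i's initial value
  for (unsigned i = 0; i < n; ++i, d += 1.0)
    foo(d);
}
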
- const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); + const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); if (IterationCount != SE.getSCEV(Sel)) return Cond; // Check for a max calculation that matches the pattern. There's no check @@ -1613,8 +1740,11 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { NewRHS = Sel->getOperand(1); else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) NewRHS = Sel->getOperand(2); + else if (const SCEVUnknown *SU = dyn_cast(MaxRHS)) + NewRHS = SU->getValue(); else - llvm_unreachable("Max doesn't match expected pattern!"); + // Max doesn't match expected pattern. + return Cond; // Determine the new comparison opcode. It may be signed or unsigned, // and the original comparison may be either equality or inequality. @@ -1705,22 +1835,22 @@ LSRInstance::OptimizeLoopTermCond() { } if (const SCEVConstant *D = dyn_cast_or_null(getExactSDiv(B, A, SE))) { + const ConstantInt *C = D->getValue(); // Stride of one or negative one can have reuse with non-addresses. - if (D->getValue()->isOne() || - D->getValue()->isAllOnesValue()) + if (C->isOne() || C->isAllOnesValue()) goto decline_post_inc; // Avoid weird situations. - if (D->getValue()->getValue().getMinSignedBits() >= 64 || - D->getValue()->getValue().isMinSignedValue()) + if (C->getValue().getMinSignedBits() >= 64 || + C->getValue().isMinSignedValue()) goto decline_post_inc; // Without TLI, assume that any stride might be valid, and so any // use might be shared. if (!TLI) goto decline_post_inc; // Check for possible scaled-address reuse. - const Type *AccessTy = getAccessType(UI->getUser()); + Type *AccessTy = getAccessType(UI->getUser()); TargetLowering::AddrMode AM; - AM.Scale = D->getValue()->getSExtValue(); + AM.Scale = C->getSExtValue(); if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; AM.Scale = -AM.Scale; @@ -1777,12 +1907,15 @@ LSRInstance::OptimizeLoopTermCond() { } } +/// reconcileNewOffset - Determine if the given use can accommodate a fixup +/// at the given offset and other details. If so, update the use and +/// return true. bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - const Type *NewAccessTy = AccessTy; + Type *NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -1802,6 +1935,8 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, NewMaxOffset = NewOffset; } // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) NewAccessTy = Type::getVoidTy(AccessTy->getContext()); @@ -1819,7 +1954,7 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, /// Either reuse an existing use or create a new one, as needed. 
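
reconcileNewOffset lets one LSRUse serve fixups at several constant offsets by tracking the [MinOffset, MaxOffset] spread and admitting a new fixup only while the widened range stays legal. A sketch under that reading, with a caller-supplied legality callback standing in for the isLegalUse queries against TargetLowering:

#include <algorithm>
#include <cstdint>
#include <functional>

struct UseRange {
  int64_t MinOffset, MaxOffset;

  bool reconcile(int64_t NewOffset,
                 const std::function<bool(int64_t, int64_t)> &IsLegalRange) {
    int64_t NewMin = std::min(MinOffset, NewOffset);
    int64_t NewMax = std::max(MaxOffset, NewOffset);
    if (!IsLegalRange(NewMin, NewMax))
      return false;              // caller must create a separate use instead
    MinOffset = NewMin;          // fold the fixup into this use
    MaxOffset = NewMax;
    return true;
  }
};
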
std::pair LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -1830,7 +1965,7 @@ LSRInstance::getUse(const SCEV *&Expr, } std::pair P = - UseMap.insert(std::make_pair(Expr, 0)); + UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); if (!P.second) { // A use already existed with this base. size_t LUIdx = P.first->second; @@ -1857,10 +1992,13 @@ LSRInstance::getUse(const SCEV *&Expr, } /// DeleteUse - Delete the given use from the Uses list. -void LSRInstance::DeleteUse(LSRUse &LU) { +void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); + + // Update RegUses. + RegUses.SwapAndDropUse(LUIdx, Uses.size()); } /// FindUseWithFormula - Look for a use distinct from OrigLU which is has @@ -1868,32 +2006,42 @@ void LSRInstance::DeleteUse(LSRUse &LU) { LSRUse * LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, const LSRUse &OrigLU) { - // Search all uses for the formula. This could be more clever. Ignore - // ICmpZero uses because they may contain formulae generated by - // GenerateICmpZeroScales, in which case adding fixup offsets may - // be invalid. + // Search all uses for the formula. This could be more clever. for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; + // Check whether this use is close enough to OrigLU, to see whether it's + // worthwhile looking through its formulae. + // Ignore ICmpZero uses because they may contain formulae generated by + // GenerateICmpZeroScales, in which case adding fixup offsets may + // be invalid. if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero && LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy && + LU.WidestFixupType == OrigLU.WidestFixupType && LU.HasFormulaWithSameRegs(OrigF)) { + // Scan through this use's formulae. for (SmallVectorImpl::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; + // Check to see if this formula has the same registers and symbols + // as OrigF. if (F.BaseRegs == OrigF.BaseRegs && F.ScaledReg == OrigF.ScaledReg && F.AM.BaseGV == OrigF.AM.BaseGV && F.AM.Scale == OrigF.AM.Scale && - LU.Kind) { + F.UnfoldedOffset == OrigF.UnfoldedOffset) { if (F.AM.BaseOffs == 0) return &LU; + // This is the formula where all the registers and symbols matched; + // there aren't going to be any others. Since we declined it, we + // can skip the rest of the formulae and procede to the next LSRUse. break; } } } } + // Nothing looked good. 
return 0; } @@ -1913,10 +2061,11 @@ void LSRInstance::CollectInterestingTypesAndFactors() { do { const SCEV *S = Worklist.pop_back_val(); if (const SCEVAddRecExpr *AR = dyn_cast(S)) { - Strides.insert(AR->getStepRecurrence(SE)); + if (EnableNested || AR->getLoop() == L) + Strides.insert(AR->getStepRecurrence(SE)); Worklist.push_back(AR->getStart()); } else if (const SCEVAddExpr *Add = dyn_cast(S)) { - Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end()); + Worklist.append(Add->op_begin(), Add->op_end()); } } while (!Worklist.empty()); } @@ -1925,7 +2074,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { for (SmallSetVector::const_iterator I = Strides.begin(), E = Strides.end(); I != E; ++I) for (SmallSetVector::const_iterator NewStrideIter = - next(I); NewStrideIter != E; ++NewStrideIter) { + llvm::next(I); NewStrideIter != E; ++NewStrideIter) { const SCEV *OldStride = *I; const SCEV *NewStride = *NewStrideIter; @@ -1968,7 +2117,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - const Type *AccessTy = 0; + Type *AccessTy = 0; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -1996,7 +2145,11 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (N->isLoopInvariant(L)) { + if (SE.isLoopInvariant(N, L)) { + // S is normalized, so normalize N before folding it into S + // to keep the result normalized. + N = TransformForPostIncUse(Normalize, N, CI, 0, + LF.PostIncLoops, SE, DT); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); } @@ -2015,6 +2168,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); // If this is the first use of this LSRUse, give it a formula. if (LU.Formulae.empty()) { @@ -2026,14 +2183,19 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { DEBUG(print_fixups(dbgs())); } +/// InsertInitialFormula - Insert a formula for the given expression into +/// the given use, separating out loop-variant portions from loop-invariant +/// and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; - F.InitialMatch(S, L, SE, DT); + F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } +/// InsertSupplementalFormula - Insert a simple single-register formula for +/// the given expression into the given use. 
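
CollectInterestingTypesAndFactors pairs up the strides it found and, whenever one stride exactly divides another, records the quotient as a candidate scale factor for later formula generation. The pairwise scan, reduced to plain integers with overflow corner cases elided (getExactSDiv performs the real SCEV-level division):

#include <cstdint>
#include <set>
#include <vector>

static void collectFactors(const std::vector<int64_t> &Strides,
                           std::set<int64_t> &Factors) {
  for (size_t i = 0, e = Strides.size(); i != e; ++i)
    for (size_t j = i + 1; j != e; ++j) {
      int64_t A = Strides[i], B = Strides[j];
      if (A != 0 && B % A == 0)
        Factors.insert(B / A);   // B is an exact multiple of A
      if (B != 0 && A % B == 0)
        Factors.insert(A / B);   // A is an exact multiple of B
    }
}
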
void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { @@ -2078,7 +2240,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { const SCEV *S = Worklist.pop_back_val(); if (const SCEVNAryExpr *N = dyn_cast(S)) - Worklist.insert(Worklist.end(), N->op_begin(), N->op_end()); + Worklist.append(N->op_begin(), N->op_end()); else if (const SCEVCastExpr *C = dyn_cast(S)) Worklist.push_back(C->getOperand()); else if (const SCEVUDivExpr *D = dyn_cast(S)) { @@ -2087,8 +2249,12 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } else if (const SCEVUnknown *U = dyn_cast(S)) { if (!Inserted.insert(U)) continue; const Value *V = U->getValue(); - if (const Instruction *Inst = dyn_cast(V)) + if (const Instruction *Inst = dyn_cast(V)) { + // Look for instructions defined outside the loop. if (L->contains(Inst)) continue; + } else if (isa(V)) + // Undef doesn't have a live range, so it doesn't matter. + continue; for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { const Instruction *UserInst = dyn_cast(*UI); @@ -2123,7 +2289,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { if (const ICmpInst *ICI = dyn_cast(UserInst)) { unsigned OtherIdx = !UI.getOperandNo(); Value *OtherOp = const_cast(ICI->getOperand(OtherIdx)); - if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L)) + if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; } @@ -2135,6 +2301,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); InsertSupplementalFormula(U, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; @@ -2147,20 +2317,24 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// separate registers. If C is non-null, multiply each subexpression by C. static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl &Ops, + const Loop *L, ScalarEvolution &SE) { if (const SCEVAddExpr *Add = dyn_cast(S)) { // Break out add operands. for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) - CollectSubexprs(*I, C, Ops, SE); + CollectSubexprs(*I, C, Ops, L, SE); return; } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { // Split a non-zero base out of an addrec. if (!AR->getStart()->isZero()) { CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), - AR->getLoop()), C, Ops, SE); - CollectSubexprs(AR->getStart(), C, Ops, SE); + AR->getLoop(), + //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap), + C, Ops, L, SE); + CollectSubexprs(AR->getStart(), C, Ops, L, SE); return; } } else if (const SCEVMulExpr *Mul = dyn_cast(S)) { @@ -2170,12 +2344,12 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, dyn_cast(Mul->getOperand(0))) { CollectSubexprs(Mul->getOperand(1), C ? cast(SE.getMulExpr(C, Op0)) : Op0, - Ops, SE); + Ops, L, SE); return; } } - // Otherwise use the value itself. + // Otherwise use the value itself, optionally with a scale applied. Ops.push_back(C ? 
SE.getMulExpr(C, S) : S); } @@ -2191,11 +2365,18 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, const SCEV *BaseReg = Base.BaseRegs[i]; SmallVector AddOps; - CollectSubexprs(BaseReg, 0, AddOps, SE); + CollectSubexprs(BaseReg, 0, AddOps, L, SE); + if (AddOps.size() == 1) continue; for (SmallVectorImpl::const_iterator J = AddOps.begin(), JE = AddOps.end(); J != JE; ++J) { + + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa(*J) && !SE.isLoopInvariant(*J, L)) + continue; + // Don't pull a constant into a register if the constant could be folded // into an immediate field. if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, @@ -2204,11 +2385,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, continue; // Collect all operands except *J. - SmallVector InnerAddOps; - for (SmallVectorImpl::const_iterator K = AddOps.begin(), - KE = AddOps.end(); K != KE; ++K) - if (K != J) - InnerAddOps.push_back(*K); + SmallVector InnerAddOps + (((const SmallVector &)AddOps).begin(), J); + InnerAddOps.append + (llvm::next(J), ((const SmallVector &)AddOps).end()); // Don't leave just a constant behind in a register if the constant could // be folded into an immediate field. @@ -2222,8 +2402,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, if (InnerSum->isZero()) continue; Formula F = Base; - F.BaseRegs[i] = InnerSum; - F.BaseRegs.push_back(*J); + + // Add the remaining pieces of the add back into the new formula. + const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); + if (TLI && InnerSumSC && + SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue(); + F.BaseRegs.erase(F.BaseRegs.begin() + i); + } else + F.BaseRegs[i] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast(*J); + if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like // it. @@ -2245,8 +2446,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl::const_iterator I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { const SCEV *BaseReg = *I; - if (BaseReg->properlyDominates(L->getHeader(), &DT) && - !BaseReg->hasComputableLoopEvolution(L)) + if (SE.properlyDominates(BaseReg, L->getHeader()) && + !SE.hasComputableLoopEvolution(BaseReg, L)) Ops.push_back(BaseReg); else F.BaseRegs.push_back(BaseReg); @@ -2289,7 +2490,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: For now, just add the min and max offset, because it usually isn't // worthwhile looking at everything inbetween. 
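
The reassociation above treats a constant piece specially: if the target can still fold the accumulated value into an add-immediate, the constant lands in the formula's new UnfoldedOffset field instead of costing a base register. The decision, as a hypothetical standalone helper:

#include <cstdint>
#include <functional>
#include <vector>

static void placeConstantPiece(
    int64_t C, int64_t &UnfoldedOffset, std::vector<int64_t> &BaseRegs,
    const std::function<bool(int64_t)> &IsLegalAddImmediate) {
  // Accumulate in uint64_t so wraparound is well defined, as the pass does.
  int64_t Folded = (int64_t)((uint64_t)UnfoldedOffset + (uint64_t)C);
  if (IsLegalAddImmediate(Folded))
    UnfoldedOffset = Folded;     // rides in an add-immediate field
  else
    BaseRegs.push_back(C);       // too big to fold: needs its own register
}
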
- SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(LU.MinOffset); if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); @@ -2303,7 +2504,14 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, LU.AccessTy, TLI)) { - F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I)); + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. + if (NewG->isZero()) { + std::swap(F.BaseRegs[i], F.BaseRegs.back()); + F.BaseRegs.pop_back(); + } else + F.BaseRegs[i] = NewG; (void)InsertFormula(LU, LUIdx, F); } @@ -2329,7 +2537,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (LU.Kind != LSRUse::ICmpZero) return; // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; if (SE.getTypeSizeInBits(IntTy) > 64) return; @@ -2342,13 +2550,12 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, for (SmallSetVector::const_iterator I = Factors.begin(), E = Factors.end(); I != E; ++I) { int64_t Factor = *I; - Formula F = Base; // Check that the multiplication doesn't overflow. - if (F.AM.BaseOffs == INT64_MIN && Factor == -1) + if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) continue; - F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; - if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs) + int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; + if (NewBaseOffs / Factor != Base.AM.BaseOffs) continue; // Check that multiplying with the use offset doesn't overflow. @@ -2359,6 +2566,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (Offset / Factor != LU.MinOffset) continue; + Formula F = Base; + F.AM.BaseOffs = NewBaseOffs; + // Check that this scale is legal. if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) continue; @@ -2382,6 +2592,15 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; } + // Check that multiplying with the unfolded offset doesn't overflow. + if (F.UnfoldedOffset != 0) { + if (F.UnfoldedOffset == INT64_MIN && Factor == -1) + continue; + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; + if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + continue; + } + // If we make it here and it's legal, add it. (void)InsertFormula(LU, LUIdx, F); next:; @@ -2392,7 +2611,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, /// scaled-offset address modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. @@ -2452,13 +2671,13 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (Base.AM.BaseGV) return; // Determine the integer type for the base formula. 
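
GenerateICmpZeroScales scales offsets by each candidate factor, and every multiply is guarded by the divide-back idiom visible above: do the arithmetic in uint64_t, then check that division recovers the original operand. Isolated into a standalone helper:

#include <cstdint>

static bool mulNoOverflow(int64_t A, int64_t Factor, int64_t &Product) {
  if (A == INT64_MIN && Factor == -1)
    return false;                        // division below couldn't check this
  int64_t P = (int64_t)((uint64_t)A * (uint64_t)Factor);
  if (Factor != 0 && P / Factor != A)
    return false;                        // the multiply wrapped around
  Product = P;
  return true;
}
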
- const Type *DstTy = Base.getType(); + Type *DstTy = Base.getType(); if (!DstTy) return; DstTy = SE.getEffectiveSCEVType(DstTy); - for (SmallSetVector::const_iterator + for (SmallSetVector::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { - const Type *SrcTy = *I; + Type *SrcTy = *I; if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { Formula F = Base; @@ -2564,7 +2783,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // other orig regs. ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), prior(Imms.end()), - Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) }; for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { ImmMapTy::const_iterator M = OtherImms[i]; @@ -2595,13 +2814,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { int64_t Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; - const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { - Formula F = LU.Formulae[L]; + const Formula &F = LU.Formulae[L]; // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offs = (uint64_t)F.AM.BaseOffs + @@ -2621,7 +2840,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast(NewF.ScaledReg)) - if (C->getValue()->getValue().isNegative() != + if (C->getValue()->isNegative() != (NewF.AM.BaseOffs < 0) && (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) .ule(abs64(NewF.AM.BaseOffs))) @@ -2638,8 +2857,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Formula NewF = F; NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) - continue; + LU.Kind, LU.AccessTy, TLI)) { + if (!TLI || + !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + continue; + NewF = F; + NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); // If the new formula has a constant in a register, and adding the @@ -2696,11 +2920,18 @@ LSRInstance::GenerateAllReuseFormulae() { } GenerateCrossUseConstantOffsets(); + + DEBUG(dbgs() << "\n" + "After generating reuse formulae:\n"; + print_uses(dbgs())); } -/// If their are multiple formulae with the same set of registers used +/// If there are multiple formulae with the same set of registers used /// by other uses, pick the best one and delete the others. 
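
GenerateCrossUseConstantOffsets cannot afford to pair every immediate ever seen for a register with every other, so it probes just three: the smallest, the largest, and the entry nearest their midpoint; the change from upper_bound to lower_bound above makes the midpoint probe land on an exact match when one exists. The sampling, sketched over a std::map sorted the same way:

#include <cstdint>
#include <map>
#include <vector>

static std::vector<int64_t> sampleImms(const std::map<int64_t, int> &Imms) {
  std::vector<int64_t> Picks;
  if (Imms.empty())
    return Picks;
  int64_t Lo = Imms.begin()->first;      // smallest immediate
  int64_t Hi = Imms.rbegin()->first;     // largest immediate
  Picks.push_back(Lo);
  Picks.push_back(Hi);
  // First entry at or above the midpoint; lower_bound hits exact matches.
  std::map<int64_t, int>::const_iterator M = Imms.lower_bound((Lo + Hi) / 2);
  if (M != Imms.end())
    Picks.push_back(M->first);
  return Picks;
}
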
void LSRInstance::FilterOutUndesirableDedicatedRegisters() { + DenseSet VisitedRegs; + SmallPtrSet Regs; + SmallPtrSet LoserRegs; #ifndef NDEBUG bool ChangedFormulae = false; #endif @@ -2713,7 +2944,6 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; - FormulaSorter Sorter(L, LU, SE, DT); DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); bool Any = false; @@ -2721,39 +2951,66 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { FIdx != NumForms; ++FIdx) { Formula &F = LU.Formulae[FIdx]; - SmallVector Key; - for (SmallVectorImpl::const_iterator J = F.BaseRegs.begin(), - JE = F.BaseRegs.end(); J != JE; ++J) { - const SCEV *Reg = *J; - if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) - Key.push_back(Reg); + // Some formulas are instant losers. For example, they may depend on + // nonexistent AddRecs from other loops. These need to be filtered + // immediately, otherwise heuristics could choose them over others leading + // to an unsatisfactory solution. Passing LoserRegs into RateFormula here + // avoids the need to recompute this information across formulae using the + // same bad AddRec. Passing LoserRegs is also essential unless we remove + // the corresponding bad register from the Regs set. + Cost CostF; + Regs.clear(); + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + &LoserRegs); + if (CostF.isLoser()) { + // During initial formula generation, undesirable formulae are generated + // by uses within other loops that have some non-trivial address mode or + // use the postinc form of the IV. LSR needs to provide these formulae + // as the basis of rediscovering the desired formula that uses an AddRec + // corresponding to the existing phi. Once all formulae have been + // generated, these initial losers may be pruned. + DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); + dbgs() << "\n"); } - if (F.ScaledReg && - RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) - Key.push_back(F.ScaledReg); - // Unstable sort by host order ok, because this is only used for - // uniquifying. - std::sort(Key.begin(), Key.end()); - - std::pair P = - BestFormulae.insert(std::make_pair(Key, FIdx)); - if (!P.second) { + else { + SmallVector Key; + for (SmallVectorImpl::const_iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) + Key.push_back(Reg); + } + if (F.ScaledReg && + RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) + Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for + // uniquifying. + std::sort(Key.begin(), Key.end()); + + std::pair P = + BestFormulae.insert(std::make_pair(Key, FIdx)); + if (P.second) + continue; + Formula &Best = LU.Formulae[P.first->second]; - if (Sorter.operator()(F, Best)) + + Cost CostBest; + Regs.clear(); + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); dbgs() << "\n" " in favor of formula "; Best.print(dbgs()); dbgs() << '\n'); + } #ifndef NDEBUG - ChangedFormulae = true; + ChangedFormulae = true; #endif - LU.DeleteFormula(F); - --FIdx; - --NumForms; - Any = true; - continue; - } + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; } // Now that we've filtered out some formulae, recompute the Regs set. 
@@ -2779,7 +3036,7 @@ static const size_t ComplexityLimit = UINT16_MAX; /// this many solutions because it prune the search space, but the pruning /// isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { - uint32_t Power = 1; + size_t Power = 1; for (SmallVectorImpl::const_iterator I = Uses.begin(), E = Uses.end(); I != E; ++I) { size_t FSize = I->Formulae.size(); @@ -2794,11 +3051,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { return Power; } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. -void LSRInstance::NarrowSearchSpaceUsingHeuristics() { +/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset +/// of the registers of another formula, it won't help reduce register +/// pressure (though it may not necessarily hurt register pressure); remove +/// it to simplify the system. +void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -2856,7 +3113,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } +} +/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers +/// for expressions like A, A+1, A+2, etc., allocate a single register for +/// them. +void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -2881,6 +3143,28 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + // Update the relocs to reference the new use. + for (SmallVectorImpl::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.AM.BaseOffs; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; + } + DEBUG(dbgs() << "New fixup has offset " + << Fixup.Offset << '\n'); + } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; + } + // Delete formulae from the new use which are no longer legal. bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { @@ -2899,22 +3183,8 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { if (Any) LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); - // Update the relocs to reference the new use. - for (SmallVectorImpl::iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - LSRFixup &Fixup = *I; - if (Fixup.LUIdx == LUIdx) { - Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.AM.BaseOffs; - DEBUG(errs() << "New fixup has offset " - << Fixup.Offset << '\n'); - } - if (Fixup.LUIdx == NumUses-1) - Fixup.LUIdx = LUIdx; - } - // Delete the old use. 
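
EstimateSearchSpaceComplexity multiplies the per-use formula counts, since the solver picks one formula per use, and saturates at ComplexityLimit so the product can neither overflow nor need comparing beyond the limit. The same computation over a plain vector of counts (assuming Limit stays small, e.g. UINT16_MAX, so the running product fits in a size_t):

#include <cstddef>
#include <vector>

static size_t estimateComplexity(const std::vector<size_t> &FormulaCounts,
                                 size_t Limit) {
  size_t Power = 1;
  for (size_t i = 0, e = FormulaCounts.size(); i != e; ++i) {
    if (FormulaCounts[i] >= Limit)
      return Limit;              // a single use already saturates the budget
    Power *= FormulaCounts[i];   // both factors < Limit, so no overflow here
    if (Power >= Limit)
      return Limit;              // clamp instead of growing without bound
  }
  return Power;
}
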
@@ -2927,7 +3197,32 @@
     DEBUG(dbgs() << "After pre-selection:\n";
           print_uses(dbgs()));
   }
+}
+
+/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
+/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                    "undesirable dedicated registers.\n");
+
+    FilterOutUndesirableDedicatedRegisters();
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
 
+/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
+/// to be profitable, and then in any use which has any reference to that
+/// register, delete all formulae which do not reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+  // With all other options exhausted, loop until the system is simple
+  // enough to handle.
   SmallPtrSet<const SCEV *, 4> Taken;
   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
     // Ok, we have too many formulae on our hands to conveniently handle.
@@ -2987,6 +3282,17 @@
   }
 }
 
+/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
+/// formulae to choose from, use some rough heuristics to prune down the number
+/// of formulae. This keeps the main solver from taking an extraordinary amount
+/// of time in some worst-case scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  NarrowSearchSpaceByDetectingSupersets();
+  NarrowSearchSpaceByCollapsingUnrolledCode();
+  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  NarrowSearchSpaceByPickingWinnerRegs();
+}
+
 /// SolveRecurse - This is the recursive solver.
 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                                Cost &SolutionCost,
@@ -3063,6 +3369,9 @@ retry:
     skip:;
   }
 
+  if (!EnableRetry && !AnySatisfiedReqRegs)
+    return;
+
   // If none of the formulae had all of the required registers, relax the
   // constraint so that we don't exclude all formulae.
   if (!AnySatisfiedReqRegs) {
@@ -3072,6 +3381,8 @@ retry:
   }
 }
 
+/// Solve - Choose one formula from each use. Return the results in the given
+/// Solution vector.
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
   Cost SolutionCost;
@@ -3084,6 +3395,10 @@
   // SolveRecurse does all the work.
   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
                CurRegs, VisitedRegs);
+  if (Solution.empty()) {
+    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+    return;
+  }
 
   // Ok, we've now made all our decisions.
   DEBUG(dbgs() << "\n"
@@ -3097,6 +3412,8 @@
           Solution[i]->print(dbgs());
           dbgs() << '\n';
         });
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
 }
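NarrowSearchSpaceUsingHeuristics now runs its four narrowing passes in sequence before SolveRecurse explores whatever remains. The solver itself is a depth-first, branch-and-bound search: pick one formula per use and abandon any branch whose partial cost already matches or exceeds the incumbent solution. A simplified sketch of that shape, with plain unsigned costs standing in for the multi-component Cost objects and with required-register tracking omitted:

#include <cstddef>
#include <limits>
#include <vector>

typedef std::vector<std::vector<unsigned> > UseCosts; // cost per formula, per use

void solveRecurse(const UseCosts &Uses, std::size_t Depth, unsigned CurCost,
                  std::vector<std::size_t> &Workspace,
                  unsigned &BestCost, std::vector<std::size_t> &Solution) {
  if (Depth == Uses.size()) {      // every use decided: record if better
    if (CurCost < BestCost) {
      BestCost = CurCost;
      Solution = Workspace;
    }
    return;
  }
  for (std::size_t F = 0; F != Uses[Depth].size(); ++F) {
    unsigned NewCost = CurCost + Uses[Depth][F];
    if (NewCost >= BestCost)
      continue;                    // prune: cannot beat the incumbent
    Workspace[Depth] = F;
    solveRecurse(Uses, Depth + 1, NewCost, Workspace, BestCost, Solution);
  }
}

std::vector<std::size_t> solve(const UseCosts &Uses) {
  std::vector<std::size_t> Solution, Workspace(Uses.size());
  unsigned BestCost = std::numeric_limits<unsigned>::max();
  solveRecurse(Uses, 0, 0, Workspace, BestCost, Solution);
  return Solution;                 // empty when no satisfactory solution exists
}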
 
 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
@@ -3113,7 +3430,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
   BasicBlock *IDom;
   for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
-    assert(Rung && "Block has no DomTreeNode!");
+    if (!Rung) return IP;
     Rung = Rung->getIDom();
     if (!Rung) return IP;
     IDom = Rung->getBlock();
@@ -3200,12 +3517,17 @@
   // Don't insert instructions before PHI nodes.
   while (isa<PHINode>(IP)) ++IP;
 
+  // Ignore landingpad instructions.
+  while (isa<LandingPadInst>(IP)) ++IP;
+
   // Ignore debug intrinsics.
   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
 
   return IP;
 }
 
+/// Expand - Emit instructions for the leading candidate expression for this
+/// LSRUse (this is called "expanding").
 Value *LSRInstance::Expand(const LSRFixup &LF,
                            const Formula &F,
                            BasicBlock::iterator IP,
@@ -3222,9 +3544,9 @@
   Rewriter.setPostInc(LF.PostIncLoops);
 
   // This is the type that the user actually needs.
-  const Type *OpTy = LF.OperandValToReplace->getType();
+  Type *OpTy = LF.OperandValToReplace->getType();
   // This will be the type that we'll initially expand to.
-  const Type *Ty = F.getType();
+  Type *Ty = F.getType();
   if (!Ty)
     // No type known; just expand directly to the ultimate type.
     Ty = OpTy;
@@ -3232,7 +3554,7 @@
     // Expand directly to the ultimate type if it's the right size.
     Ty = OpTy;
   // This is the type to do integer arithmetic in.
-  const Type *IntTy = SE.getEffectiveSCEVType(Ty);
+  Type *IntTy = SE.getEffectiveSCEVType(Ty);
 
   // Build up a list of operands to add together to form the full base.
   SmallVector<const SCEV *, 8> Ops;
@@ -3309,7 +3631,7 @@
     // The other interesting way of "folding" with an ICmpZero is to use a
     // negated immediate.
     if (!ICmpScaledV)
-      ICmpScaledV = ConstantInt::get(IntTy, -Offset);
+      ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
     else {
       Ops.push_back(SE.getUnknown(ICmpScaledV));
       ICmpScaledV = ConstantInt::get(IntTy, Offset);
@@ -3321,6 +3643,14 @@
     }
   }
 
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
   // Emit instructions summing all the operands.
   const SCEV *FullS = Ops.empty() ?
                       SE.getConstant(IntTy, 0) :
@@ -3384,21 +3714,33 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
       // is the canonical backedge for this loop, which complicates post-inc
       // users.
       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
-          !isa<IndirectBrInst>(BB->getTerminator()) &&
-          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
-        // Split the critical edge.
-        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
-
-        // If PN is outside of the loop and BB is in the loop, we want to
-        // move the block to be immediately before the PHI block, not
-        // immediately after BB.
-        if (L->contains(BB) && !L->contains(PN))
-          NewBB->moveBefore(PN->getParent());
-
-        // Splitting the edge can reduce the number of PHI entries we have.
-        e = PN->getNumIncomingValues();
-        BB = NewBB;
-        i = PN->getBasicBlockIndex(BB);
+          !isa<IndirectBrInst>(BB->getTerminator())) {
+        BasicBlock *Parent = PN->getParent();
+        Loop *PNLoop = LI.getLoopFor(Parent);
+        if (!PNLoop || Parent != PNLoop->getHeader()) {
+          // Split the critical edge.
+          BasicBlock *NewBB = 0;
+          if (!Parent->isLandingPad()) {
+            NewBB = SplitCriticalEdge(BB, Parent, P,
+                                      /*MergeIdenticalEdges=*/true,
+                                      /*DontDeleteUselessPhis=*/true);
+          } else {
+            SmallVector<BasicBlock*, 2> NewBBs;
+            SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
+            NewBB = NewBBs[0];
+          }
+
+          // If PN is outside of the loop and BB is in the loop, we want to
+          // move the block to be immediately before the PHI block, not
+          // immediately after BB.
+          if (L->contains(BB) && !L->contains(PN))
+            NewBB->moveBefore(PN->getParent());
+
+          // Splitting the edge can reduce the number of PHI entries we have.
+          e = PN->getNumIncomingValues();
+          BB = NewBB;
+          i = PN->getBasicBlockIndex(BB);
+        }
       }
 
       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
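One small change in the Expand hunk above deserves a note: the negated immediate is now materialized as -(uint64_t)Offset instead of -Offset. Negating INT64_MIN as a signed value is undefined behavior in C++, while unsigned negation wraps modulo 2^64; a sketch of the idiom, where negateForImmediate is a hypothetical helper:

#include <cstdint>

// Unsigned negation is defined for every input, including INT64_MIN,
// and yields the two's-complement bit pattern the IR constant needs.
uint64_t negateForImmediate(int64_t Offset) {
  return -(uint64_t)Offset;
}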
@@ -3409,7 +3751,7 @@
       Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
 
       // If this is reuse-by-noop-cast, insert the noop cast.
-      const Type *OpTy = LF.OperandValToReplace->getType();
+      Type *OpTy = LF.OperandValToReplace->getType();
       if (FullV->getType() != OpTy)
         FullV = CastInst::Create(CastInst::getCastOpcode(FullV, false,
@@ -3439,7 +3781,7 @@
   Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
 
   // If this is reuse-by-noop-cast, insert the noop cast.
-  const Type *OpTy = LF.OperandValToReplace->getType();
+  Type *OpTy = LF.OperandValToReplace->getType();
   if (FullV->getType() != OpTy) {
     Instruction *Cast =
       CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
@@ -3461,6 +3803,8 @@
   DeadInsts.push_back(LF.OperandValToReplace);
 }
 
+/// ImplementSolution - Rewrite all the fixup locations with new values,
+/// following the chosen solution.
 void
 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                                Pass *P) {
@@ -3468,8 +3812,9 @@
   // we can remove them after we are done working.
   SmallVector<WeakVH, 16> DeadInsts;
 
-  SCEVExpander Rewriter(SE);
+  SCEVExpander Rewriter(SE, "lsr");
   Rewriter.disableCanonicalMode();
+  Rewriter.enableLSRMode();
   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
 
   // Expand the new value definitions and update the users.
@@ -3510,6 +3855,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
   OptimizeShadowIV();
   OptimizeLoopTermCond();
 
+  // If loop preparation eliminates all interesting IV users, bail.
+  if (IU.empty()) return;
+
+  // Skip nested loops until we can model them better with formulae.
+  if (!EnableNested && !L->empty()) {
+
+    if (EnablePhiElim) {
+      // Remove any extra phis created by processing inner loops.
+      SmallVector<WeakVH, 16> DeadInsts;
+      SCEVExpander Rewriter(SE, "lsr");
+      Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, TLI);
+      Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts);
+    }
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return;
+  }
+
   // Start collecting data and preparing for the solver.
   CollectInterestingTypesAndFactors();
   CollectFixupsAndInitialFormulae();
@@ -3522,22 +3884,20 @@
   // to formulate the values needed for the uses.
   GenerateAllReuseFormulae();
 
-  DEBUG(dbgs() << "\n"
-               "After generating reuse formulae:\n";
-        print_uses(dbgs()));
-
   FilterOutUndesirableDedicatedRegisters();
   NarrowSearchSpaceUsingHeuristics();
 
   SmallVector<const Formula *, 8> Solution;
   Solve(Solution);
-  assert(Solution.size() == Uses.size() && "Malformed solution!");
 
   // Release memory that is no longer needed.
   Factors.clear();
   Types.clear();
   RegUses.clear();
 
+  if (Solution.empty())
+    return;
+
 #ifndef NDEBUG
   // Formulae should be legal.
   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
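The EnablePhiElim path above (and the one after ImplementSolution below) calls SCEVExpander::replaceCongruentIVs to delete induction phis that compute the same recurrence. A rough sketch of the deduplication idea it relies on, using hypothetical Phi records keyed by a canonical textual recurrence rather than real SCEV objects:

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical phi record: the canonical recurrence it computes, e.g.
// "{0,+,4}<%loop>", stands in for a real SCEV object.
struct Phi {
  std::string CanonicalSCEV;
};

unsigned countCongruentPhis(const std::vector<Phi> &Phis) {
  std::map<std::string, std::size_t> Canonical; // recurrence -> leader index
  unsigned NumDuplicates = 0;
  for (std::size_t I = 0; I != Phis.size(); ++I) {
    // The first phi seen for a recurrence becomes the leader; any later phi
    // with the same recurrence is congruent. A real pass would replace its
    // uses with the leader's value and delete it.
    if (!Canonical.insert(std::make_pair(Phis[I].CanonicalSCEV, I)).second)
      ++NumDuplicates;
  }
  return NumDuplicates;
}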
@@ -3553,6 +3913,14 @@
   // Now that we've decided what we want, make it so.
   ImplementSolution(Solution, P);
+
+  if (EnablePhiElim) {
+    // Remove any extra phis created by processing inner loops.
+    SmallVector<WeakVH, 16> DeadInsts;
+    SCEVExpander Rewriter(SE, "lsr");
+    Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, TLI);
+    Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts);
+  }
 }
 
 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
@@ -3568,7 +3936,7 @@
     OS << '*' << *I;
   }
 
-  for (SmallSetVector<const Type *, 4>::const_iterator
+  for (SmallSetVector<Type *, 4>::const_iterator
        I = Types.begin(), E = Types.end(); I != E; ++I) {
     if (!First) OS << ", ";
     First = false;
@@ -3581,9 +3949,8 @@
   OS << "LSR is examining the following fixup sites:\n";
   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
        E = Fixups.end(); I != E; ++I) {
-    const LSRFixup &LF = *I;
     dbgs() << "  ";
-    LF.print(OS);
+    I->print(OS);
     OS << '\n';
   }
 }
@@ -3634,21 +4001,30 @@ private:
 }
 
 char LoopStrengthReduce::ID = 0;
-static RegisterPass<LoopStrengthReduce>
-X("loop-reduce", "Loop Strength Reduction");
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+                      "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(IVUsers)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+                    "Loop Strength Reduction", false, false)
+
 
 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
   return new LoopStrengthReduce(TLI);
 }
 
 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
-  : LoopPass(&ID), TLI(tli) {}
+  : LoopPass(ID), TLI(tli) {
+    initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
 
 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   // We split critical edges, so we change the CFG. However, we do update
   // many analyses if they are around.
   AU.addPreservedID(LoopSimplifyID);
-  AU.addPreserved("domfrontier");
 
   AU.addRequired<LoopInfo>();
   AU.addPreserved<LoopInfo>();
@@ -3657,6 +4033,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<DominatorTree>();
   AU.addRequired<ScalarEvolution>();
   AU.addPreserved<ScalarEvolution>();
+  // Requiring LoopSimplify a second time here prevents IVUsers from running
+  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+  AU.addRequiredID(LoopSimplifyID);
   AU.addRequired<IVUsers>();
   AU.addPreserved<IVUsers>();
 }
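The registration change above replaces the old static RegisterPass<> object with the INITIALIZE_PASS machinery, which records pass dependencies and is triggered lazily from the constructor. For reference, the same pattern applied to a hypothetical MyLoopPass; the class, its dependency list, and the initializer declaration are illustrative (in-tree passes declare their initializer in llvm/InitializePasses.h):

#include "llvm/Analysis/LoopPass.h"
using namespace llvm;

namespace llvm { void initializeMyLoopPassPass(PassRegistry &); }

namespace {
// A hypothetical loop pass registered the same way as LoopStrengthReduce:
// the INITIALIZE_PASS macros register the pass and its dependencies exactly
// once, and the constructor triggers that registration lazily.
class MyLoopPass : public LoopPass {
public:
  static char ID;
  MyLoopPass() : LoopPass(ID) {
    initializeMyLoopPassPass(*PassRegistry::getPassRegistry());
  }
  virtual bool runOnLoop(Loop *, LPPassManager &) { return false; }
};
}

char MyLoopPass::ID = 0;
INITIALIZE_PASS_BEGIN(MyLoopPass, "my-loop-pass",
                      "My Loop Pass (illustrative)", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(MyLoopPass, "my-loop-pass",
                    "My Loop Pass (illustrative)", false, false)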