From: Andrew Trick
Date: Thu, 29 Aug 2013 18:04:49 +0000 (+0000)
Subject: Comment and revise the cyclic critical path code.
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=851bb2c9cbbd3b1847def5ca7ea8dadf457298b5;p=oota-llvm.git

Comment and revise the cyclic critical path code.

This should be much more clear now. It's still disabled pending testing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189597 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 2749a19bb8f..31824405483 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -331,6 +331,9 @@ public:
 
   BitVector &getScheduledTrees() { return ScheduledTrees; }
 
+  /// Compute the cyclic critical path through the DAG.
+  unsigned computeCyclicCriticalPath();
+
   void viewGraph(const Twine &Name, const Twine &Title) LLVM_OVERRIDE;
   void viewGraph() LLVM_OVERRIDE;
 
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 1b813141a8a..4a447e2f4af 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -197,9 +197,6 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
 
-    /// Compute the cyclic critical path through the DAG.
-    unsigned computeCyclicCriticalPath();
-
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
     /// make sure instructions which define registers that are either used by
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 36eeb67d642..94f38322f22 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -642,6 +642,90 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
   ExitSU.biasCriticalPath();
 }
 
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
+  // This only applies to single block loops.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+       RI != RE; ++RI) {
+    unsigned Reg = *RI;
+    if (!TRI->isVirtualRegister(Reg))
+      continue;
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    if (!DefVNI)
+      continue;
+
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+    const SUnit *DefSU = getSUnit(DefMI);
+    if (!DefSU)
+      continue;
+
+    unsigned LiveOutHeight = DefSU->getHeight();
+    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+    // Visit all local users of the vreg def.
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      if (UI->SU == &ExitSU)
+        continue;
+
+      // Only consider uses of the phi.
+      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
+      if (!LRQ.valueIn()->isPHIDef())
+        continue;
+
+      // Assume that a path spanning two iterations is a cycle, which could
+      // overestimate in strange cases. This allows cyclic latency to be
+      // estimated as the minimum slack of the vreg's depth or height.
+      unsigned CyclicLatency = 0;
+      if (LiveOutDepth > UI->SU->getDepth())
+        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+      if (LiveInHeight > LiveOutHeight) {
+        if (LiveInHeight - LiveOutHeight < CyclicLatency)
+          CyclicLatency = LiveInHeight - LiveOutHeight;
+      }
+      else
+        CyclicLatency = 0;
+
+      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+      if (CyclicLatency > MaxCyclicLatency)
+        MaxCyclicLatency = CyclicLatency;
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+  return MaxCyclicLatency;
+}
+
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                                ArrayRef<SUnit*> BotRoots) {
@@ -1557,21 +1641,39 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }
 
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
 void ConvergingScheduler::checkAcyclicLatency() {
   if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
     return;
 
+  // Scaled number of cycles per loop iteration.
+  unsigned IterCount =
+    std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+             Rem.RemIssueCount);
+  // Scaled acyclic critical path.
+  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+  unsigned InFlightCount =
+    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
   unsigned BufferLimit =
     SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
-  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
-  Rem.IsAcyclicLatencyLimited =
-    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
-
-  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
-        << Rem.RemIssueCount << "u = "
-        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
" - << "Latency = " << LatencyLag << "c = " - << LatencyLag * SchedModel->getLatencyFactor() << "u\n"; + + Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit; + + DEBUG(dbgs() << "IssueCycles=" + << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c " + << "IterCycles=" << IterCount / SchedModel->getLatencyFactor() + << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount + << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor() + << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n"; if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n"); } @@ -1579,10 +1681,6 @@ void ConvergingScheduler::checkAcyclicLatency() { void ConvergingScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); - if (EnableCyclicPath) { - Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); - checkAcyclicLatency(); - } // Some roots may not feed into ExitSU. Check all of them in case. for (std::vector::const_iterator I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { @@ -1590,6 +1688,11 @@ void ConvergingScheduler::registerRoots() { Rem.CriticalPath = (*I)->getDepth(); } DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); + + if (EnableCyclicPath) { + Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); + checkAcyclicLatency(); + } } /// Does this SU have a hazard within the current instruction group. diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index f6496e61877..17d31f48b1d 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -987,65 +987,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, PendingLoads.clear(); } -/// Compute the max cyclic critical path through the DAG. For loops that span -/// basic blocks, MachineTraceMetrics should be used for this instead. -unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() { - // This only applies to single block loop. - if (!BB->isSuccessor(BB)) - return 0; - - unsigned MaxCyclicLatency = 0; - // Visit each live out vreg def to find def/use pairs that cross iterations. - for (SUnit::const_pred_iterator - PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) { - MachineInstr *MI = PI->getSUnit()->getInstr(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef()) - break; - unsigned Reg = MO.getReg(); - if (!Reg || TRI->isPhysicalRegister(Reg)) - continue; - - const LiveInterval &LI = LIS->getInterval(Reg); - unsigned LiveOutHeight = PI->getSUnit()->getHeight(); - unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency(); - // Visit all local users of the vreg def. - for (VReg2UseMap::iterator - UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { - if (UI->SU == &ExitSU) - continue; - - // Only consider uses of the phi. - LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr())); - if (!LRQ.valueIn()->isPHIDef()) - continue; - - // Cheat a bit and assume that a path spanning two iterations is a - // cycle, which could overestimate in strange cases. This allows cyclic - // latency to be estimated as the minimum height or depth slack. 
-        unsigned CyclicLatency = 0;
-        if (LiveOutDepth > UI->SU->getDepth())
-          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
-        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
-        if (LiveInHeight > LiveOutHeight) {
-          if (LiveInHeight - LiveOutHeight < CyclicLatency)
-            CyclicLatency = LiveInHeight - LiveOutHeight;
-        }
-        else
-          CyclicLatency = 0;
-        DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
-              << ") -> SU(" << UI->SU->NodeNum << ") = "
-              << CyclicLatency << "\n");
-        if (CyclicLatency > MaxCyclicLatency)
-          MaxCyclicLatency = CyclicLatency;
-      }
-    }
-  }
-  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
-  return MaxCyclicLatency;
-}
-
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   SU->getInstr()->dump();
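
The arithmetic in the new computeCyclicCriticalPath() comment can be reproduced with a small standalone C++ program. This is only an illustrative sketch, not LLVM code: the depth and height values are copied from the a->b(a,c)->c(b)->d(c)->exit example in the comment, and the slack computation mirrors the min-of-slacks estimate in the loop body added to MachineScheduler.cpp.

    // Sketch: recompute the cyclic critical path for the example in the comment.
    // a->b(a,c)->c(b)->d(c)->exit, unit latency; 'c' is the live-out def that
    // feeds the phi-like use in 'b' on the next iteration.
    #include <algorithm>
    #include <cstdio>

    int main() {
      const unsigned DefLatency    = 1;              // latency of the def of 'c'
      const unsigned LiveOutHeight = 2;              // height(c) = len(c->d->exit)
      const unsigned LiveOutDepth  = 2 + DefLatency; // depth(c) + latency = 3
      const unsigned LiveInDepth   = 1;              // depth(b) = len(a->b)
      const unsigned LiveInHeight  = 3 + DefLatency; // height(b) + latency = 4

      // Depth slack across the back edge, clamped by the height slack: the same
      // min(depth slack, height slack) estimate the patched loop body computes.
      unsigned CyclicLatency = 0;
      if (LiveOutDepth > LiveInDepth)
        CyclicLatency = LiveOutDepth - LiveInDepth;
      if (LiveInHeight > LiveOutHeight)
        CyclicLatency = std::min(CyclicLatency, LiveInHeight - LiveOutHeight);
      else
        CyclicLatency = 0;

      std::printf("CyclicCriticalPath = %uc\n", CyclicLatency); // prints 2c
      return 0;
    }

Running it prints CyclicCriticalPath = 2c, matching min(2, 2) = 2 in the comment.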
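
The buffer check in the rewritten checkAcyclicLatency() likewise reduces to a few integer operations on scaled counts. The sketch below is an illustration rather than LLVM code: the latency factor, micro-op factor, buffer size, and issue count are invented stand-ins for the SchedModel queries and Rem.* fields, chosen only to show how InFlightCount is compared against the buffer limit.

    // Sketch: the scaled in-flight estimate from checkAcyclicLatency().
    #include <algorithm>
    #include <cstdio>

    int main() {
      // Hypothetical machine-model numbers (stand-ins for SchedModel queries).
      const unsigned LatencyFactor     = 1;
      const unsigned MicroOpFactor     = 1;
      const unsigned MicroOpBufferSize = 32;

      // Hypothetical per-loop state (stand-ins for the Rem.* fields).
      const unsigned CyclicCritPath = 2;  // cycles
      const unsigned CriticalPath   = 12; // cycles
      const unsigned RemIssueCount  = 5;  // scaled micro-ops per iteration

      // Scaled cycles per iteration: bounded either by the cyclic latency or by
      // the time needed to issue one iteration's micro-ops.
      unsigned IterCount = std::max(CyclicCritPath * LatencyFactor, RemIssueCount);
      // Scaled acyclic critical path.
      unsigned AcyclicCount = CriticalPath * LatencyFactor;
      // In-flight micro-ops: iterations in flight times micro-ops per iteration,
      // rounded up, following the same formula as the patch.
      unsigned InFlightCount =
          (AcyclicCount * RemIssueCount + IterCount - 1) / IterCount;
      unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;

      bool IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
      std::printf("InFlight=%uu BufferLim=%uu limited=%d\n",
                  InFlightCount, BufferLimit, (int)IsAcyclicLatencyLimited);
      return 0;
    }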