static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+
+static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
+ cl::desc("Only schedule this function"));
+static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
+ cl::desc("Only schedule this MBB#"));
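+// As an illustrative (hypothetical) invocation, these debug-only filters can
+// restrict scheduling to block #2 of a function named "foo":
+//   llc ... -misched-only-func=foo -misched-only-block=2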
#else
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+ cl::desc("Enable register pressure scheduling."), cl::init(true));
+
static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
- cl::desc("Enable cyclic critical path analysis."), cl::init(false));
+ cl::desc("Enable cyclic critical path analysis."), cl::init(true));
static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
cl::desc("Enable load clustering."), cl::init(true));
// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = 8;
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+void ScheduleDAGMutation::anchor() {}
+
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
}
namespace {
+/// Base class for a machine scheduler that can run at any point.
+class MachineSchedulerBase : public MachineSchedContext,
+ public MachineFunctionPass {
+public:
+ MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
+
+ virtual void print(raw_ostream &O, const Module* = 0) const;
+
+protected:
+ void scheduleRegions(ScheduleDAGInstrs &Scheduler);
+};
+
/// MachineScheduler runs after coalescing and before register allocation.
-class MachineScheduler : public MachineSchedContext,
- public MachineFunctionPass {
+class MachineScheduler : public MachineSchedulerBase {
public:
MachineScheduler();
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual void releaseMemory() {}
-
virtual bool runOnMachineFunction(MachineFunction&);
- virtual void print(raw_ostream &O, const Module* = 0) const;
+ static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createMachineScheduler();
+};
+
+/// PostMachineScheduler runs after register allocation, shortly before code
+/// emission.
+class PostMachineScheduler : public MachineSchedulerBase {
+public:
+ PostMachineScheduler();
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ virtual bool runOnMachineFunction(MachineFunction&);
static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createPostMachineScheduler();
};
} // namespace
"Machine Instruction Scheduler", false, false)
MachineScheduler::MachineScheduler()
-: MachineFunctionPass(ID) {
+: MachineSchedulerBase(ID) {
initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
}
MachineFunctionPass::getAnalysisUsage(AU);
}
+char PostMachineScheduler::ID = 0;
+
+char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
+
+INITIALIZE_PASS(PostMachineScheduler, "postmisched",
+ "PostRA Machine Instruction Scheduler", false, false)
+
+PostMachineScheduler::PostMachineScheduler()
+: MachineSchedulerBase(ID) {
+ initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
MachinePassRegistry MachineSchedRegistry::Registry;
/// A dummy default scheduler factory indicates whether the scheduler
/// Forward declare the standard machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
-
+static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C);
+static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C);
/// Decrement this iterator until reaching the top or a non-debug instr.
-static MachineBasicBlock::iterator
-priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator Beg) {
assert(I != Beg && "reached the top of the region, cannot decrement");
while (--I != Beg) {
if (!I->isDebugValue())
return I;
  }
  return I;
}
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ return const_cast<MachineInstr*>(
+ &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
+}
+
/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator End) {
for(; I != End; ++I) {
if (!I->isDebugValue())
break;
  }
  return I;
}
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+  // Cast the return value to non-const MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, and finally return a
+ // bundle_iterator.
+ return MachineBasicBlock::instr_iterator(
+ const_cast<MachineInstr*>(
+ &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+ // Select the scheduler, or set the default.
+ MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+ if (Ctor != useDefaultMachineSched)
+ return Ctor(this);
+
+ // Get the default scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedLive(this);
+}
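+
+// A minimal sketch of how an out-of-tree strategy could hook into the
+// selection above, assuming a factory createMyCustomSched is defined
+// elsewhere (the "my-custom" name and the factory are illustrative only):
+//
+//   static MachineSchedRegistry
+//   MyCustomSchedRegistry("my-custom", "Run my custom scheduler.",
+//                         createMyCustomSched);
+//
+// The registered scheduler then becomes selectable with -misched=my-custom.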
+
+/// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
+/// the caller. We don't have a command line option to override the postRA
+/// scheduler. The Target must configure it.
+ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
+ // Get the postRA scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedPostRA(this);
+}
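+
+// A target typically opts in by overriding
+// TargetPassConfig::createPostMachineScheduler. A sketch, in which
+// MyTargetPassConfig and MyPostRAStrategy are hypothetical and the
+// ScheduleDAGMI constructor arguments follow this file's conventions:
+//
+//   ScheduleDAGInstrs *MyTargetPassConfig::
+//   createPostMachineScheduler(MachineSchedContext *C) const {
+//     return new ScheduleDAGMI(C, new MyPostRAStrategy(C), /*IsPostRA=*/true);
+//   }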
+
/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
AA = &getAnalysis<AliasAnalysis>();
LIS = &getAnalysis<LiveIntervals>();
- const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
if (VerifyScheduling) {
DEBUG(LIS->dump());
}
RegClassInfo->runOnMachineFunction(*MF);
- // Select the scheduler, or set the default.
- MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
- if (Ctor == useDefaultMachineSched) {
- // Get the default scheduler set by the target.
- Ctor = MachineSchedRegistry::getDefault();
- if (!Ctor) {
- Ctor = createConvergingSched;
- MachineSchedRegistry::setDefault(Ctor);
- }
- }
- // Instantiate the selected scheduler.
- OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
+ scheduleRegions(*Scheduler);
+
+ DEBUG(LIS->dump());
+ if (VerifyScheduling)
+ MF->verify(this, "After machine scheduling.");
+ return true;
+}
+
+bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
+
+ // Initialize the context of the pass.
+ MF = &mf;
+ PassConfig = &getAnalysis<TargetPassConfig>();
+
+ if (VerifyScheduling)
+ MF->verify(this, "Before post machine scheduling.");
+
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ OwningPtr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
+ scheduleRegions(*Scheduler);
+
+ if (VerifyScheduling)
+ MF->verify(this, "After post machine scheduling.");
+ return true;
+}
+
+/// Return true if the given instruction should not be included in a scheduling
+/// region.
+///
+/// MachineScheduler does not currently support scheduling across calls. To
+/// handle calls, the DAG builder needs to be modified to create register
+/// anti/output dependencies on the registers clobbered by the call's regmask
+/// operand. In PreRA scheduling, the stack pointer adjustment already prevents
+/// scheduling across calls. In PostRA scheduling, we need isCall to enforce
+/// the boundary, but there would be no benefit to postRA scheduling across
+/// calls this late anyway.
+static bool isSchedBoundary(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB,
+ MachineFunction *MF,
+ const TargetInstrInfo *TII,
+ bool IsPostRA) {
+ return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF);
+}
+
+/// Main driver for both MachineScheduler and PostMachineScheduler.
+void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ bool IsPostRA = Scheduler.isPostRA();
// Visit all machine basic blocks.
//
for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
MBB != MBBEnd; ++MBB) {
- Scheduler->startBlock(MBB);
+ Scheduler.startBlock(MBB);
+
+#ifndef NDEBUG
+ if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
+ continue;
+ if (SchedOnlyBlock.getNumOccurrences()
+ && (int)SchedOnlyBlock != MBB->getNumber())
+ continue;
+#endif
// Break the block into scheduling regions [I, RegionEnd), and schedule each
// region as soon as it is discovered. RegionEnd points the scheduling
// The Scheduler may insert instructions during either schedule() or
// exitRegion(), even for empty regions. So the local iterators 'I' and
// 'RegionEnd' are invalid across these calls.
- unsigned RemainingInstrs = MBB->size();
+ //
+ // MBB::size() uses instr_iterator to count. Here we need a bundle to count
+ // as a single instruction.
+ unsigned RemainingInstrs = std::distance(MBB->begin(), MBB->end());
for(MachineBasicBlock::iterator RegionEnd = MBB->end();
- RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
+ RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) {
// Avoid decrementing RegionEnd for blocks with no terminator.
if (RegionEnd != MBB->end()
- || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
+ || isSchedBoundary(llvm::prior(RegionEnd), MBB, MF, TII, IsPostRA)) {
--RegionEnd;
// Count the boundary instruction.
--RemainingInstrs;
unsigned NumRegionInstrs = 0;
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
- if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
+ if (isSchedBoundary(llvm::prior(I), MBB, MF, TII, IsPostRA))
break;
}
// Notify the scheduler of the region, even if we may skip scheduling
// it. Perhaps it still needs to be bundled.
- Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
+ Scheduler.enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
// Close the current region. Bundle the terminator if needed.
// This invalidates 'RegionEnd' and 'I'.
- Scheduler->exitRegion();
+ Scheduler.exitRegion();
continue;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
+    DEBUG(dbgs() << "********** " << (IsPostRA ? "PostRA " : "")
+                 << "MI Scheduling **********\n");
DEBUG(dbgs() << MF->getName()
<< ":BB#" << MBB->getNumber() << " " << MBB->getName()
<< "\n From: " << *I << " To: ";
// Schedule a region: possibly reorder instructions.
// This invalidates 'RegionEnd' and 'I'.
- Scheduler->schedule();
+ Scheduler.schedule();
// Close the current region.
- Scheduler->exitRegion();
+ Scheduler.exitRegion();
// Scheduling has invalidated the current iterator 'I'. Ask the
      // scheduler for the top of its scheduled region.
- RegionEnd = Scheduler->begin();
+ RegionEnd = Scheduler.begin();
}
assert(RemainingInstrs == 0 && "Instruction count mismatch!");
- Scheduler->finishBlock();
+ Scheduler.finishBlock();
+ if (Scheduler.isPostRA()) {
+ // FIXME: Ideally, no further passes should rely on kill flags. However,
+ // thumb2 size reduction is currently an exception.
+ Scheduler.fixupKills(MBB);
+ }
}
- Scheduler->finalizeSchedule();
- DEBUG(LIS->dump());
- if (VerifyScheduling)
- MF->verify(this, "After machine scheduling.");
- return true;
+ Scheduler.finalizeSchedule();
}
-void MachineScheduler::print(raw_ostream &O, const Module* m) const {
+void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
#endif
//===----------------------------------------------------------------------===//
-// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
-// preservation.
-//===----------------------------------------------------------------------===//
+// ScheduleDAGMI - Basic machine instruction scheduling. This is
+// independent of PreRA/PostRA scheduling and involves no extra book-keeping for
+// virtual registers.
+//===----------------------------------------------------------------------===//
ScheduleDAGMI::~ScheduleDAGMI() {
- delete DFSResult;
DeleteContainerPointers(Mutations);
delete SchedImpl;
}
}
}
+/// enterRegion - Called back from MachineSchedulerBase::scheduleRegions after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
+
+ SchedImpl->initPolicy(begin, end, regioninstrs);
+}
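+
+// The initPolicy callback lets a strategy vary its behavior per region. As a
+// sketch (the strategy name and the 16-instruction threshold are
+// illustrative), a strategy could skip pressure tracking for tiny regions:
+//
+//   void MyStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+//                               MachineBasicBlock::iterator End,
+//                               unsigned NumRegionInstrs) {
+//     RegionPolicy.ShouldTrackPressure = (NumRegionInstrs > 16);
+//   }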
+
/// This is normally called from the main scheduler loop but may also be invoked
/// by the scheduling strategy to perform additional code motion.
-void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
- MachineBasicBlock::iterator InsertPos) {
+void ScheduleDAGMI::moveInstruction(
+ MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
// Advance RegionBegin if the first instruction moves down.
if (&*RegionBegin == MI)
++RegionBegin;
BB->splice(InsertPos, BB, MI);
// Update LiveIntervals
- LIS->handleMove(MI, /*UpdateFlags=*/true);
+ if (LIS)
+ LIS->handleMove(MI, /*UpdateFlags=*/true);
// Recede RegionBegin if an instruction moves above the first.
if (RegionBegin == InsertPos)
    RegionBegin = MI;
}
-/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
-/// crossing a scheduling boundary. [begin, end) includes all instructions in
-/// the region, including the boundary itself and single-instruction regions
-/// that don't get scheduled.
-void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
- MachineBasicBlock::iterator begin,
- MachineBasicBlock::iterator end,
- unsigned regioninstrs)
-{
- ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
-
- // For convenience remember the end of the liveness region.
- LiveRegionEnd =
- (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
-}
-
-// Setup the register pressure trackers for the top scheduled top and bottom
-// scheduled regions.
-void ScheduleDAGMI::initRegPressure() {
- TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
- BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
-
- // Close the RPTracker to finalize live ins.
- RPTracker.closeRegion();
-
- DEBUG(RPTracker.dump());
-
- // Initialize the live ins and live outs.
- TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
- BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
-
- // Close one end of the tracker so we can call
- // getMaxUpward/DownwardPressureDelta before advancing across any
- // instructions. This converts currently live regs into live ins/outs.
- TopRPTracker.closeTop();
- BotRPTracker.closeBottom();
-
- BotRPTracker.initLiveThru(RPTracker);
- if (!BotRPTracker.getLiveThru().empty()) {
- TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
- DEBUG(dbgs() << "Live Thru: ";
- dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
- };
-
- // Account for liveness generated by the region boundary.
- if (LiveRegionEnd != RegionEnd)
- BotRPTracker.recede();
-
- assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
-
- // Cache the list of excess pressure sets in this region. This will also track
- // the max pressure in the scheduled code for these sets.
- RegionCriticalPSets.clear();
- const std::vector<unsigned> &RegionPressure =
- RPTracker.getPressure().MaxSetPressure;
- for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
- unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
- if (RegionPressure[i] > Limit) {
- DEBUG(dbgs() << TRI->getRegPressureSetName(i)
- << " Limit " << Limit
- << " Actual " << RegionPressure[i] << "\n");
- RegionCriticalPSets.push_back(PressureChange(i));
- }
- }
- DEBUG(dbgs() << "Excess PSets: ";
- for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
- dbgs() << TRI->getRegPressureSetName(
- RegionCriticalPSets[i].getPSet()) << " ";
- dbgs() << "\n");
-}
-
-// FIXME: When the pressure tracker deals in pressure differences then we won't
-// iterate over all RegionCriticalPSets[i].
-void ScheduleDAGMI::
-updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
- for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
- unsigned ID = RegionCriticalPSets[i].getPSet();
- if ((int)NewMaxPressure[ID] > RegionCriticalPSets[i].getUnitInc()
- && NewMaxPressure[ID] <= INT16_MAX)
- RegionCriticalPSets[i].setUnitInc(NewMaxPressure[ID]);
- }
- DEBUG(
- for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) {
- unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
- if (NewMaxPressure[i] > Limit ) {
- dbgs() << " " << TRI->getRegPressureSetName(i) << ": "
- << NewMaxPressure[i] << " > " << Limit << "\n";
- }
- });
-}
-
-/// schedule - Called back from MachineScheduler::runOnMachineFunction
-/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
-/// only includes instructions that have DAG nodes, not scheduling boundaries.
-///
-/// This is a skeletal driver, with all the functionality pushed into helpers,
-/// so that it can be easilly extended by experimental schedulers. Generally,
-/// implementing MachineSchedStrategy should be sufficient to implement a new
-/// scheduling algorithm. However, if a scheduler further subclasses
-/// ScheduleDAGMI then it will want to override this virtual method in order to
-/// update any specialized state.
+/// Per-region scheduling driver, called back from
+/// MachineSchedulerBase::scheduleRegions. This is a simplified driver that
+/// does not consider liveness or register pressure. It is useful for PostRA
+/// scheduling and potentially other custom schedulers.
void ScheduleDAGMI::schedule() {
- buildDAGWithRegPressure();
+ // Build the DAG.
+ buildSchedGraph(AA);
Topo.InitDAGTopologicalSorting();
if (!checkSchedLimit())
break;
- scheduleMI(SU, IsTopNode);
-
+ MachineInstr *MI = SU->getInstr();
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else
+ moveInstruction(MI, CurrentTop);
+ }
+ else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ }
updateQueues(SU, IsTopNode);
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
});
}
-/// Build the DAG and setup three register pressure trackers.
-void ScheduleDAGMI::buildDAGWithRegPressure() {
- // Initialize the register pressure tracker used by buildSchedGraph.
- RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
- /*TrackUntiedDefs=*/true);
-
- // Account for liveness generate by the region boundary.
- if (LiveRegionEnd != RegionEnd)
- RPTracker.recede();
-
- // Build the DAG, and compute current register pressure.
- buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
-
- // Initialize top/bottom trackers after computing region pressure.
- initRegPressure();
-}
-
/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postprocessDAG() {
for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
}
}
-void ScheduleDAGMI::computeDFSResult() {
- if (!DFSResult)
- DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
- DFSResult->clear();
- ScheduledTrees.clear();
- DFSResult->resize(SUnits.size());
- DFSResult->compute(SUnits);
- ScheduledTrees.resize(DFSResult->getNumSubtrees());
-}
-
-void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
- SmallVectorImpl<SUnit*> &BotRoots) {
+void ScheduleDAGMI::
+findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
+ SmallVectorImpl<SUnit*> &BotRoots) {
for (std::vector<SUnit>::iterator
I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
SUnit *SU = &(*I);
ExitSU.biasCriticalPath();
}
-/// Compute the max cyclic critical path through the DAG. The scheduling DAG
-/// only provides the critical path for single block loops. To handle loops that
-/// span blocks, we could use the vreg path latencies provided by
-/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
-/// available for use in the scheduler.
-///
-/// The cyclic path estimation identifies a def-use pair that crosses the back
-/// edge and considers the depth and height of the nodes. For example, consider
-/// the following instruction sequence where each instruction has unit latency
-/// and defines an epomymous virtual register:
-///
-/// a->b(a,c)->c(b)->d(c)->exit
-///
-/// The cyclic critical path is a two cycles: b->c->b
-/// The acyclic critical path is four cycles: a->b->c->d->exit
-/// LiveOutHeight = height(c) = len(c->d->exit) = 2
-/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
-/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
-/// LiveInDepth = depth(b) = len(a->b) = 1
-///
-/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
-/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
-/// CyclicCriticalPath = min(2, 2) = 2
-unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
- // This only applies to single block loop.
- if (!BB->isSuccessor(BB))
- return 0;
-
- unsigned MaxCyclicLatency = 0;
- // Visit each live out vreg def to find def/use pairs that cross iterations.
- ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
- for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
- RI != RE; ++RI) {
- unsigned Reg = *RI;
- if (!TRI->isVirtualRegister(Reg))
- continue;
- const LiveInterval &LI = LIS->getInterval(Reg);
- const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
- if (!DefVNI)
- continue;
-
- MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
- const SUnit *DefSU = getSUnit(DefMI);
- if (!DefSU)
- continue;
-
- unsigned LiveOutHeight = DefSU->getHeight();
- unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
- // Visit all local users of the vreg def.
- for (VReg2UseMap::iterator
- UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
- if (UI->SU == &ExitSU)
- continue;
-
- // Only consider uses of the phi.
- LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
- if (!LRQ.valueIn()->isPHIDef())
- continue;
-
- // Assume that a path spanning two iterations is a cycle, which could
- // overestimate in strange cases. This allows cyclic latency to be
- // estimated as the minimum slack of the vreg's depth or height.
- unsigned CyclicLatency = 0;
- if (LiveOutDepth > UI->SU->getDepth())
- CyclicLatency = LiveOutDepth - UI->SU->getDepth();
-
- unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
- if (LiveInHeight > LiveOutHeight) {
- if (LiveInHeight - LiveOutHeight < CyclicLatency)
- CyclicLatency = LiveInHeight - LiveOutHeight;
- }
- else
- CyclicLatency = 0;
-
- DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
- << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
- if (CyclicLatency > MaxCyclicLatency)
- MaxCyclicLatency = CyclicLatency;
- }
- }
- DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
- return MaxCyclicLatency;
-}
-
-/// Identify DAG roots and setup scheduler queues.
-void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
- ArrayRef<SUnit*> BotRoots) {
- NextClusterSucc = NULL;
- NextClusterPred = NULL;
+/// Identify DAG roots and setup scheduler queues.
+void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ NextClusterSucc = NULL;
+ NextClusterPred = NULL;
// Release all DAG roots for scheduling, not including EntrySU/ExitSU.
//
SchedImpl->registerRoots();
// Advance past initial DebugValues.
- assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
- TopRPTracker.setPos(CurrentTop);
-
CurrentBottom = RegionEnd;
}
-/// Move an instruction and update register pressure.
-void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
- // Move the instruction to its new location in the instruction stream.
- MachineInstr *MI = SU->getInstr();
-
- if (IsTopNode) {
- assert(SU->isTopReady() && "node still has unscheduled dependencies");
- if (&*CurrentTop == MI)
- CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
- else {
- moveInstruction(MI, CurrentTop);
- TopRPTracker.setPos(MI);
- }
-
- // Update top scheduled pressure.
- TopRPTracker.advance();
- assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
- updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
- }
- else {
- assert(SU->isBottomReady() && "node still has unscheduled dependencies");
- MachineBasicBlock::iterator priorII =
- priorNonDebug(CurrentBottom, CurrentTop);
- if (&*priorII == MI)
- CurrentBottom = priorII;
- else {
- if (&*CurrentTop == MI) {
- CurrentTop = nextIfDebug(++CurrentTop, priorII);
- TopRPTracker.setPos(CurrentTop);
- }
- moveInstruction(MI, CurrentBottom);
- CurrentBottom = MI;
- }
- // Update bottom scheduled pressure.
- BotRPTracker.recede();
- assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
- updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
- }
-}
-
/// Update scheduler queues after scheduling an instruction.
void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
// Release dependent instructions for scheduling.
releasePredecessors(SU);
SU->isScheduled = true;
-
- if (DFSResult) {
- unsigned SubtreeID = DFSResult->getSubtreeID(SU);
- if (!ScheduledTrees.test(SubtreeID)) {
- ScheduledTrees.set(SubtreeID);
- DFSResult->scheduleTree(SubtreeID);
- SchedImpl->scheduleTree(SubtreeID);
- }
- }
-
- // Notify the scheduling strategy after updating the DAG.
- SchedImpl->schedNode(SU, IsTopNode);
}
/// Reinsert any remaining debug_values, just like the PostRA scheduler.
#endif
//===----------------------------------------------------------------------===//
-// LoadClusterMutation - DAG post-processing to cluster loads.
+// ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
+// preservation.
//===----------------------------------------------------------------------===//
-namespace {
-/// \brief Post-process the DAG to create cluster edges between neighboring
-/// loads.
-class LoadClusterMutation : public ScheduleDAGMutation {
- struct LoadInfo {
- SUnit *SU;
- unsigned BaseReg;
- unsigned Offset;
- LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
- : SU(su), BaseReg(reg), Offset(ofs) {}
- };
- static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
- const LoadClusterMutation::LoadInfo &RHS);
+ScheduleDAGMILive::~ScheduleDAGMILive() {
+ delete DFSResult;
+}
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
-public:
- LoadClusterMutation(const TargetInstrInfo *tii,
- const TargetRegisterInfo *tri)
- : TII(tii), TRI(tri) {}
+/// enterRegion - Called back from MachineSchedulerBase::scheduleRegions after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ // ScheduleDAGMI initializes SchedImpl's per-region policy.
+ ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
- virtual void apply(ScheduleDAGMI *DAG);
-protected:
- void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
-};
-} // anonymous
+ // For convenience remember the end of the liveness region.
+ LiveRegionEnd =
+ (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
-bool LoadClusterMutation::LoadInfoLess(
- const LoadClusterMutation::LoadInfo &LHS,
- const LoadClusterMutation::LoadInfo &RHS) {
- if (LHS.BaseReg != RHS.BaseReg)
- return LHS.BaseReg < RHS.BaseReg;
- return LHS.Offset < RHS.Offset;
+ SUPressureDiffs.clear();
+
+ ShouldTrackPressure = SchedImpl->shouldTrackPressure();
}
-void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
- ScheduleDAGMI *DAG) {
- SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
- for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
- SUnit *SU = Loads[Idx];
- unsigned BaseReg;
- unsigned Offset;
- if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
- LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+// Setup the register pressure trackers for the top scheduled and bottom
+// scheduled regions.
+void ScheduleDAGMILive::initRegPressure() {
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ DEBUG(RPTracker.dump());
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Close one end of the tracker so we can call
+ // getMaxUpward/DownwardPressureDelta before advancing across any
+ // instructions. This converts currently live regs into live ins/outs.
+ TopRPTracker.closeTop();
+ BotRPTracker.closeBottom();
+
+ BotRPTracker.initLiveThru(RPTracker);
+ if (!BotRPTracker.getLiveThru().empty()) {
+ TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
+ DEBUG(dbgs() << "Live Thru: ";
+ dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
+  }
+
+ // For each live out vreg reduce the pressure change associated with other
+ // uses of the same vreg below the live-out reaching def.
+ updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd) {
+ SmallVector<unsigned, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ updatePressureDiffs(LiveUses);
}
- if (LoadRecords.size() < 2)
- return;
- std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
- unsigned ClusterLength = 1;
- for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
- if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
- ClusterLength = 1;
- continue;
- }
- SUnit *SUa = LoadRecords[Idx].SU;
- SUnit *SUb = LoadRecords[Idx+1].SU;
- if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
- && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+ assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
- DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
- << SUb->NodeNum << ")\n");
- // Copy successor edges from SUa to SUb. Interleaving computation
- // dependent on SUa can prevent load combining due to register reuse.
- // Predecessor edges do not need to be copied from SUb to SUa since nearby
- // loads should have effectively the same inputs.
- for (SUnit::const_succ_iterator
- SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
- if (SI->getSUnit() == SUb)
- continue;
- DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
- DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
- }
- ++ClusterLength;
+ // Cache the list of excess pressure sets in this region. This will also track
+ // the max pressure in the scheduled code for these sets.
+ RegionCriticalPSets.clear();
+ const std::vector<unsigned> &RegionPressure =
+ RPTracker.getPressure().MaxSetPressure;
+ for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
+ if (RegionPressure[i] > Limit) {
+ DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+ << " Limit " << Limit
+ << " Actual " << RegionPressure[i] << "\n");
+ RegionCriticalPSets.push_back(PressureChange(i));
}
- else
- ClusterLength = 1;
}
+ DEBUG(dbgs() << "Excess PSets: ";
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+ dbgs() << TRI->getRegPressureSetName(
+ RegionCriticalPSets[i].getPSet()) << " ";
+ dbgs() << "\n");
}
-/// \brief Callback from DAG postProcessing to create cluster edges for loads.
-void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
- // Map DAG NodeNum to store chain ID.
- DenseMap<unsigned, unsigned> StoreChainIDs;
- // Map each store chain to a set of dependent loads.
- SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
- for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
- SUnit *SU = &DAG->SUnits[Idx];
- if (!SU->getInstr()->mayLoad())
- continue;
- unsigned ChainPredID = DAG->SUnits.size();
- for (SUnit::const_pred_iterator
- PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
- if (PI->isCtrl()) {
- ChainPredID = PI->getSUnit()->NodeNum;
- break;
- }
+void ScheduleDAGMILive::
+updateScheduledPressure(const SUnit *SU,
+ const std::vector<unsigned> &NewMaxPressure) {
+ const PressureDiff &PDiff = getPressureDiff(SU);
+ unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+ for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
+ I != E; ++I) {
+ if (!I->isValid())
+ break;
+ unsigned ID = I->getPSet();
+ while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+ ++CritIdx;
+ if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+ if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+ && NewMaxPressure[ID] <= INT16_MAX)
+ RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+ }
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+ if (NewMaxPressure[ID] >= Limit - 2) {
+ DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
+ << NewMaxPressure[ID] << " > " << Limit << "(+ "
+ << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
}
- // Check if this chain-like pred has been seen
- // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
- unsigned NumChains = StoreChainDependents.size();
- std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
- StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
- if (Result.second)
- StoreChainDependents.resize(NumChains + 1);
- StoreChainDependents[Result.first->second].push_back(SU);
}
- // Iterate over the store chains.
- for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
- clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
}
-//===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
- const TargetInstrInfo *TII;
-public:
- MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
-
- virtual void apply(ScheduleDAGMI *DAG);
-};
-} // anonymous
-
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGMI *DAG) {
- // For now, assume targets can only fuse with the branch.
- MachineInstr *Branch = DAG->ExitSU.getInstr();
- if (!Branch)
- return;
-
- for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
- SUnit *SU = &DAG->SUnits[--Idx];
- if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
+ for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
+ /// FIXME: Currently assuming single-use physregs.
+ unsigned Reg = LiveUses[LUIdx];
+ DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+ if (!TRI->isVirtualRegister(Reg))
continue;
- // Create a single weak edge from SU to ExitSU. The only effect is to cause
- // bottom-up scheduling to heavily prioritize the clustered SU. There is no
- // need to copy predecessor edges from ExitSU to SU, since top-down
- // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
- // of SU, we could create an artificial edge from the deepest root, but it
- // hasn't been needed yet.
- bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
- (void)Success;
- assert(Success && "No DAG nodes should be reachable from ExitSU");
-
- DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
- break;
+ // This may be called before CurrentBottom has been initialized. However,
+ // BotRPTracker must have a valid position. We want the value live into the
+ // instruction or live out of the block, so ask for the previous
+ // instruction's live-out.
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI;
+ MachineBasicBlock::const_iterator I =
+ nextIfDebug(BotRPTracker.getPos(), BB->end());
+ if (I == BB->end())
+ VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ else {
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
+ VNI = LRQ.valueIn();
+ }
+ // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+ assert(VNI && "No live value at use.");
+ for (VReg2UseMap::iterator
+ UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+ SUnit *SU = UI->SU;
+ DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ // If this use comes before the reaching def, it cannot be a last use, so
+      // decrease its pressure change.
+ if (!SU->isScheduled && SU != &ExitSU) {
+ LiveQueryResult LRQ
+ = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
+ if (LRQ.valueIn() == VNI)
+ getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
+ }
+ }
}
}
-//===----------------------------------------------------------------------===//
-// CopyConstrain - DAG post-processing to encourage copy elimination.
-//===----------------------------------------------------------------------===//
+/// schedule - Called back from MachineSchedulerBase::scheduleRegions
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+///
+/// This is a skeletal driver, with all the functionality pushed into helpers,
+/// so that it can be easily extended by experimental schedulers. Generally,
+/// implementing MachineSchedStrategy should be sufficient to implement a new
+/// scheduling algorithm. However, if a scheduler further subclasses
+/// ScheduleDAGMILive then it will want to override this virtual method in order
+/// to update any specialized state.
+void ScheduleDAGMILive::schedule() {
+ buildDAGWithRegPressure();
-namespace {
-/// \brief Post-process the DAG to create weak edges from all uses of a copy to
-/// the one use that defines the copy's source vreg, most likely an induction
-/// variable increment.
-class CopyConstrain : public ScheduleDAGMutation {
- // Transient state.
- SlotIndex RegionBeginIdx;
- // RegionEndIdx is the slot index of the last non-debug instruction in the
- // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
- SlotIndex RegionEndIdx;
-public:
- CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
+ Topo.InitDAGTopologicalSorting();
- virtual void apply(ScheduleDAGMI *DAG);
+ postprocessDAG();
-protected:
- void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
-};
-} // anonymous
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
-/// constrainLocalCopy handles two possibilities:
-/// 1) Local src:
-/// I0: = dst
-/// I1: src = ...
-/// I2: = dst
-/// I3: dst = src (copy)
-/// (create pred->succ edges I0->I1, I2->I1)
-///
-/// 2) Local copy:
-/// I0: dst = src (copy)
-/// I1: = dst
-/// I2: src = ...
-/// I3: = dst
-/// (create pred->succ edges I1->I2, I3->I2)
-///
-/// Although the MachineScheduler is currently constrained to single blocks,
-/// this algorithm should handle extended blocks. An EBB is a set of
-/// contiguously numbered blocks such that the previous block in the EBB is
-/// always the single predecessor.
-void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
- LiveIntervals *LIS = DAG->getLIS();
- MachineInstr *Copy = CopySU->getInstr();
+ // Initialize the strategy before modifying the DAG.
+ // This may initialize a DFSResult to be used for queue priority.
+ SchedImpl->initialize(this);
- // Check for pure vreg copies.
- unsigned SrcReg = Copy->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
- return;
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+ if (ViewMISchedDAGs) viewGraph();
- unsigned DstReg = Copy->getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
- return;
+ // Initialize ready queues now that the DAG and priority data are finalized.
+ initQueues(TopRoots, BotRoots);
- // Check if either the dest or source is local. If it's live across a back
- // edge, it's not local. Note that if both vregs are live across the back
- // edge, we cannot successfully contrain the copy without cyclic scheduling.
- unsigned LocalReg = DstReg;
- unsigned GlobalReg = SrcReg;
- LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
- if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
- LocalReg = SrcReg;
- GlobalReg = DstReg;
- LocalLI = &LIS->getInterval(LocalReg);
- if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
- return;
+ if (ShouldTrackPressure) {
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
}
- LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
- // Find the global segment after the start of the local LI.
- LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
- // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
- // local live range. We could create edges from other global uses to the local
- // start, but the coalescer should have already eliminated these cases, so
- // don't bother dealing with it.
- if (GlobalSegment == GlobalLI->end())
- return;
+ bool IsTopNode = false;
+ while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
- // If GlobalSegment is killed at the LocalLI->start, the call to find()
- // returned the next global segment. But if GlobalSegment overlaps with
- // LocalLI->start, then advance to the next segement. If a hole in GlobalLI
- // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
- if (GlobalSegment->contains(LocalLI->beginIndex()))
- ++GlobalSegment;
+ scheduleMI(SU, IsTopNode);
- if (GlobalSegment == GlobalLI->end())
- return;
+ updateQueues(SU, IsTopNode);
- // Check if GlobalLI contains a hole in the vicinity of LocalLI.
- if (GlobalSegment != GlobalLI->begin()) {
- // Two address defs have no hole.
- if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
- GlobalSegment->start)) {
- return;
- }
- // If the prior global segment may be defined by the same two-address
- // instruction that also defines LocalLI, then can't make a hole here.
- if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
- LocalLI->beginIndex())) {
- return;
+ if (DFSResult) {
+ unsigned SubtreeID = DFSResult->getSubtreeID(SU);
+ if (!ScheduledTrees.test(SubtreeID)) {
+ ScheduledTrees.set(SubtreeID);
+ DFSResult->scheduleTree(SubtreeID);
+ SchedImpl->scheduleTree(SubtreeID);
+ }
}
- // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
- // it would be a disconnected component in the live range.
- assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
- "Disconnected LRG within the scheduling region.");
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
}
- MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
- if (!GlobalDef)
- return;
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
- SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
- if (!GlobalSU)
- return;
+ placeDebugValues();
- // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
- // constraining the uses of the last local def to precede GlobalDef.
- SmallVector<SUnit*,8> LocalUses;
- const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
- MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
- SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
- for (SUnit::const_succ_iterator
- I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
- I != E; ++I) {
- if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
- continue;
- if (I->getSUnit() == GlobalSU)
- continue;
- if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
- return;
- LocalUses.push_back(I->getSUnit());
- }
- // Open the top of the GlobalLI hole by constraining any earlier global uses
- // to precede the start of LocalLI.
- SmallVector<SUnit*,8> GlobalUses;
- MachineInstr *FirstLocalDef =
- LIS->getInstructionFromIndex(LocalLI->beginIndex());
- SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
- for (SUnit::const_pred_iterator
- I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
- if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
- continue;
- if (I->getSUnit() == FirstLocalSU)
- continue;
- if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
- return;
- GlobalUses.push_back(I->getSUnit());
- }
- DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
- // Add the weak edges.
- for (SmallVectorImpl<SUnit*>::const_iterator
- I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
- DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
- << GlobalSU->NodeNum << ")\n");
- DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
- }
- for (SmallVectorImpl<SUnit*>::const_iterator
- I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
- DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
- << FirstLocalSU->NodeNum << ")\n");
- DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
- }
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
}
-/// \brief Callback from DAG postProcessing to create weak edges to encourage
-/// copy elimination.
-void CopyConstrain::apply(ScheduleDAGMI *DAG) {
- MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
- if (FirstPos == DAG->end())
+/// Build the DAG and setup three register pressure trackers.
+void ScheduleDAGMILive::buildDAGWithRegPressure() {
+ if (!ShouldTrackPressure) {
+ RPTracker.reset();
+ RegionCriticalPSets.clear();
+ buildSchedGraph(AA);
return;
- RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
- RegionEndIdx = DAG->getLIS()->getInstructionIndex(
- &*priorNonDebug(DAG->end(), DAG->begin()));
+ }
- for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
- SUnit *SU = &DAG->SUnits[Idx];
- if (!SU->getInstr()->isCopy())
- continue;
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ /*TrackUntiedDefs=*/true);
+
+  // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
- constrainLocalCopy(SU, DAG);
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
+}
+
+void ScheduleDAGMILive::computeDFSResult() {
+ if (!DFSResult)
+ DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
+ DFSResult->clear();
+ ScheduledTrees.clear();
+ DFSResult->resize(SUnits.size());
+ DFSResult->compute(SUnits);
+ ScheduledTrees.resize(DFSResult->getNumSubtrees());
+}
+
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+///
+/// This could be relevant to PostRA scheduling, but is currently implemented
+/// assuming LiveIntervals.
+unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
+  // This only applies to a single-block loop.
+ if (!BB->isSuccessor(BB))
+ return 0;
+
+ unsigned MaxCyclicLatency = 0;
+ // Visit each live out vreg def to find def/use pairs that cross iterations.
+ ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+ for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+ RI != RE; ++RI) {
+ unsigned Reg = *RI;
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ if (!DefVNI)
+ continue;
+
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+ const SUnit *DefSU = getSUnit(DefMI);
+ if (!DefSU)
+ continue;
+
+ unsigned LiveOutHeight = DefSU->getHeight();
+ unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+ // Visit all local users of the vreg def.
+ for (VReg2UseMap::iterator
+ UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+ if (UI->SU == &ExitSU)
+ continue;
+
+ // Only consider uses of the phi.
+ LiveQueryResult LRQ =
+ LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
+ if (!LRQ.valueIn()->isPHIDef())
+ continue;
+
+ // Assume that a path spanning two iterations is a cycle, which could
+ // overestimate in strange cases. This allows cyclic latency to be
+ // estimated as the minimum slack of the vreg's depth or height.
+ unsigned CyclicLatency = 0;
+ if (LiveOutDepth > UI->SU->getDepth())
+ CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+ unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+ if (LiveInHeight > LiveOutHeight) {
+ if (LiveInHeight - LiveOutHeight < CyclicLatency)
+ CyclicLatency = LiveInHeight - LiveOutHeight;
+ }
+ else
+ CyclicLatency = 0;
+
+ DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+ << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+ if (CyclicLatency > MaxCyclicLatency)
+ MaxCyclicLatency = CyclicLatency;
+ }
+ }
+ DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+ return MaxCyclicLatency;
+}
+
+/// Move an instruction and update register pressure.
+void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
+ // Move the instruction to its new location in the instruction stream.
+ MachineInstr *MI = SU->getInstr();
+
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
+ moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
+ }
+
+ if (ShouldTrackPressure) {
+ // Update top scheduled pressure.
+ TopRPTracker.advance();
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+ }
+ }
+ else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
+ }
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ if (ShouldTrackPressure) {
+ // Update bottom scheduled pressure.
+ SmallVector<unsigned, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+ updatePressureDiffs(LiveUses);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// LoadClusterMutation - DAG post-processing to cluster loads.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads.
+class LoadClusterMutation : public ScheduleDAGMutation {
+ struct LoadInfo {
+ SUnit *SU;
+ unsigned BaseReg;
+ unsigned Offset;
+ LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
+ : SU(su), BaseReg(reg), Offset(ofs) {}
+ };
+ static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
+ const LoadClusterMutation::LoadInfo &RHS);
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+public:
+ LoadClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri)
+ : TII(tii), TRI(tri) {}
+
+ virtual void apply(ScheduleDAGMI *DAG);
+protected:
+ void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+bool LoadClusterMutation::LoadInfoLess(
+ const LoadClusterMutation::LoadInfo &LHS,
+ const LoadClusterMutation::LoadInfo &RHS) {
+ if (LHS.BaseReg != RHS.BaseReg)
+ return LHS.BaseReg < RHS.BaseReg;
+ return LHS.Offset < RHS.Offset;
+}
+
+void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
+ ScheduleDAGMI *DAG) {
+ SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
+ for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
+ SUnit *SU = Loads[Idx];
+ unsigned BaseReg;
+ unsigned Offset;
+ if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
+ LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+ }
+ if (LoadRecords.size() < 2)
+ return;
+ std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
+ unsigned ClusterLength = 1;
+ for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
+ if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+ ClusterLength = 1;
+ continue;
+ }
+
+ SUnit *SUa = LoadRecords[Idx].SU;
+ SUnit *SUb = LoadRecords[Idx+1].SU;
+ if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
+ && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+
+ DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
+ // Copy successor edges from SUa to SUb. Interleaving computation
+ // dependent on SUa can prevent load combining due to register reuse.
+ // Predecessor edges do not need to be copied from SUb to SUa since nearby
+ // loads should have effectively the same inputs.
+ for (SUnit::const_succ_iterator
+ SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+ if (SI->getSUnit() == SUb)
+ continue;
+ DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+ DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+ }
+ ++ClusterLength;
+ }
+ else
+ ClusterLength = 1;
+ }
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+ // Map DAG NodeNum to store chain ID.
+ DenseMap<unsigned, unsigned> StoreChainIDs;
+ // Map each store chain to a set of dependent loads.
+ SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ SUnit *SU = &DAG->SUnits[Idx];
+ if (!SU->getInstr()->mayLoad())
+ continue;
+ unsigned ChainPredID = DAG->SUnits.size();
+ for (SUnit::const_pred_iterator
+ PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+ if (PI->isCtrl()) {
+ ChainPredID = PI->getSUnit()->NodeNum;
+ break;
+ }
+ }
+ // Check if this chain-like pred has been seen
+ // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+ unsigned NumChains = StoreChainDependents.size();
+ std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+ StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
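+ // Result.second is true only for a chain we have not seen before; grow
+ // StoreChainDependents by one slot for it, otherwise reuse the existing
+ // chain's vector.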
+ if (Result.second)
+ StoreChainDependents.resize(NumChains + 1);
+ StoreChainDependents[Result.first->second].push_back(SU);
+ }
+ // Iterate over the store chains.
+ for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+ clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ const TargetInstrInfo *TII;
+public:
+ MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
+
+ virtual void apply(ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void MacroFusion::apply(ScheduleDAGMI *DAG) {
+ // For now, assume targets can only fuse with the branch.
+ MachineInstr *Branch = DAG->ExitSU.getInstr();
+ if (!Branch)
+ return;
+
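+ // Scan bottom-up; the instruction that can fuse with the branch is most
+ // likely near the end of the block.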
+ for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
+ SUnit *SU = &DAG->SUnits[--Idx];
+ if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+ continue;
+
+ // Create a single weak edge from SU to ExitSU. The only effect is to cause
+ // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+ // need to copy predecessor edges from ExitSU to SU, since top-down
+ // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+ // of SU, we could create an artificial edge from the deepest root, but it
+ // hasn't been needed yet.
+ bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
+ (void)Success;
+ assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+ DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
+ break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// CopyConstrain - DAG post-processing to encourage copy elimination.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create weak edges from all uses of a copy to
+/// the one use that defines the copy's source vreg, most likely an induction
+/// variable increment.
+class CopyConstrain : public ScheduleDAGMutation {
+ // Transient state.
+ SlotIndex RegionBeginIdx;
+ // RegionEndIdx is the slot index of the last non-debug instruction in the
+ // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
+ SlotIndex RegionEndIdx;
+public:
+ CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
+
+ virtual void apply(ScheduleDAGMI *DAG);
+
+protected:
+ void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
+};
+} // anonymous
+
+/// constrainLocalCopy handles two possibilities:
+/// 1) Local src:
+/// I0: = dst
+/// I1: src = ...
+/// I2: = dst
+/// I3: dst = src (copy)
+/// (create pred->succ edges I0->I1, I2->I1)
+///
+/// 2) Local copy:
+/// I0: dst = src (copy)
+/// I1: = dst
+/// I2: src = ...
+/// I3: = dst
+/// (create pred->succ edges I1->I2, I3->I2)
+///
+/// Although the MachineScheduler is currently constrained to single blocks,
+/// this algorithm should handle extended blocks. An EBB is a set of
+/// contiguously numbered blocks such that the previous block in the EBB is
+/// always the single predecessor.
+void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineInstr *Copy = CopySU->getInstr();
+
+ // Check for pure vreg copies.
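+ // A COPY is "dst = COPY src": operand 0 is the destination and operand 1
+ // the source. Only copies between virtual registers are constrained here.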
+ unsigned SrcReg = Copy->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return;
+
+ unsigned DstReg = Copy->getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ return;
+
+ // Check if either the dest or source is local. If it's live across a back
+ // edge, it's not local. Note that if both vregs are live across the back
+ // edge, we cannot successfully constrain the copy without cyclic scheduling.
+ unsigned LocalReg = DstReg;
+ unsigned GlobalReg = SrcReg;
+ LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
+ LocalReg = SrcReg;
+ GlobalReg = DstReg;
+ LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
+ return;
+ }
+ LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
+
+ // Find the global segment after the start of the local LI.
+ LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
+ // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
+ // local live range. We could create edges from other global uses to the local
+ // start, but the coalescer should have already eliminated these cases, so
+ // don't bother dealing with it.
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // If GlobalSegment is killed at the LocalLI->start, the call to find()
+ // returned the next global segment. But if GlobalSegment overlaps with
+ // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
+ // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
+ if (GlobalSegment->contains(LocalLI->beginIndex()))
+ ++GlobalSegment;
+
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // Check if GlobalLI contains a hole in the vicinity of LocalLI.
+ if (GlobalSegment != GlobalLI->begin()) {
+ // Two address defs have no hole.
+ if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
+ GlobalSegment->start)) {
+ return;
+ }
+ // If the prior global segment may be defined by the same two-address
+ // instruction that also defines LocalLI, then can't make a hole here.
+ if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
+ LocalLI->beginIndex())) {
+ return;
+ }
+ // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
+ // it would be a disconnected component in the live range.
+ assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
+ "Disconnected LRG within the scheduling region.");
+ }
+ MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
+ if (!GlobalDef)
+ return;
+
+ SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
+ if (!GlobalSU)
+ return;
+
+ // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
+ // constraining the uses of the last local def to precede GlobalDef.
+ SmallVector<SUnit*,8> LocalUses;
+ const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
+ MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
+ SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
+ for (SUnit::const_succ_iterator
+ I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
+ I != E; ++I) {
+ if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
+ continue;
+ if (I->getSUnit() == GlobalSU)
+ continue;
+ if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
+ return;
+ LocalUses.push_back(I->getSUnit());
+ }
+ // Open the top of the GlobalLI hole by constraining any earlier global uses
+ // to precede the start of LocalLI.
+ SmallVector<SUnit*,8> GlobalUses;
+ MachineInstr *FirstLocalDef =
+ LIS->getInstructionFromIndex(LocalLI->beginIndex());
+ SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
+ for (SUnit::const_pred_iterator
+ I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
+ if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
+ continue;
+ if (I->getSUnit() == FirstLocalSU)
+ continue;
+ if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
+ return;
+ GlobalUses.push_back(I->getSUnit());
+ }
+ DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
+ // Add the weak edges.
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
+ DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
+ << GlobalSU->NodeNum << ")\n");
+ DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
+ }
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
+ DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
+ << FirstLocalSU->NodeNum << ")\n");
+ DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
+ }
+}
+
+/// \brief Callback from DAG postProcessing to create weak edges to encourage
+/// copy elimination.
+void CopyConstrain::apply(ScheduleDAGMI *DAG) {
+ assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
+
+ MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
+ if (FirstPos == DAG->end())
+ return;
+ RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos);
+ RegionEndIdx = DAG->getLIS()->getInstructionIndex(
+ &*priorNonDebug(DAG->end(), DAG->begin()));
+
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ SUnit *SU = &DAG->SUnits[Idx];
+ if (!SU->getInstr()->isCopy())
+ continue;
+
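+ // The hasVRegLiveness() assert above guarantees this DAG is really a
+ // ScheduleDAGMILive, so the static_cast is safe.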
+ constrainLocalCopy(SU, static_cast<ScheduleDAGMILive*>(DAG));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
+// and possibly other custom schedulers.
+//===----------------------------------------------------------------------===//
+
+static const unsigned InvalidCycle = ~0U;
+
+SchedBoundary::~SchedBoundary() { delete HazardRec; }
+
+void SchedBoundary::reset() {
+ // A new HazardRec is created for each DAG and owned by SchedBoundary.
+ // Destroying and reconstructing it is very expensive though. So keep
+ // invalid, placeholder HazardRecs.
+ if (HazardRec && HazardRec->isEnabled()) {
+ delete HazardRec;
+ HazardRec = 0;
+ }
+ Available.clear();
+ Pending.clear();
+ CheckPending = false;
+ NextSUs.clear();
+ CurrCycle = 0;
+ CurrMOps = 0;
+ MinReadyCycle = UINT_MAX;
+ ExpectedLatency = 0;
+ DependentLatency = 0;
+ RetiredMOps = 0;
+ MaxExecutedResCount = 0;
+ ZoneCritResIdx = 0;
+ IsResourceLimited = false;
+ ReservedCycles.clear();
+#ifndef NDEBUG
+ // Track the maximum number of stall cycles that could arise either from the
+ // latency of a DAG edge or the number of cycles that a processor resource is
+ // reserved (SchedBoundary::ReservedCycles).
+ MaxObservedLatency = 0;
+#endif
+ // Reserve a zero-count for invalid CritResIdx.
+ ExecutedResCounts.resize(1);
+ assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
+}
+
+void SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+ reset();
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+ RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
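+ // Sum micro-ops and per-resource cycles over the whole DAG, scaled by the
+ // model's factors so issue counts and resource counts are directly
+ // comparable.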
+ for (std::vector<SUnit>::iterator
+ I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
+ RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
+ * SchedModel->getMicroOpFactor();
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ RemainingCounts[PIdx] += (Factor * PI->Cycles);
+ }
+ }
+}
+
+void SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+ reset();
+ DAG = dag;
+ SchedModel = smodel;
+ Rem = rem;
+ if (SchedModel->hasInstrSchedModel()) {
+ ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
+ ReservedCycles.resize(SchedModel->getNumProcResourceKinds(), InvalidCycle);
+ }
+}
+
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
+unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+ if (!SU->isUnbuffered)
+ return 0;
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ if (ReadyCycle > CurrCycle)
+ return ReadyCycle - CurrCycle;
+ return 0;
+}
+
+/// Compute the next cycle at which the given processor resource can be
+/// scheduled.
+unsigned SchedBoundary::
+getNextResourceCycle(unsigned PIdx, unsigned Cycles) {
+ unsigned NextUnreserved = ReservedCycles[PIdx];
+ // If this resource has never been used, always return cycle zero.
+ if (NextUnreserved == InvalidCycle)
+ return 0;
+ // For bottom-up scheduling add the cycles needed for the current operation.
+ if (!isTop())
+ NextUnreserved += Cycles;
+ return NextUnreserved;
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled()
+ && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
+ return true;
+ }
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
+ return true;
+ }
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
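+ // Reserved (unbuffered) resources are modeled explicitly: stall if any
+ // resource written by this SU is still reserved past the current cycle.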
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles) > CurrCycle)
+ return true;
+ }
+ }
+ return false;
+}
+
+// Find the unscheduled node in ReadySUs with the highest latency.
+unsigned SchedBoundary::
+findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
+ SUnit *LateSU = 0;
+ unsigned RemLatency = 0;
+ for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
+ I != E; ++I) {
+ unsigned L = getUnscheduledLatency(*I);
+ if (L > RemLatency) {
+ RemLatency = L;
+ LateSU = *I;
+ }
+ }
+ if (LateSU) {
+ DEBUG(dbgs() << Available.getName() << " RemLatency SU("
+ << LateSU->NodeNum << ") " << RemLatency << "c\n");
+ }
+ return RemLatency;
+}
+
+// Count resources in this zone and the remaining unscheduled
+// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
+// resource index, or zero if the zone is issue limited.
+unsigned SchedBoundary::
+getOtherResourceCount(unsigned &OtherCritIdx) {
+ OtherCritIdx = 0;
+ if (!SchedModel->hasInstrSchedModel())
+ return 0;
+
+ unsigned OtherCritCount = Rem->RemIssueCount
+ + (RetiredMOps * SchedModel->getMicroOpFactor());
+ DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
+ << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
+ for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
+ PIdx != PEnd; ++PIdx) {
+ unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
+ if (OtherCount > OtherCritCount) {
+ OtherCritCount = OtherCount;
+ OtherCritIdx = PIdx;
+ }
+ }
+ if (OtherCritIdx) {
+ DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
+ << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
+ << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
+ }
+ return OtherCritCount;
+}
+
+void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+ bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+ if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
+ Pending.push(SU);
+ else
+ Available.push(SU);
+
+ // Record this node as an immediate dependent of the scheduled node.
+ NextSUs.insert(SU);
+}
+
+void SchedBoundary::releaseTopNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isWeak())
+ continue;
+ unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+ unsigned Latency = I->getLatency();
+#ifndef NDEBUG
+ MaxObservedLatency = std::max(Latency, MaxObservedLatency);
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + Latency)
+ SU->TopReadyCycle = PredReadyCycle + Latency;
+ }
+ releaseNode(SU, SU->TopReadyCycle);
+}
+
+void SchedBoundary::releaseBottomNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isWeak())
+ continue;
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned Latency = I->getLatency();
+#ifndef NDEBUG
+ MaxObservedLatency = std::max(Latency, MaxObservedLatency);
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + Latency)
+ SU->BotReadyCycle = SuccReadyCycle + Latency;
+ }
+ releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void SchedBoundary::bumpCycle(unsigned NextCycle) {
+ if (SchedModel->getMicroOpBufferSize() == 0) {
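+ // An in-order machine (no micro-op buffer) can issue nothing until the
+ // soonest pending instruction is ready, so jump straight to that cycle.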
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ if (MinReadyCycle > NextCycle)
+ NextCycle = MinReadyCycle;
+ }
+ // Update the current micro-ops, which will issue in the next cycle.
+ unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
+ CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
+
+ // Decrement DependentLatency based on the next cycle.
+ if ((NextCycle - CurrCycle) > DependentLatency)
+ DependentLatency = 0;
+ else
+ DependentLatency -= (NextCycle - CurrCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ }
+ else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
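+ // The zone is resource limited when the scaled count of the critical
+ // resource exceeds the latency-scaled cycle count by more than one full
+ // cycle's worth of work.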
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ IsResourceLimited =
+ (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+ > (int)LFactor;
+
+ DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
+}
+
+void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
+ ExecutedResCounts[PIdx] += Count;
+ if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
+ MaxExecutedResCount = ExecutedResCounts[PIdx];
+}
+
+/// Add the given processor resource to this scheduled zone.
+///
+/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
+/// during which this resource is consumed.
+///
+/// \return the next cycle at which the instruction may execute without
+/// oversubscribing resources.
+unsigned SchedBoundary::
+countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ unsigned Count = Factor * Cycles;
+ DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx)
+ << " +" << Cycles << "x" << Factor << "u\n");
+
+ // Update Executed resources counts.
+ incExecutedResources(PIdx, Count);
+ assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+ Rem->RemainingCounts[PIdx] -= Count;
+
+ // Check if this resource exceeds the current critical resource. If so, it
+ // becomes the critical resource.
+ if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
+ ZoneCritResIdx = PIdx;
+ DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getResourceName(PIdx) << ": "
+ << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
+ }
+ // For reserved resources, record the highest cycle using the resource.
+ unsigned NextAvailable = getNextResourceCycle(PIdx, Cycles);
+ if (NextAvailable > CurrCycle) {
+ DEBUG(dbgs() << " Resource conflict: "
+ << SchedModel->getProcResource(PIdx)->Name << " reserved until @"
+ << NextAvailable << "\n");
+ }
+ return NextAvailable;
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void SchedBoundary::bumpNode(SUnit *SU) {
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+ // checkHazard should prevent scheduling multiple instructions per cycle that
+ // exceed the issue width.
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
+ assert(
+ (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
+ "Cannot schedule this instruction's MicroOps in the current cycle.");
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
+
+ unsigned NextCycle = CurrCycle;
+ switch (SchedModel->getMicroOpBufferSize()) {
+ case 0:
+ assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
+ break;
+ case 1:
+ if (ReadyCycle > NextCycle) {
+ NextCycle = ReadyCycle;
+ DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
+ }
+ break;
+ default:
+ // We don't currently model the OOO reorder buffer, so consider all
+ // scheduled MOps to be "retired". We do loosely model in-order resource
+ // latency. If this instruction uses an in-order resource, account for any
+ // likely stall cycles.
+ if (SU->isUnbuffered && ReadyCycle > NextCycle)
+ NextCycle = ReadyCycle;
+ break;
+ }
+ RetiredMOps += IncMOps;
+
+ // Update resource counts and critical resource.
+ if (SchedModel->hasInstrSchedModel()) {
+ unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
+ assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
+ Rem->RemIssueCount -= DecRemIssue;
+ if (ZoneCritResIdx) {
+ // Scale scheduled micro-ops for comparing with the critical resource.
+ unsigned ScaledMOps =
+ RetiredMOps * SchedModel->getMicroOpFactor();
+
+ // If scaled micro-ops are now more than the previous critical resource by
+ // a full cycle, then micro-ops issue becomes critical.
+ if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
+ >= (int)SchedModel->getLatencyFactor()) {
+ ZoneCritResIdx = 0;
+ DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
+ << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
+ }
+ }
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned RCycle =
+ countResource(PI->ProcResourceIdx, PI->Cycles, NextCycle);
+ if (RCycle > NextCycle)
+ NextCycle = RCycle;
+ }
+ if (SU->hasReservedResource) {
+ // For reserved resources, record the highest cycle using the resource.
+ // For top-down scheduling, this is the cycle in which we schedule this
+ // instruction plus the number of cycles the operation reserves the
+ // resource. For bottom-up it is simply the instruction's cycle.
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
+ ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle;
+#ifndef NDEBUG
+ MaxObservedLatency = std::max(PI->Cycles, MaxObservedLatency);
+#endif
+ }
+ }
+ }
+ }
+ // Update ExpectedLatency and DependentLatency.
+ unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
+ unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
+ if (SU->getDepth() > TopLatency) {
+ TopLatency = SU->getDepth();
+ DEBUG(dbgs() << " " << Available.getName()
+ << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
+ }
+ if (SU->getHeight() > BotLatency) {
+ BotLatency = SU->getHeight();
+ DEBUG(dbgs() << " " << Available.getName()
+ << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
+ }
+ // If we stall for any reason, bump the cycle.
+ if (NextCycle > CurrCycle) {
+ bumpCycle(NextCycle);
+ }
+ else {
+ // After updating ZoneCritResIdx and ExpectedLatency, check if we're
+ // resource limited. If a stall occurred, bumpCycle does this.
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ IsResourceLimited =
+ (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+ > (int)LFactor;
+ }
+ // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
+ // resets CurrMOps. Loop to handle instructions with more MOps than can issue
+ // in one cycle. Since we commonly reach the max MOps here, opportunistically
+ // bump the cycle to avoid uselessly checking everything in the readyQ.
+ CurrMOps += IncMOps;
+ while (CurrMOps >= SchedModel->getIssueWidth()) {
+ DEBUG(dbgs() << " *** Max MOps " << CurrMOps
+ << " at cycle " << CurrCycle << '\n');
+ bumpCycle(++NextCycle);
+ }
+ DEBUG(dumpScheduledState());
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (!IsBuffered && ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
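+ // Pending.remove() refills the erased slot with another element, so back
+ // up the index and the size to revisit slot i.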
+ --i; --e;
+ }
+ DEBUG(if (!Pending.empty()) Pending.dump());
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
+SUnit *SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ if (CurrMOps > 0) {
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
+ }
+ ++I;
+ }
+ }
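+ // Nothing is immediately available: advance the cycle until
+ // releasePending() frees a node. The assert bounds the walk by the
+ // worst-case stall.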
+ for (unsigned i = 0; Available.empty(); ++i) {
+ assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) &&
+ "permanent hazard"); (void)i;
+ bumpCycle(CurrCycle + 1);
+ releasePending();
+ }
+ if (Available.size() == 1)
+ return *Available.begin();
+ return NULL;
+}
+
+#ifndef NDEBUG
+// This is useful information to dump after bumpNode.
+// Note that the Queue contents are more useful before pickNodeFromQueue.
+void SchedBoundary::dumpScheduledState() {
+ unsigned ResFactor;
+ unsigned ResCount;
+ if (ZoneCritResIdx) {
+ ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
+ ResCount = getResourceCount(ZoneCritResIdx);
+ }
+ else {
+ ResFactor = SchedModel->getMicroOpFactor();
+ ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
}
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
+ << " Retired: " << RetiredMOps;
+ dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c";
+ dbgs() << "\n Critical: " << ResCount / LFactor << "c, "
+ << ResCount / ResFactor << " "
+ << SchedModel->getResourceName(ZoneCritResIdx)
+ << "\n ExpectedLatency: " << ExpectedLatency << "c\n"
+ << (IsResourceLimited ? " - Resource" : " - Latency")
+ << " limited.\n";
}
+#endif
//===----------------------------------------------------------------------===//
-// ConvergingScheduler - Implementation of the generic MachineSchedStrategy.
+// GenericScheduler - Generic implementation of MachineSchedStrategy.
//===----------------------------------------------------------------------===//
namespace {
-/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
-/// the schedule.
-class ConvergingScheduler : public MachineSchedStrategy {
+/// Base class for GenericScheduler. This class maintains information about
+/// scheduling candidates based on TargetSchedModel making it easy to implement
+/// heuristics for either preRA or postRA scheduling.
+class GenericSchedulerBase : public MachineSchedStrategy {
public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason {
- NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
+ NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
#ifndef NDEBUG
- static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+ static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
#endif
/// Policy for scheduling the next instruction in the candidate's zone.
}
};
- /// Store the state used by ConvergingScheduler heuristics, required for the
+ /// Store the state used by GenericScheduler heuristics, required for the
/// lifetime of one invocation of pickNode().
struct SchedCandidate {
CandPolicy Policy;
void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
void initResourceDelta(const ScheduleDAGMI *DAG,
- const TargetSchedModel *SchedModel);
- };
-
- /// Summarize the unscheduled region.
- struct SchedRemainder {
- // Critical path through the DAG in expected latency.
- unsigned CriticalPath;
- unsigned CyclicCritPath;
-
- // Scaled count of micro-ops left to schedule.
- unsigned RemIssueCount;
-
- bool IsAcyclicLatencyLimited;
-
- // Unscheduled resources
- SmallVector<unsigned, 16> RemainingCounts;
-
- void reset() {
- CriticalPath = 0;
- CyclicCritPath = 0;
- RemIssueCount = 0;
- IsAcyclicLatencyLimited = false;
- RemainingCounts.clear();
- }
-
- SchedRemainder() { reset(); }
-
- void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
- };
-
- /// Each Scheduling boundary is associated with ready queues. It tracks the
- /// current cycle in the direction of movement, and maintains the state
- /// of "hazards" and other interlocks at the current cycle.
- struct SchedBoundary {
- ScheduleDAGMI *DAG;
- const TargetSchedModel *SchedModel;
- SchedRemainder *Rem;
-
- ReadyQueue Available;
- ReadyQueue Pending;
- bool CheckPending;
-
- // For heuristics, keep a list of the nodes that immediately depend on the
- // most recently scheduled node.
- SmallPtrSet<const SUnit*, 8> NextSUs;
-
- ScheduleHazardRecognizer *HazardRec;
-
- /// Number of cycles it takes to issue the instructions scheduled in this
- /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls.
- /// See getStalls().
- unsigned CurrCycle;
-
- /// Micro-ops issued in the current cycle
- unsigned CurrMOps;
-
- /// MinReadyCycle - Cycle of the soonest available instruction.
- unsigned MinReadyCycle;
-
- // The expected latency of the critical path in this scheduled zone.
- unsigned ExpectedLatency;
-
- // The latency of dependence chains leading into this zone.
- // For each node scheduled bottom-up: DLat = max DLat, N.Depth.
- // For each cycle scheduled: DLat -= 1.
- unsigned DependentLatency;
-
- /// Count the scheduled (issued) micro-ops that can be retired by
- /// time=CurrCycle assuming the first scheduled instr is retired at time=0.
- unsigned RetiredMOps;
-
- // Count scheduled resources that have been executed. Resources are
- // considered executed if they become ready in the time that it takes to
- // saturate any resource including the one in question. Counts are scaled
- // for direct comparison with other resources. Counts can be compared with
- // MOps * getMicroOpFactor and Latency * getLatencyFactor.
- SmallVector<unsigned, 16> ExecutedResCounts;
-
- /// Cache the max count for a single resource.
- unsigned MaxExecutedResCount;
-
- // Cache the critical resources ID in this scheduled zone.
- unsigned ZoneCritResIdx;
-
- // Is the scheduled region resource limited vs. latency limited.
- bool IsResourceLimited;
-
-#ifndef NDEBUG
- // Remember the greatest operand latency as an upper bound on the number of
- // times we should retry the pending queue because of a hazard.
- unsigned MaxObservedLatency;
-#endif
-
- void reset() {
- // A new HazardRec is created for each DAG and owned by SchedBoundary.
- delete HazardRec;
-
- Available.clear();
- Pending.clear();
- CheckPending = false;
- NextSUs.clear();
- HazardRec = 0;
- CurrCycle = 0;
- CurrMOps = 0;
- MinReadyCycle = UINT_MAX;
- ExpectedLatency = 0;
- DependentLatency = 0;
- RetiredMOps = 0;
- MaxExecutedResCount = 0;
- ZoneCritResIdx = 0;
- IsResourceLimited = false;
-#ifndef NDEBUG
- MaxObservedLatency = 0;
-#endif
- // Reserve a zero-count for invalid CritResIdx.
- ExecutedResCounts.resize(1);
- assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
- }
-
- /// Pending queues extend the ready queues with the same ID and the
- /// PendingFlag set.
- SchedBoundary(unsigned ID, const Twine &Name):
- DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
- Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
- HazardRec(0) {
- reset();
- }
-
- ~SchedBoundary() { delete HazardRec; }
-
- void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
- SchedRemainder *rem);
-
- bool isTop() const {
- return Available.getID() == ConvergingScheduler::TopQID;
- }
-
-#ifndef NDEBUG
- const char *getResourceName(unsigned PIdx) {
- if (!PIdx)
- return "MOps";
- return SchedModel->getProcResource(PIdx)->Name;
- }
-#endif
-
- /// Get the number of latency cycles "covered" by the scheduled
- /// instructions. This is the larger of the critical path within the zone
- /// and the number of cycles required to issue the instructions.
- unsigned getScheduledLatency() const {
- return std::max(ExpectedLatency, CurrCycle);
- }
-
- unsigned getUnscheduledLatency(SUnit *SU) const {
- return isTop() ? SU->getHeight() : SU->getDepth();
- }
-
- unsigned getResourceCount(unsigned ResIdx) const {
- return ExecutedResCounts[ResIdx];
- }
-
- /// Get the scaled count of scheduled micro-ops and resources, including
- /// executed resources.
- unsigned getCriticalCount() const {
- if (!ZoneCritResIdx)
- return RetiredMOps * SchedModel->getMicroOpFactor();
- return getResourceCount(ZoneCritResIdx);
- }
-
- /// Get a scaled count for the minimum execution time of the scheduled
- /// micro-ops that are ready to execute by getExecutedCount. Notice the
- /// feedback loop.
- unsigned getExecutedCount() const {
- return std::max(CurrCycle * SchedModel->getLatencyFactor(),
- MaxExecutedResCount);
- }
-
- bool checkHazard(SUnit *SU);
-
- unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
-
- unsigned getOtherResourceCount(unsigned &OtherCritIdx);
-
- void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone);
-
- void releaseNode(SUnit *SU, unsigned ReadyCycle);
-
- void bumpCycle(unsigned NextCycle);
-
- void incExecutedResources(unsigned PIdx, unsigned Count);
-
- unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle);
-
- void bumpNode(SUnit *SU);
-
- void releasePending();
-
- void removeReady(SUnit *SU);
-
- SUnit *pickOnlyChoice();
-
-#ifndef NDEBUG
- void dumpScheduledState();
-#endif
- };
-
-private:
- ScheduleDAGMI *DAG;
- const TargetSchedModel *SchedModel;
- const TargetRegisterInfo *TRI;
-
- // State of the top and bottom scheduled instruction boundaries.
- SchedRemainder Rem;
- SchedBoundary Top;
- SchedBoundary Bot;
-
-public:
- /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
- enum {
- TopQID = 1,
- BotQID = 2,
- LogMaxQID = 2
- };
-
- ConvergingScheduler():
- DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
-
- virtual void initialize(ScheduleDAGMI *dag);
-
- virtual SUnit *pickNode(bool &IsTopNode);
-
- virtual void schedNode(SUnit *SU, bool IsTopNode);
-
- virtual void releaseTopNode(SUnit *SU);
-
- virtual void releaseBottomNode(SUnit *SU);
-
- virtual void registerRoots();
-
-protected:
- void checkAcyclicLatency();
-
- void tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- RegPressureTracker &TempTracker);
-
- SUnit *pickNodeBidirectional(bool &IsTopNode);
-
- void pickNodeFromQueue(SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate);
-
- void reschedulePhysRegCopies(SUnit *SU, bool isTop);
-
-#ifndef NDEBUG
- void traceCandidate(const SchedCandidate &Cand);
-#endif
-};
-} // namespace
-
-void ConvergingScheduler::SchedRemainder::
-init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
- reset();
- if (!SchedModel->hasInstrSchedModel())
- return;
- RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
- for (std::vector<SUnit>::iterator
- I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
- const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
- RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
- * SchedModel->getMicroOpFactor();
- for (TargetSchedModel::ProcResIter
- PI = SchedModel->getWriteProcResBegin(SC),
- PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
- unsigned PIdx = PI->ProcResourceIdx;
- unsigned Factor = SchedModel->getResourceFactor(PIdx);
- RemainingCounts[PIdx] += (Factor * PI->Cycles);
- }
- }
-}
-
-void ConvergingScheduler::SchedBoundary::
-init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
- reset();
- DAG = dag;
- SchedModel = smodel;
- Rem = rem;
- if (SchedModel->hasInstrSchedModel())
- ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
-}
-
-void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
- DAG = dag;
- SchedModel = DAG->getSchedModel();
- TRI = DAG->TRI;
-
- Rem.init(DAG, SchedModel);
- Top.init(DAG, SchedModel, &Rem);
- Bot.init(DAG, SchedModel, &Rem);
-
- // Initialize resource counts.
-
- // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
- // are disabled, then these HazardRecs will be disabled.
- const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
- const TargetMachine &TM = DAG->MF.getTarget();
- Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
- Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-
- assert((!ForceTopDown || !ForceBottomUp) &&
- "-misched-topdown incompatible with -misched-bottomup");
-}
-
-void ConvergingScheduler::releaseTopNode(SUnit *SU) {
- if (SU->isScheduled)
- return;
-
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isWeak())
- continue;
- unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
- unsigned Latency = I->getLatency();
-#ifndef NDEBUG
- Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency);
-#endif
- if (SU->TopReadyCycle < PredReadyCycle + Latency)
- SU->TopReadyCycle = PredReadyCycle + Latency;
- }
- Top.releaseNode(SU, SU->TopReadyCycle);
-}
-
-void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
- if (SU->isScheduled)
- return;
-
- assert(SU->getInstr() && "Scheduled SUnit must have instr");
-
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isWeak())
- continue;
- unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
- unsigned Latency = I->getLatency();
-#ifndef NDEBUG
- Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency);
-#endif
- if (SU->BotReadyCycle < SuccReadyCycle + Latency)
- SU->BotReadyCycle = SuccReadyCycle + Latency;
- }
- Bot.releaseNode(SU, SU->BotReadyCycle);
-}
-
-/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
-/// critical path by more cycles than it takes to drain the instruction buffer.
-/// We estimate an upper bounds on in-flight instructions as:
-///
-/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
-/// InFlightIterations = AcyclicPath / CyclesPerIteration
-/// InFlightResources = InFlightIterations * LoopResources
-///
-/// TODO: Check execution resources in addition to IssueCount.
-void ConvergingScheduler::checkAcyclicLatency() {
- if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
- return;
-
- // Scaled number of cycles per loop iteration.
- unsigned IterCount =
- std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
- Rem.RemIssueCount);
- // Scaled acyclic critical path.
- unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
- // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
- unsigned InFlightCount =
- (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
- unsigned BufferLimit =
- SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
-
- Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
-
- DEBUG(dbgs() << "IssueCycles="
- << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
- << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
- << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
- << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
- << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
- if (Rem.IsAcyclicLatencyLimited)
- dbgs() << " ACYCLIC LATENCY LIMIT\n");
-}
-
-void ConvergingScheduler::registerRoots() {
- Rem.CriticalPath = DAG->ExitSU.getDepth();
-
- // Some roots may not feed into ExitSU. Check all of them in case.
- for (std::vector<SUnit*>::const_iterator
- I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
- if ((*I)->getDepth() > Rem.CriticalPath)
- Rem.CriticalPath = (*I)->getDepth();
- }
- DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
-
- if (EnableCyclicPath) {
- Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
- checkAcyclicLatency();
- }
-}
+ const TargetSchedModel *SchedModel);
+ };
-/// Does this SU have a hazard within the current instruction group.
-///
-/// The scheduler supports two modes of hazard recognition. The first is the
-/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
-/// supports highly complicated in-order reservation tables
-/// (ScoreboardHazardRecognizer) and arbitraty target-specific logic.
-///
-/// The second is a streamlined mechanism that checks for hazards based on
-/// simple counters that the scheduler itself maintains. It explicitly checks
-/// for instruction dispatch limitations, including the number of micro-ops that
-/// can dispatch per cycle.
-///
-/// TODO: Also check whether the SU must start a new group.
-bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
- if (HazardRec->isEnabled())
- return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+protected:
+ const MachineSchedContext *Context;
+ const TargetSchedModel *SchedModel;
+ const TargetRegisterInfo *TRI;
- unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
- if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
- DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
- << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
- return true;
- }
- return false;
-}
+ SchedRemainder Rem;
+protected:
+ GenericSchedulerBase(const MachineSchedContext *C):
+ Context(C), SchedModel(0), TRI(0) {}
-// Find the unscheduled node in ReadySUs with the highest latency.
-unsigned ConvergingScheduler::SchedBoundary::
-findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
- SUnit *LateSU = 0;
- unsigned RemLatency = 0;
- for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
- I != E; ++I) {
- unsigned L = getUnscheduledLatency(*I);
- if (L > RemLatency) {
- RemLatency = L;
- LateSU = *I;
- }
- }
- if (LateSU) {
- DEBUG(dbgs() << Available.getName() << " RemLatency SU("
- << LateSU->NodeNum << ") " << RemLatency << "c\n");
- }
- return RemLatency;
-}
+ void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone,
+ SchedBoundary *OtherZone);
-// Count resources in this zone and the remaining unscheduled
-// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
-// resource index, or zero if the zone is issue limited.
-unsigned ConvergingScheduler::SchedBoundary::
-getOtherResourceCount(unsigned &OtherCritIdx) {
- OtherCritIdx = 0;
- if (!SchedModel->hasInstrSchedModel())
- return 0;
+#ifndef NDEBUG
+ void traceCandidate(const SchedCandidate &Cand);
+#endif
+};
+} // namespace
- unsigned OtherCritCount = Rem->RemIssueCount
- + (RetiredMOps * SchedModel->getMicroOpFactor());
- DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
- << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
- for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
- PIdx != PEnd; ++PIdx) {
- unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
- if (OtherCount > OtherCritCount) {
- OtherCritCount = OtherCount;
- OtherCritIdx = PIdx;
- }
- }
- if (OtherCritIdx) {
- DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
- << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
- << " " << getResourceName(OtherCritIdx) << "\n");
+void GenericSchedulerBase::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel) {
+ if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+ return;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+ ResDelta.CritResources += PI->Cycles;
+ if (PI->ProcResourceIdx == Policy.DemandResIdx)
+ ResDelta.DemandedResources += PI->Cycles;
}
- return OtherCritCount;
}
-/// Set the CandPolicy for this zone given the current resources and latencies
-/// inside and outside the zone.
-void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
- SchedBoundary &OtherZone) {
- // Now that potential stalls have been considered, apply preemptive heuristics
- // based on the the total latency and resources inside and outside this
- // zone.
+/// Set the CandPolicy for a scheduling zone, given the current resources and
+/// latencies inside and outside the zone.
+void GenericSchedulerBase::setPolicy(CandPolicy &Policy,
+ bool IsPostRA,
+ SchedBoundary &CurrZone,
+ SchedBoundary *OtherZone) {
+ // Apply preemptive heuristics based on the total latency and resources
+ // inside and outside this zone. Potential stalls should be considered before
+ // following this policy.
// Compute remaining latency. We need this both to determine whether the
// overall schedule has become latency-limited and whether the instructions
// outside this zone are resource or latency limited.
//
// The "independent" latency is the max ready queue depth:
//   ILat = max N.depth for N in Available|Pending
//
// RemainingLatency is the greater of independent and dependent latency.
- unsigned RemLatency = DependentLatency;
- RemLatency = std::max(RemLatency, findMaxLatency(Available.elements()));
- RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements()));
+ unsigned RemLatency = CurrZone.getDependentLatency();
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Available.elements()));
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Pending.elements()));
// Compute the critical resource outside the zone.
- unsigned OtherCritIdx;
- unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx);
+ unsigned OtherCritIdx = 0;
+ unsigned OtherCount =
+ OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
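+ // OtherZone may be null in unidirectional (e.g. postRA) scheduling, in
+ // which case there is no remote critical resource to demand.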
bool OtherResLimited = false;
if (SchedModel->hasInstrSchedModel()) {
unsigned LFactor = SchedModel->getLatencyFactor();
OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
}
- if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) {
- Policy.ReduceLatency |= true;
- DEBUG(dbgs() << " " << Available.getName() << " RemainingLatency "
- << RemLatency << " + " << CurrCycle << "c > CritPath "
- << Rem->CriticalPath << "\n");
+ // Schedule aggressively for latency in PostRA mode. We don't check for
+ // acyclic latency during PostRA, and highly out-of-order processors will
+ // skip PostRA scheduling.
+ if (!OtherResLimited) {
+ if (IsPostRA || (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
+ Policy.ReduceLatency |= true;
+ DEBUG(dbgs() << " " << CurrZone.Available.getName()
+ << " RemainingLatency " << RemLatency << " + "
+ << CurrZone.getCurrCycle() << "c > CritPath "
+ << Rem.CriticalPath << "\n");
+ }
}
// If the same resource is limiting inside and outside the zone, do nothing.
- if (ZoneCritResIdx == OtherCritIdx)
+ if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
return;
DEBUG(
- if (IsResourceLimited) {
- dbgs() << " " << Available.getName() << " ResourceLimited: "
- << getResourceName(ZoneCritResIdx) << "\n";
+ if (CurrZone.isResourceLimited()) {
+ dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
+ << SchedModel->getResourceName(CurrZone.getZoneCritResIdx())
+ << "\n";
}
if (OtherResLimited)
- dbgs() << " RemainingLimit: " << getResourceName(OtherCritIdx) << "\n";
- if (!IsResourceLimited && !OtherResLimited)
+ dbgs() << " RemainingLimit: "
+ << SchedModel->getResourceName(OtherCritIdx) << "\n";
+ if (!CurrZone.isResourceLimited() && !OtherResLimited)
dbgs() << " Latency limited both directions.\n");
- if (IsResourceLimited && !Policy.ReduceResIdx)
- Policy.ReduceResIdx = ZoneCritResIdx;
+ if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
+ Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
if (OtherResLimited)
Policy.DemandResIdx = OtherCritIdx;
}
-void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
- unsigned ReadyCycle) {
- if (ReadyCycle < MinReadyCycle)
- MinReadyCycle = ReadyCycle;
-
- // Check for interlocks first. For the purpose of other heuristics, an
- // instruction that cannot issue appears as if it's not in the ReadyQueue.
- bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
- if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
- Pending.push(SU);
- else
- Available.push(SU);
-
- // Record this node as an immediate dependent of the scheduled node.
- NextSUs.insert(SU);
+#ifndef NDEBUG
+const char *GenericSchedulerBase::getReasonStr(
+ GenericSchedulerBase::CandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND ";
+ case PhysRegCopy: return "PREG-COPY";
+ case RegExcess: return "REG-EXCESS";
+ case RegCritical: return "REG-CRIT ";
+ case Stall: return "STALL ";
+ case Cluster: return "CLUSTER ";
+ case Weak: return "WEAK ";
+ case RegMax: return "REG-MAX ";
+ case ResourceReduce: return "RES-REDUCE";
+ case ResourceDemand: return "RES-DEMAND";
+ case TopDepthReduce: return "TOP-DEPTH ";
+ case TopPathReduce: return "TOP-PATH ";
+ case BotHeightReduce:return "BOT-HEIGHT";
+ case BotPathReduce: return "BOT-PATH ";
+ case NextDefUse: return "DEF-USE ";
+ case NodeOrder: return "ORDER ";
+ }
+ llvm_unreachable("Unknown reason!");
}
-/// Move the boundary of scheduled code by one cycle.
-void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
- if (SchedModel->getMicroOpBufferSize() == 0) {
- assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
- if (MinReadyCycle > NextCycle)
- NextCycle = MinReadyCycle;
+void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
+ PressureChange P;
+ unsigned ResIdx = 0;
+ unsigned Latency = 0;
+ switch (Cand.Reason) {
+ default:
+ break;
+ case RegExcess:
+ P = Cand.RPDelta.Excess;
+ break;
+ case RegCritical:
+ P = Cand.RPDelta.CriticalMax;
+ break;
+ case RegMax:
+ P = Cand.RPDelta.CurrentMax;
+ break;
+ case ResourceReduce:
+ ResIdx = Cand.Policy.ReduceResIdx;
+ break;
+ case ResourceDemand:
+ ResIdx = Cand.Policy.DemandResIdx;
+ break;
+ case TopDepthReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ case TopPathReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotHeightReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotPathReduce:
+ Latency = Cand.SU->getDepth();
+ break;
}
- // Update the current micro-ops, which will issue in the next cycle.
- unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
- CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
-
- // Decrement DependentLatency based on the next cycle.
- if ((NextCycle - CurrCycle) > DependentLatency)
- DependentLatency = 0;
+ dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+ if (P.isValid())
+ dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+ << ":" << P.getUnitInc() << " ";
else
- DependentLatency -= (NextCycle - CurrCycle);
-
- if (!HazardRec->isEnabled()) {
- // Bypass HazardRec virtual calls.
- CurrCycle = NextCycle;
- }
- else {
- // Bypass getHazardType calls in case of long latency.
- for (; CurrCycle != NextCycle; ++CurrCycle) {
- if (isTop())
- HazardRec->AdvanceCycle();
- else
- HazardRec->RecedeCycle();
- }
- }
- CheckPending = true;
- unsigned LFactor = SchedModel->getLatencyFactor();
- IsResourceLimited =
- (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
- > (int)LFactor;
-
- DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
-}
-
-void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
- unsigned Count) {
- ExecutedResCounts[PIdx] += Count;
- if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
- MaxExecutedResCount = ExecutedResCounts[PIdx];
-}
-
-/// Add the given processor resource to this scheduled zone.
-///
-/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
-/// during which this resource is consumed.
-///
-/// \return the next cycle at which the instruction may execute without
-/// oversubscribing resources.
-unsigned ConvergingScheduler::SchedBoundary::
-countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
- unsigned Factor = SchedModel->getResourceFactor(PIdx);
- unsigned Count = Factor * Cycles;
- DEBUG(dbgs() << " " << getResourceName(PIdx)
- << " +" << Cycles << "x" << Factor << "u\n");
-
- // Update Executed resources counts.
- incExecutedResources(PIdx, Count);
- assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
- Rem->RemainingCounts[PIdx] -= Count;
-
- // Check if this resource exceeds the current critical resource. If so, it
- // becomes the critical resource.
- if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
- ZoneCritResIdx = PIdx;
- DEBUG(dbgs() << " *** Critical resource "
- << getResourceName(PIdx) << ": "
- << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
- }
- // TODO: We don't yet model reserved resources. It's not hard though.
- return CurrCycle;
+ dbgs() << " ";
+ if (ResIdx)
+ dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
+ else
+ dbgs() << " ";
+ if (Latency)
+ dbgs() << " " << Latency << " cycles ";
+ else
+ dbgs() << " ";
+ dbgs() << '\n';
}
+#endif
-/// Move the boundary of scheduled code by one SUnit.
-void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
- // Update the reservation table.
- if (HazardRec->isEnabled()) {
- if (!isTop() && SU->isCall) {
- // Calls are scheduled with their preceding instructions. For bottom-up
- // scheduling, clear the pipeline state before emitting.
- HazardRec->Reset();
- }
- HazardRec->EmitInstruction(SU);
+/// Return true if this heuristic determines order.
+static bool tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
}
- const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
- unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
- CurrMOps += IncMOps;
- // checkHazard prevents scheduling multiple instructions per cycle that exceed
- // issue width. However, we commonly reach the maximum. In this case
- // opportunistically bump the cycle to avoid uselessly checking everything in
- // the readyQ. Furthermore, a single instruction may produce more than one
- // cycle's worth of micro-ops.
- //
- // TODO: Also check if this SU must end a dispatch group.
- unsigned NextCycle = CurrCycle;
- if (CurrMOps >= SchedModel->getIssueWidth()) {
- ++NextCycle;
- DEBUG(dbgs() << " *** Max MOps " << CurrMOps
- << " at cycle " << CurrCycle << '\n');
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
}
- unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
- DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
+ Cand.setRepeat(Reason);
+ return false;
+}
- switch (SchedModel->getMicroOpBufferSize()) {
- case 0:
- assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
- break;
- case 1:
- if (ReadyCycle > NextCycle) {
- NextCycle = ReadyCycle;
- DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
- }
- break;
- default:
- // We don't currently model the OOO reorder buffer, so consider all
- // scheduled MOps to be "retired".
- break;
+static bool tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
}
- RetiredMOps += IncMOps;
-
- // Update resource counts and critical resource.
- if (SchedModel->hasInstrSchedModel()) {
- unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
- assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
- Rem->RemIssueCount -= DecRemIssue;
- if (ZoneCritResIdx) {
- // Scale scheduled micro-ops for comparing with the critical resource.
- unsigned ScaledMOps =
- RetiredMOps * SchedModel->getMicroOpFactor();
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ Cand.setRepeat(Reason);
+ return false;
+}
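// [Editor's note, illustrative only; not part of this patch.] tryLess and
// tryGreater implement a three-way protocol: if TryVal wins, TryCand.Reason
// is set and the caller adopts TryCand; if the incumbent wins, its Reason is
// strengthened when the new Reason has higher priority (a lower enum value)
// and the cascade stops; on a tie, the repeat is recorded and control falls
// through to the next heuristic. A hedged usage sketch with hypothetical
// stall-cycle values:
static void exampleTieBreak(GenericSchedulerBase::SchedCandidate &Cand,
                            GenericSchedulerBase::SchedCandidate &TryCand) {
  int TryStall = 0, CandStall = 2; // hypothetical stall cycles per candidate
  if (tryLess(TryStall, CandStall, TryCand, Cand,
              GenericSchedulerBase::Stall))
    return; // 0 < 2: TryCand wins and TryCand.Reason == Stall.
  // Equal values would fall through here to lower-priority heuristics.
}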
- // If scaled micro-ops are now more than the previous critical resource by
- // a full cycle, then micro-ops issue becomes critical.
- if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
- >= (int)SchedModel->getLatencyFactor()) {
- ZoneCritResIdx = 0;
- DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
- << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
- }
- }
- for (TargetSchedModel::ProcResIter
- PI = SchedModel->getWriteProcResBegin(SC),
- PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
- unsigned RCycle =
- countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle);
- if (RCycle > NextCycle)
- NextCycle = RCycle;
+static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
+ if (Zone.isTop()) {
+ if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
+ return true;
}
- }
- // Update ExpectedLatency and DependentLatency.
- unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
- unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
- if (SU->getDepth() > TopLatency) {
- TopLatency = SU->getDepth();
- DEBUG(dbgs() << " " << Available.getName()
- << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
- }
- if (SU->getHeight() > BotLatency) {
- BotLatency = SU->getHeight();
- DEBUG(dbgs() << " " << Available.getName()
- << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
- }
- // If we stall for any reason, bump the cycle.
- if (NextCycle > CurrCycle) {
- bumpCycle(NextCycle);
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::TopPathReduce))
+ return true;
}
else {
- // After updating ZoneCritResIdx and ExpectedLatency, check if we're
- // resource limited. If a stall occured, bumpCycle does this.
- unsigned LFactor = SchedModel->getLatencyFactor();
- IsResourceLimited =
- (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
- > (int)LFactor;
+ if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::BotPathReduce))
+ return true;
}
- DEBUG(dumpScheduledState());
+ return false;
}
-/// Release pending ready nodes in to the available queue. This makes them
-/// visible to heuristics.
-void ConvergingScheduler::SchedBoundary::releasePending() {
- // If the available queue is empty, it is safe to reset MinReadyCycle.
- if (Available.empty())
- MinReadyCycle = UINT_MAX;
+static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
+ bool IsTop) {
+ DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
+ << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n');
+}
- // Check to see if any of the pending instructions are ready to issue. If
- // so, add them to the available queue.
- bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
- for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
- SUnit *SU = *(Pending.begin()+i);
- unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+namespace {
+/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
+/// the schedule.
+class GenericScheduler : public GenericSchedulerBase {
+ ScheduleDAGMILive *DAG;
- if (ReadyCycle < MinReadyCycle)
- MinReadyCycle = ReadyCycle;
+ // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top;
+ SchedBoundary Bot;
- if (!IsBuffered && ReadyCycle > CurrCycle)
- continue;
+ MachineSchedPolicy RegionPolicy;
+public:
+ GenericScheduler(const MachineSchedContext *C):
+ GenericSchedulerBase(C), DAG(0), Top(SchedBoundary::TopQID, "TopQ"),
+ Bot(SchedBoundary::BotQID, "BotQ") {}
- if (checkHazard(SU))
- continue;
+ virtual void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) LLVM_OVERRIDE;
- Available.push(SU);
- Pending.remove(Pending.begin()+i);
- --i; --e;
+ virtual bool shouldTrackPressure() const LLVM_OVERRIDE {
+ return RegionPolicy.ShouldTrackPressure;
}
- DEBUG(if (!Pending.empty()) Pending.dump());
- CheckPending = false;
-}
-/// Remove SU from the ready set for this boundary.
-void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
- if (Available.isInQueue(SU))
- Available.remove(Available.find(SU));
- else {
- assert(Pending.isInQueue(SU) && "bad ready count");
- Pending.remove(Pending.find(SU));
+ virtual void initialize(ScheduleDAGMI *dag) LLVM_OVERRIDE;
+
+ virtual SUnit *pickNode(bool &IsTopNode) LLVM_OVERRIDE;
+
+ virtual void schedNode(SUnit *SU, bool IsTopNode) LLVM_OVERRIDE;
+
+ virtual void releaseTopNode(SUnit *SU) LLVM_OVERRIDE {
+ Top.releaseTopNode(SU);
}
-}
-/// If this queue only has one ready candidate, return it. As a side effect,
-/// defer any nodes that now hit a hazard, and advance the cycle until at least
-/// one node is ready. If multiple instructions are ready, return NULL.
-SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
- if (CheckPending)
- releasePending();
+ virtual void releaseBottomNode(SUnit *SU) LLVM_OVERRIDE {
+ Bot.releaseBottomNode(SU);
+ }
- if (CurrMOps > 0) {
- // Defer any ready instrs that now have a hazard.
- for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
- if (checkHazard(*I)) {
- Pending.push(*I);
- I = Available.remove(I);
- continue;
- }
- ++I;
+ virtual void registerRoots() LLVM_OVERRIDE;
+
+protected:
+ void checkAcyclicLatency();
+
+ void tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker);
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+
+ void reschedulePhysRegCopies(SUnit *SU, bool isTop);
+};
+} // namespace
+
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
+ assert(dag->hasVRegLiveness() &&
+ "(PreRA)GenericScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ Bot.init(DAG, SchedModel, &Rem);
+
+ // Initialize resource counts.
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ const TargetMachine &TM = DAG->MF.getTarget();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ }
+ if (!Bot.HazardRec) {
+ Bot.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ }
+}
+
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ const TargetMachine &TM = Context->MF->getTarget();
+
+ // Avoid setting up the register pressure tracker for small regions to save
+ // compile time. As a rough heuristic, only track pressure when the number of
+ // schedulable instructions exceeds half the integer register file.
+ RegionPolicy.ShouldTrackPressure = true;
+ unsigned LegalIntVT = MVT::i32;
+ for (; LegalIntVT > (unsigned)MVT::i1; --LegalIntVT) {
+ if (TM.getTargetLowering()->isTypeLegal((MVT::SimpleValueType)LegalIntVT)) {
+ unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+ TM.getTargetLowering()->getRegClassFor(
+ (MVT::SimpleValueType)LegalIntVT));
+ RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
}
}
- for (unsigned i = 0; Available.empty(); ++i) {
- assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) &&
- "permanent hazard"); (void)i;
- bumpCycle(CurrCycle + 1);
- releasePending();
- }
- if (Available.size() == 1)
- return *Available.begin();
- return NULL;
-}
-#ifndef NDEBUG
-// This is useful information to dump after bumpNode.
-// Note that the Queue contents are more useful before pickNodeFromQueue.
-void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
- unsigned ResFactor;
- unsigned ResCount;
- if (ZoneCritResIdx) {
- ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
- ResCount = getResourceCount(ZoneCritResIdx);
+ // For generic targets, we default to bottom-up, because it's simpler and
+ // because more compile-time optimizations have been implemented in that
+ // direction.
+ RegionPolicy.OnlyBottomUp = true;
+
+ // Allow the subtarget to override default policy.
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
+
+ // After subtarget overrides, apply command line options.
+ if (!EnableRegPressure)
+ RegionPolicy.ShouldTrackPressure = false;
+
+ // Check whether -misched-topdown/bottomup force or unforce the scheduling
+ // direction, e.g. -misched-bottomup=false allows scheduling in both
+ // directions.
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ if (ForceBottomUp.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyBottomUp = ForceBottomUp;
+ if (RegionPolicy.OnlyBottomUp)
+ RegionPolicy.OnlyTopDown = false;
}
- else {
- ResFactor = SchedModel->getMicroOpFactor();
- ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
+ if (ForceTopDown.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyTopDown = ForceTopDown;
+ if (RegionPolicy.OnlyTopDown)
+ RegionPolicy.OnlyBottomUp = false;
}
- unsigned LFactor = SchedModel->getLatencyFactor();
- dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
- << " Retired: " << RetiredMOps;
- dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c";
- dbgs() << "\n Critical: " << ResCount / LFactor << "c, "
- << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx)
- << "\n ExpectedLatency: " << ExpectedLatency << "c\n"
- << (IsResourceLimited ? " - Resource" : " - Latency")
- << " limited.\n";
}
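// [Editor's worked example, hypothetical numbers.] Suppose the loop above
// settles on a register class with NIntRegs == 32 allocatable registers, so
// NIntRegs / 2 == 16:
//   NumRegionInstrs == 12  ->  ShouldTrackPressure == false
//   NumRegionInstrs == 40  ->  ShouldTrackPressure == true
// A subtarget override or -misched-regpressure=false can still change the
// final policy afterwards, as the code above shows.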
-#endif
-void ConvergingScheduler::SchedCandidate::
-initResourceDelta(const ScheduleDAGMI *DAG,
- const TargetSchedModel *SchedModel) {
- if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
+void GenericScheduler::checkAcyclicLatency() {
+ if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
return;
- const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
- for (TargetSchedModel::ProcResIter
- PI = SchedModel->getWriteProcResBegin(SC),
- PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
- if (PI->ProcResourceIdx == Policy.ReduceResIdx)
- ResDelta.CritResources += PI->Cycles;
- if (PI->ProcResourceIdx == Policy.DemandResIdx)
- ResDelta.DemandedResources += PI->Cycles;
- }
+ // Scaled number of cycles per loop iteration.
+ unsigned IterCount =
+ std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+ Rem.RemIssueCount);
+ // Scaled acyclic critical path.
+ unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+ // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+ unsigned InFlightCount =
+ (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+ unsigned BufferLimit =
+ SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+ Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+ DEBUG(dbgs() << "IssueCycles="
+ << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+ << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+ << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+ << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+ << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+ if (Rem.IsAcyclicLatencyLimited)
+ dbgs() << " ACYCLIC LATENCY LIMIT\n");
}
+void GenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
-/// Return true if this heuristic determines order.
-static bool tryLess(int TryVal, int CandVal,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
- if (TryVal < CandVal) {
- TryCand.Reason = Reason;
- return true;
- }
- if (TryVal > CandVal) {
- if (Cand.Reason > Reason)
- Cand.Reason = Reason;
- return true;
+ // Some roots may not feed into ExitSU. Check all of them just in case.
+ for (std::vector<SUnit*>::const_iterator
+ I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
}
- Cand.setRepeat(Reason);
- return false;
-}
+ DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
-static bool tryGreater(int TryVal, int CandVal,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
- if (TryVal > CandVal) {
- TryCand.Reason = Reason;
- return true;
- }
- if (TryVal < CandVal) {
- if (Cand.Reason > Reason)
- Cand.Reason = Reason;
- return true;
+ if (EnableCyclicPath) {
+ Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+ checkAcyclicLatency();
}
- Cand.setRepeat(Reason);
- return false;
}
static bool tryPressure(const PressureChange &TryP,
const PressureChange &CandP,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
int TryRank = TryP.getPSetOrMax();
int CandRank = CandP.getPSetOrMax();
// If both candidates affect the same set, go with the smallest increase.
return 0;
}
-static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::SchedBoundary &Zone) {
- if (Zone.isTop()) {
- if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
- if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
- TryCand, Cand, ConvergingScheduler::TopDepthReduce))
- return true;
- }
- if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
- TryCand, Cand, ConvergingScheduler::TopPathReduce))
- return true;
- }
- else {
- if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
- if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
- TryCand, Cand, ConvergingScheduler::BotHeightReduce))
- return true;
- }
- if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
- TryCand, Cand, ConvergingScheduler::BotPathReduce))
- return true;
- }
- return false;
-}
-
/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
/// \param Zone describes the scheduled zone that we are extending.
/// \param RPTracker describes reg pressure within the scheduled zone.
/// \param TempTracker is a scratch pressure tracker to reuse in queries.
-void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- RegPressureTracker &TempTracker) {
-
- // Always initialize TryCand's RPDelta.
- if (Zone.isTop()) {
- TempTracker.getMaxDownwardPressureDelta(
- TryCand.SU->getInstr(),
- TryCand.RPDelta,
- DAG->getRegionCriticalPSets(),
- DAG->getRegPressure().MaxSetPressure);
- }
- else {
- if (VerifyScheduling) {
- TempTracker.getMaxUpwardPressureDelta(
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+
+ if (DAG->isTrackingPressure()) {
+ // Always initialize TryCand's RPDelta.
+ if (Zone.isTop()) {
+ TempTracker.getMaxDownwardPressureDelta(
TryCand.SU->getInstr(),
- &DAG->getPressureDiff(TryCand.SU),
TryCand.RPDelta,
DAG->getRegionCriticalPSets(),
DAG->getRegPressure().MaxSetPressure);
}
else {
- RPTracker.getUpwardPressureDelta(
- TryCand.SU->getInstr(),
- DAG->getPressureDiff(TryCand.SU),
- TryCand.RPDelta,
- DAG->getRegionCriticalPSets(),
- DAG->getRegPressure().MaxSetPressure);
+ if (VerifyScheduling) {
+ TempTracker.getMaxUpwardPressureDelta(
+ TryCand.SU->getInstr(),
+ &DAG->getPressureDiff(TryCand.SU),
+ TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ else {
+ RPTracker.getUpwardPressureDelta(
+ TryCand.SU->getInstr(),
+ DAG->getPressureDiff(TryCand.SU),
+ TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
}
}
+ DEBUG(if (TryCand.RPDelta.Excess.isValid())
+ dbgs() << " SU(" << TryCand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
+ << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
// Initialize the candidate if needed.
if (!Cand.isValid()) {
// Avoid exceeding the target's limit. If signed PSetID is negative, it is
// invalid; convert it to INT_MAX to give it lowest priority.
- if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
- RegExcess))
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+ Cand.RPDelta.Excess,
+ TryCand, Cand, RegExcess))
+ return;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+ Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical))
return;
// For loops that are acyclic path limited, aggressively schedule for latency.
- if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))
+ // This can result in very long dependence chains scheduled in sequence, so
+ // once every cycle (when CurrMOps == 0), switch to normal heuristics.
+ if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps()
+ && tryLatency(TryCand, Cand, Zone))
return;
- // Avoid increasing the max critical pressure in the scheduled region.
- if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
- TryCand, Cand, RegCritical))
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
+ Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return;
// Keep clustered nodes together to encourage downstream peephole
return;
}
// Avoid increasing the max pressure of the entire region.
- if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax,
- TryCand, Cand, RegMax))
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+ Cand.RPDelta.CurrentMax,
+ TryCand, Cand, RegMax))
return;
// Avoid critical resource consumption and balance the schedule.
// Prefer immediate defs/users of the last scheduled instruction. This is a
// local pressure avoidance strategy that also makes the machine code
// readable.
- if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
+ if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU),
TryCand, Cand, NextDefUse))
return;
}
}
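// [Editor's note.] The cascade above tries heuristics roughly in CandReason
// priority order; per getReasonStr and the tryLess/tryGreater helpers, a
// lower enum value is a stronger reason:
//   NoCand < PhysRegCopy < RegExcess < RegCritical < Stall < Cluster < Weak
//     < RegMax < ResourceReduce < ResourceDemand < TopDepthReduce
//     < TopPathReduce < BotHeightReduce < BotPathReduce < NextDefUse
//     < NodeOrder
// so a candidate picked for, say, RegExcess is never displaced by a mere
// NodeOrder preference.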
-#ifndef NDEBUG
-const char *ConvergingScheduler::getReasonStr(
- ConvergingScheduler::CandReason Reason) {
- switch (Reason) {
- case NoCand: return "NOCAND ";
- case PhysRegCopy: return "PREG-COPY";
- case RegExcess: return "REG-EXCESS";
- case RegCritical: return "REG-CRIT ";
- case Cluster: return "CLUSTER ";
- case Weak: return "WEAK ";
- case RegMax: return "REG-MAX ";
- case ResourceReduce: return "RES-REDUCE";
- case ResourceDemand: return "RES-DEMAND";
- case TopDepthReduce: return "TOP-DEPTH ";
- case TopPathReduce: return "TOP-PATH ";
- case BotHeightReduce:return "BOT-HEIGHT";
- case BotPathReduce: return "BOT-PATH ";
- case NextDefUse: return "DEF-USE ";
- case NodeOrder: return "ORDER ";
- };
- llvm_unreachable("Unknown reason!");
-}
-
-void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
- PressureChange P;
- unsigned ResIdx = 0;
- unsigned Latency = 0;
- switch (Cand.Reason) {
- default:
- break;
- case RegExcess:
- P = Cand.RPDelta.Excess;
- break;
- case RegCritical:
- P = Cand.RPDelta.CriticalMax;
- break;
- case RegMax:
- P = Cand.RPDelta.CurrentMax;
- break;
- case ResourceReduce:
- ResIdx = Cand.Policy.ReduceResIdx;
- break;
- case ResourceDemand:
- ResIdx = Cand.Policy.DemandResIdx;
- break;
- case TopDepthReduce:
- Latency = Cand.SU->getDepth();
- break;
- case TopPathReduce:
- Latency = Cand.SU->getHeight();
- break;
- case BotHeightReduce:
- Latency = Cand.SU->getHeight();
- break;
- case BotPathReduce:
- Latency = Cand.SU->getDepth();
- break;
- }
- dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
- if (P.isValid())
- dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
- << ":" << P.getUnitInc() << " ";
- else
- dbgs() << " ";
- if (ResIdx)
- dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
- else
- dbgs() << " ";
- if (Latency)
- dbgs() << " " << Latency << " cycles ";
- else
- dbgs() << " ";
- dbgs() << '\n';
-}
-#endif
-
-/// Pick the best candidate from the top queue.
+/// Pick the best candidate from the queue.
///
/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
-void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- SchedCandidate &Cand) {
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
ReadyQueue &Q = Zone.Available;
DEBUG(Q.dump());
}
}
-static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
- bool IsTop) {
- DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
- << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
-}
-
/// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
CandPolicy NoPolicy;
SchedCandidate BotCand(NoPolicy);
SchedCandidate TopCand(NoPolicy);
- Bot.setPolicy(BotCand.Policy, Top);
- Top.setPolicy(TopCand.Policy, Bot);
+ // Set the bottom-up policy based on the state of the current bottom zone and
+ // the instructions outside the zone, including the top zone.
+ setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot);
// Prefer bottom scheduling when heuristics are silent.
pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
}
/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
-SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
}
SUnit *SU;
do {
- if (ForceTopDown) {
+ if (RegionPolicy.OnlyTopDown) {
SU = Top.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
SchedCandidate TopCand(NoPolicy);
pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
- assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand, true);
SU = TopCand.SU;
}
IsTopNode = true;
}
- else if (ForceBottomUp) {
+ else if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
SchedCandidate BotCand(NoPolicy);
pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
- assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand, false);
SU = BotCand.SU;
}
IsTopNode = false;
return SU;
}
-void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
MachineBasicBlock::iterator InsertPos = SU->getInstr();
if (!isTop)
}
/// Update the scheduler's state after scheduling a node. This is the same node
-/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
-/// it's state based on the current cycle before MachineSchedStrategy does.
+/// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
+/// update its state based on the current cycle before MachineSchedStrategy
+/// does.
///
/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
/// them here. See comments in biasPhysRegCopy.
-void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
- SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysRegCopies(SU, true);
}
else {
- SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle);
+ SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
reschedulePhysRegCopies(SU, false);
/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
- assert((!ForceTopDown || !ForceBottomUp) &&
- "-misched-topdown incompatible with -misched-bottomup");
- ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, new GenericScheduler(C));
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
// data and pass it to later mutations. Have a single mutation that gathers
// the interesting nodes in one pass.
DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
- if (EnableLoadCluster)
+ if (EnableLoadCluster && DAG->TII->enableClusterLoads())
DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
if (EnableMacroFusion)
DAG->addMutation(new MacroFusion(DAG->TII));
return DAG;
}
+
static MachineSchedRegistry
-ConvergingSchedRegistry("converge", "Standard converging scheduler.",
- createConvergingSched);
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+ createGenericSchedLive);
+
+//===----------------------------------------------------------------------===//
+// PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PostGenericScheduler - Interface to the scheduling algorithm used by
+/// ScheduleDAGMI.
+///
+/// Callbacks from ScheduleDAGMI:
+/// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ...
+class PostGenericScheduler : public GenericSchedulerBase {
+ ScheduleDAGMI *DAG;
+ SchedBoundary Top;
+ SmallVector<SUnit*, 8> BotRoots;
+public:
+ PostGenericScheduler(const MachineSchedContext *C):
+ GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {}
+
+ virtual ~PostGenericScheduler() {}
+
+ virtual void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) LLVM_OVERRIDE {
+ /* no configurable policy */
+ }
+
+ /// PostRA scheduling does not track pressure.
+ virtual bool shouldTrackPressure() const LLVM_OVERRIDE { return false; }
+
+ virtual void initialize(ScheduleDAGMI *Dag) LLVM_OVERRIDE {
+ DAG = Dag;
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ BotRoots.clear();
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
+ // or are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ const TargetMachine &TM = DAG->MF.getTarget();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ }
+ }
+
+ virtual void registerRoots() LLVM_OVERRIDE;
+
+ virtual SUnit *pickNode(bool &IsTopNode) LLVM_OVERRIDE;
+
+ virtual void scheduleTree(unsigned SubtreeID) LLVM_OVERRIDE {
+ llvm_unreachable("PostRA scheduler does not support subtree analysis.");
+ }
+
+ virtual void schedNode(SUnit *SU, bool IsTopNode) LLVM_OVERRIDE;
+
+ virtual void releaseTopNode(SUnit *SU) LLVM_OVERRIDE {
+ Top.releaseTopNode(SU);
+ }
+
+ // Only called for roots.
+ virtual void releaseBottomNode(SUnit *SU) LLVM_OVERRIDE {
+ BotRoots.push_back(SU);
+ }
+
+protected:
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
+
+ void pickNodeFromQueue(SchedCandidate &Cand);
+};
+} // namespace
+
+void PostGenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+ // Some roots may not feed into ExitSU. Check all of them just in case.
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
+ }
+ DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+}
+
+/// Apply a set of heuristics to a new candidate for PostRA scheduling.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+ Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+
+ // Avoid critical resource consumption and balance the schedule.
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
+ return;
+ }
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ TryCand.Reason = NodeOrder;
+}
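// [Editor's note.] The PostRA cascade above is deliberately short: with no
// register pressure to track, ties resolve as
//   Stall -> ResourceReduce -> ResourceDemand -> latency (when the policy
//   requests it) -> NodeOrder (original instruction order),
// mirroring the tail of the pre-RA GenericScheduler::tryCandidate.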
+
+void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
+ ReadyQueue &Q = Top.Available;
+
+ DEBUG(Q.dump());
+
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ SchedCandidate TryCand(Cand.Policy);
+ TryCand.SU = *I;
+ TryCand.initResourceDelta(DAG, SchedModel);
+ tryCandidate(Cand, TryCand);
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
+ DEBUG(traceCandidate(Cand));
+ }
+ }
+}
+
+/// Pick the next node to schedule.
+SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
+ return NULL;
+ }
+ SUnit *SU;
+ do {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, NULL);
+ pickNodeFromQueue(TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand, true);
+ SU = TopCand.SU;
+ }
+ } while (SU->isScheduled);
+
+ IsTopNode = true;
+ Top.removeReady(SU);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ return SU;
+}
+
+/// Called after ScheduleDAGMI has scheduled an instruction and updated
+/// scheduled/remaining flags in the DAG nodes.
+void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+}
+
+/// Create a generic scheduler with no vreg liveness or DAG mutation passes.
+static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, new PostGenericScheduler(C), /*IsPostRA=*/true);
+}
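// [Editor's sketch, not part of this patch.] The callback protocol noted for
// PostGenericScheduler (initPolicy -> initialize -> registerRoots ->
// pickNode ...) is the generic MachineSchedStrategy contract. Below is a
// minimal, hypothetical strategy that picks nodes in original source order,
// plus its -misched=<name> registration in the style used throughout this
// file; SourceOrderScheduler and "source-order" are invented names.
namespace {
class SourceOrderScheduler : public MachineSchedStrategy {
  std::vector<SUnit*> ReadyQ; // top-ready SUnits, unordered
public:
  virtual void initialize(ScheduleDAGMI*) { ReadyQ.clear(); }
  virtual SUnit *pickNode(bool &IsTopNode) {
    if (ReadyQ.empty()) return NULL;
    std::vector<SUnit*>::iterator Best = ReadyQ.begin();
    for (std::vector<SUnit*>::iterator I = Best, E = ReadyQ.end(); I != E; ++I)
      if ((*I)->NodeNum < (*Best)->NodeNum) // lowest NodeNum = source order
        Best = I;
    SUnit *SU = *Best;
    ReadyQ.erase(Best);
    IsTopNode = true; // always extend the top zone
    return SU;
  }
  virtual void schedNode(SUnit*, bool) {}
  virtual void releaseTopNode(SUnit *SU) { ReadyQ.push_back(SU); }
  virtual void releaseBottomNode(SUnit*) {} // bottom boundary unused here
};
} // namespace

static ScheduleDAGInstrs *createSourceOrderSched(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, new SourceOrderScheduler());
}
static MachineSchedRegistry SourceOrderRegistry(
  "source-order", "Pick nodes in source order (illustrative only).",
  createSourceOrderSched);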
//===----------------------------------------------------------------------===//
// ILP Scheduler. Currently for experimental analysis of heuristics.
/// \brief Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy {
- /// In case all subtrees are eventually connected to a common root through
- /// data dependence (e.g. reduction), place an upper limit on their size.
- ///
- /// FIXME: A subtree limit is generally good, but in the situation commented
- /// above, where multiple similar subtrees feed a common root, we should
- /// only split at a point where the resulting subtrees will be balanced.
- /// (a motivating test case must be found).
- static const unsigned SubtreeLimit = 16;
-
- ScheduleDAGMI *DAG;
+ ScheduleDAGMILive *DAG;
ILPOrder Cmp;
std::vector<SUnit*> ReadyQ;
ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {}
virtual void initialize(ScheduleDAGMI *dag) {
- DAG = dag;
+ assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
DAG->computeDFSResult();
Cmp.DFSResult = DAG->getDFSResult();
Cmp.ScheduledTrees = &DAG->getScheduledTrees();
} // namespace
static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, new ILPScheduler(true));
+ return new ScheduleDAGMILive(C, new ILPScheduler(true));
}
static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, new ILPScheduler(false));
+ return new ScheduleDAGMILive(C, new ILPScheduler(false));
}
static MachineSchedRegistry ILPMaxRegistry(
"ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
InstructionShuffler(bool alternate, bool topdown)
: IsAlternating(alternate), IsTopDown(topdown) {}
- virtual void initialize(ScheduleDAGMI *) {
+ virtual void initialize(ScheduleDAGMI*) {
TopQ.clear();
BottomQ.clear();
}
bool TopDown = !ForceBottomUp;
assert((TopDown || !ForceTopDown) &&
"-misched-topdown incompatible with -misched-bottomup");
- return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown));
+ return new ScheduleDAGMILive(C, new InstructionShuffler(Alternate, TopDown));
}
static MachineSchedRegistry ShufflerRegistry(
"shuffle", "Shuffle machine instructions alternating directions",
#endif // !NDEBUG
//===----------------------------------------------------------------------===//
-// GraphWriter support for ScheduleDAGMI.
+// GraphWriter support for ScheduleDAGMILive.
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
}
static bool isNodeHidden(const SUnit *Node) {
- return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
}
static bool hasNodeAddressLabel(const SUnit *Node,
static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
std::string Str;
raw_string_ostream SS(Str);
- SS << "SU(" << SU->NodeNum << ')';
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : 0;
+ SS << "SU:" << SU->NodeNum;
+ if (DFS)
+ SS << " I:" << DFS->getNumInstrs(SU);
return SS.str();
}
static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
return G->getGraphNodeLabel(SU);
}
- static std::string getNodeAttributes(const SUnit *N,
- const ScheduleDAG *Graph) {
+ static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
std::string Str("shape=Mrecord");
- const SchedDFSResult *DFS =
- static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult();
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : 0;
if (DFS) {
Str += ",style=filled,fillcolor=\"#";
Str += DOT::getColorString(DFS->getSubtreeID(N));