//
/// \file
/// \brief R600 Machine Scheduler interface
-// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "misched"
-
#include "R600MachineScheduler.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+#define DEBUG_TYPE "misched"
- DAG = dag;
// Initializes the strategy's state from the scheduling DAG it is handed:
// caches the DAG and the target info objects it carries, records whether the
// subtarget is VLIW5, and resets the clause bookkeeping counters and limits.
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
// The DAG is downcast to ScheduleDAGMILive just below, so insist on vreg
// liveness before performing the cast.
+ assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
// VLIW5 is set for every subtarget that does not report the Cayman ISA.
+ VLIW5 = !ST.hasCaymanISA();
MRI = &DAG->MRI;
CurInstKind = IDOther;
CurEmitted = 0;
// 31 sets the low five bits of the slot mask; elsewhere in this file bits 0-3
// are tested per channel and bit 4 (value 16) for the Trans slot.
OccupedSlotsMask = 31;
// Per-kind instruction limits: ALU comes from the instruction info, fetch from
// the subtarget, and every other kind uses a fixed 32.
InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
InstKindLimit[IDOther] = 32;
-
- const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
AluInstCount = 0;
FetchInstCount = 0;
}
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
- SUnit *SU = 0;
+ SUnit *SU = nullptr;
NextInstKind = IDOther;
IsTopNode = false;
// OpenCL Programming Guide :
// The approx. number of WF that allows TEX inst to hide ALU inst is :
// 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
- float ALUFetchRationEstimate =
+ float ALUFetchRationEstimate =
(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
(FetchInstCount + Available[IDFetch].size());
- unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
- DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
- // We assume the local GPR requirements to be "dominated" by the requirement
- // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
- // after TEX are indeed likely to consume or generate values from/for the
- // TEX clause.
- // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
- // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
- // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
- // (TODO : use RegisterPressure)
- // If we are going too use too many GPR, we flush Fetch instruction to lower
- // register pressure on 128 bits regs.
- unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
- if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ if (ALUFetchRationEstimate == 0) {
AllowSwitchFromAlu = true;
- }
-
-
- // We want to scheduled AR defs as soon as possible to make sure they aren't
- // put in a different ALU clause from their uses.
- if (!SU && !UnscheduledARDefs.empty()) {
- SU = UnscheduledARDefs[0];
- UnscheduledARDefs.erase(UnscheduledARDefs.begin());
- NextInstKind = IDAlu;
+ } else {
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going to use too many GPRs, we flush Fetch instructions to lower
+ // register pressure on 128-bit regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
}
if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
NextInstKind = IDOther;
}
- // We want to schedule the AR uses as late as possible to make sure that
- // the AR defs have been released.
- if (!SU && !UnscheduledARUses.empty()) {
- SU = UnscheduledARUses[0];
- UnscheduledARUses.erase(UnscheduledARUses.begin());
- NextInstKind = IDAlu;
- }
-
-
DEBUG(
if (SU) {
dbgs() << " ** Pick node **\n";
int IK = getInstKind(SU);
- // Check for AR register defines
- for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
- E = SU->getInstr()->operands_end();
- I != E; ++I) {
- if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
- if (I->isDef()) {
- UnscheduledARDefs.push_back(SU);
- } else {
- UnscheduledARUses.push_back(SU);
- }
- return;
- }
- }
-
// There is no export clause, we can schedule one as soon as its ready
if (IK == IDOther)
Available[IDOther].push_back(SU);
if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
return AluT_XYZW;
+ // LDS src registers cannot be used in the Trans slot.
+ if (TII->readsLDSSrcReg(MI))
+ return AluT_XYZW;
+
return AluAny;
}
}
}
-SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
if (Q.empty())
- return NULL;
+ return nullptr;
for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
- if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
+ if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
+ && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
+ ) {
InstructionsGroupCandidate.pop_back();
Q.erase((It + 1).base());
return SU;
InstructionsGroupCandidate.pop_back();
}
}
- return NULL;
+ return nullptr;
}
void R600SchedStrategy::LoadAlu() {
DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
+// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
}
}
-SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
- SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+ SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
if (SlotedSU)
return SlotedSU;
- SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+ SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
if (UnslotedSU)
AssignSlot(UnslotedSU->getInstr(), Slot);
return UnslotedSU;
// Bottom up scheduling : predX must come first
if (!AvailableAlus[AluPredX].empty()) {
OccupedSlotsMask |= 31;
- return PopInst(AvailableAlus[AluPredX]);
+ return PopInst(AvailableAlus[AluPredX], false);
}
// Flush physical reg copies (RA will discard them)
if (!AvailableAlus[AluDiscarded].empty()) {
OccupedSlotsMask |= 31;
- return PopInst(AvailableAlus[AluDiscarded]);
+ return PopInst(AvailableAlus[AluDiscarded], false);
}
// If there is a T_XYZW alu available, use it
if (!AvailableAlus[AluT_XYZW].empty()) {
OccupedSlotsMask |= 15;
- return PopInst(AvailableAlus[AluT_XYZW]);
+ return PopInst(AvailableAlus[AluT_XYZW], false);
}
}
bool TransSlotOccuped = OccupedSlotsMask & 16;
- if (!TransSlotOccuped) {
+ if (!TransSlotOccuped && VLIW5) {
if (!AvailableAlus[AluTrans].empty()) {
OccupedSlotsMask |= 16;
- return PopInst(AvailableAlus[AluTrans]);
+ return PopInst(AvailableAlus[AluTrans], false);
+ }
+ SUnit *SU = AttemptFillSlot(3, true);
+ if (SU) {
+ OccupedSlotsMask |= 16;
+ return SU;
}
}
for (int Chan = 3; Chan > -1; --Chan) {
bool isOccupied = OccupedSlotsMask & (1 << Chan);
if (!isOccupied) {
- SUnit *SU = AttemptFillSlot(Chan);
+ SUnit *SU = AttemptFillSlot(Chan, false);
if (SU) {
OccupedSlotsMask |= (1 << Chan);
InstructionsGroupCandidate.push_back(SU->getInstr());
}
PrepareNextSlot();
}
- return NULL;
+ return nullptr;
}
SUnit* R600SchedStrategy::pickOther(int QID) {
- SUnit *SU = 0;
+ SUnit *SU = nullptr;
std::vector<SUnit *> &AQ = Available[QID];
if (AQ.empty()) {
}
return SU;
}
-