X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FR600ControlFlowFinalizer.cpp;h=8e7bc1079b7bfdc1cd7b6a88f478e9a9e1c6d5be;hb=9f85dccfc64b5f0b0c63ddfa0a42d8615aa1fcb3;hp=3d448bf3d3bee1871410a44fd86d3f8b75f42088;hpb=b5632b5b456db647b42239cbd4d8b58c82290c4e;p=oota-llvm.git

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 3d448bf3d3b..8e7bc1079b7 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -12,9 +12,9 @@
 /// computing their address on the fly; it also sets STACK_SIZE info.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "r600cf"
 #include "llvm/Support/Debug.h"
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
@@ -26,8 +26,176 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "r600cf"
+
 namespace {
 
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const AMDGPUSubtarget &ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST.hasCFAluBug())
+    return false;
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST.getWavefrontSize() == 64) {
+      // We are being conservative here.  We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct.
+      // Applying this work-around when CurrentSubEntries > 3 allows us to
+      // over-allocate stack resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST.getWavefrontSize() == 32);
+      // We are being conservative here.  We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
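
// Illustrative sketch (editorial, not part of the patch): the wave64 branch
// of requiresWorkAroundForInst() above deliberately over-approximates the
// exact trigger quoted in its comment.  Standalone C++; the function names
// are made up for illustration.
#include <cassert>

static bool exactWave64Trigger(unsigned CurrentSubEntries) {
  // The precise condition from the comment in the function above.
  return CurrentSubEntries > 3 &&
         (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0);
}

static bool conservativeWave64Trigger(unsigned CurrentSubEntries) {
  // What the pass actually checks.
  return CurrentSubEntries > 3;
}

int main() {
  // A safe over-approximation: the conservative test must fire at least
  // whenever the exact one does.
  for (unsigned N = 0; N < 64; ++N)
    assert(!exactWave64Trigger(N) || conservativeWave64Trigger(N));
  return 0;
}
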
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch(Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+    assert(!ST.hasCaymanISA());
+    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+      // +1 For the push operation.
+      // +2 Extra space required.
+      return 3;
+    } else {
+      // Some documentation says that this is not necessary on Evergreen,
+      // but experimentation has shown that we need to allocate 1 extra
+      // sub-entry for the first non-WQM push.
+      // +1 For the push operation.
+      // +1 Extra space required.
+      return 2;
+    }
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  }
+}
+
+void CFStack::updateMaxStackSize() {
+  unsigned CurrentStackSize = CurrentEntries +
+                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  CFStack::StackItem Item = CFStack::ENTRY;
+  switch(Opcode) {
+  case AMDGPU::CF_PUSH_EG:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    if (!isWQM) {
+      if (!ST.hasCaymanISA() &&
+          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
+                                             // Evergreen/NI; see comment in
+                                             // CFStack::getSubEntrySize().
+      else if (CurrentEntries > 0 &&
+               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+               !ST.hasCaymanISA() &&
+               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+      else
+        Item = CFStack::SUB_ENTRY;
+    } else
+      Item = CFStack::ENTRY;
+    break;
+  }
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    CurrentEntries++;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+  LoopStack.push_back(CFStack::ENTRY);
+  CurrentEntries++;
+  updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+  CFStack::StackItem Top = BranchStack.back();
+  if (Top == CFStack::ENTRY)
+    CurrentEntries--;
+  else
+    CurrentSubEntries -= getSubEntrySize(Top);
+  BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+  CurrentEntries--;
+  LoopStack.pop_back();
+}
+
 class R600ControlFlowFinalizer : public MachineFunctionPass {
 
 private:
@@ -65,7 +233,7 @@ private:
 
   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
     unsigned Opcode = 0;
-    bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
+    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
     switch (CFI) {
     case CF_TC:
       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -98,7 +266,7 @@ private:
       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
       break;
     case CF_END:
-      if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
+      if (ST.hasCaymanISA()) {
         Opcode = AMDGPU::CF_END_CM;
         break;
       }
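
// Illustrative sketch (editorial, not part of the patch): the arithmetic in
// CFStack::updateMaxStackSize() above.  Four sub-entries share one full
// hardware stack entry, so RoundUpToAlignment(CurrentSubEntries, 4) / 4 is
// just ceil(CurrentSubEntries / 4).  Standalone C++; stackSize is a made-up
// name.
#include <cassert>

static unsigned stackSize(unsigned Entries, unsigned SubEntries) {
  return Entries + (SubEntries + 3) / 4; // ceil(SubEntries / 4)
}

int main() {
  assert(stackSize(2, 0) == 2); // full entries only
  assert(stackSize(2, 1) == 3); // a single sub-entry still costs an entry
  assert(stackSize(2, 4) == 3); // four sub-entries pack into that entry
  assert(stackSize(2, 5) == 4); // a fifth sub-entry spills into a new one
  return 0;
}
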
@@ -110,7 +278,7 @@ private:
   }
 
   bool isCompatibleWithClause(const MachineInstr *MI,
-      std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
+      std::set<unsigned> &DstRegs) const {
     unsigned DstMI, SrcMI;
     for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
@@ -136,9 +304,7 @@ private:
             &AMDGPU::R600_Reg128RegClass);
       }
     }
-    if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
-        (SrcRegs.find(DstMI) == SrcRegs.end())) {
-      SrcRegs.insert(SrcMI);
+    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
       DstRegs.insert(DstMI);
       return true;
     } else
@@ -152,7 +318,7 @@ private:
     std::vector<MachineInstr *> ClauseContent;
     unsigned AluInstCount = 0;
     bool IsTex = TII->usesTextureCache(ClauseHead);
-    std::set<unsigned> DstRegs, SrcRegs;
+    std::set<unsigned> DstRegs;
     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
       if (IsTrivialInst(I))
         continue;
@@ -161,7 +327,7 @@ private:
       if ((IsTex && !TII->usesTextureCache(I)) ||
           (!IsTex && !TII->usesVertexCache(I)))
         break;
-      if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
+      if (!isCompatibleWithClause(I, DstRegs))
         break;
       AluInstCount ++;
       ClauseContent.push_back(I);
@@ -174,7 +340,7 @@ private:
   }
 
   void getLiteral(MachineInstr *MI, std::vector<MachineOperand *> &Lits) const {
-    unsigned LiteralRegs[] = {
+    static const unsigned LiteralRegs[] = {
       AMDGPU::ALU_LITERAL_X,
       AMDGPU::ALU_LITERAL_Y,
       AMDGPU::ALU_LITERAL_Z,
@@ -258,6 +424,7 @@ private:
         ClauseContent.push_back(MILit);
       }
     }
+    assert(ClauseContent.size() < 128 && "ALU clause is too big");
     ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
     return ClauseFile(ClauseHead, ClauseContent);
   }
@@ -278,6 +445,7 @@ private:
 
   void EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
       unsigned &CfCount) {
+    Clause.first->getOperand(0).setImm(0);
     CounterPropagateAddr(Clause.first, CfCount);
     MachineBasicBlock *BB = Clause.first->getParent();
     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
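
// Illustrative sketch (editorial, not part of the patch): MakeALUClause()
// stores ClauseContent.size() - 1 in the clause header (a count-minus-one
// encoding), and the new assert keeps the size below 128 so the encoded
// value stays within 0..126.  Standalone C++; encodeClauseCount is a
// made-up name.
#include <cassert>

static unsigned encodeClauseCount(unsigned NumInsts) {
  assert(NumInsts < 128 && "ALU clause is too big"); // mirrors the new assert
  assert(NumInsts >= 1);     // size() - 1 would underflow on an empty clause
  return NumInsts - 1;       // what getOperand(7).setImm(...) receives
}

int main() {
  assert(encodeClauseCount(1) == 0);
  assert(encodeClauseCount(127) == 126);
  return 0;
}
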
@@ -300,82 +468,76 @@ private:
     }
   }
 
-  unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
-    switch (ST.device()->getGeneration()) {
-    case AMDGPUDeviceInfo::HD4XXX:
-      if (hasPush)
-        StackSubEntry += 2;
-      break;
-    case AMDGPUDeviceInfo::HD5XXX:
-      if (hasPush)
-        StackSubEntry ++;
-    case AMDGPUDeviceInfo::HD6XXX:
-      StackSubEntry += 2;
-      break;
-    }
-    return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
-  }
-
 public:
   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (0), TRI(0),
+    TII (nullptr), TRI(nullptr),
     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
       MaxFetchInst = ST.getTexVTXClauseSize();
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) {
-    TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
-    TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(
+        MF.getTarget().getSubtargetImpl()->getInstrInfo());
+    TRI = static_cast<const R600RegisterInfo *>(
+        MF.getTarget().getSubtargetImpl()->getRegisterInfo());
+
     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-    unsigned MaxStack = 0;
-    unsigned CurrentStack = 0;
-    bool HasPush = false;
+    CFStack CFStack(ST, MFI->getShaderType());
     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
       MachineBasicBlock &MBB = *MB;
       unsigned CfCount = 0;
       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
       std::vector<MachineInstr *> IfThenElseStack;
-      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
-      if (MFI->ShaderType == 1) {
+      if (MFI->getShaderType() == ShaderType::VERTEX) {
         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
             getHWInstrDesc(CF_CALL_FS));
         CfCount++;
-        MaxStack = 1;
       }
       std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
           DEBUG(dbgs() << CfCount << ":"; I->dump(););
           FetchClauses.push_back(MakeFetchClause(MBB, I));
           CfCount++;
+          LastAlu.back() = nullptr;
           continue;
         }
 
         MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = nullptr;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = MI;
         I++;
+        bool RequiresWorkAround =
+            CFStack.requiresWorkAroundForInst(MI->getOpcode());
         switch (MI->getOpcode()) {
         case AMDGPU::CF_ALU_PUSH_BEFORE:
-          CurrentStack++;
-          MaxStack = std::max(MaxStack, CurrentStack);
-          HasPush = true;
+          if (RequiresWorkAround) {
+            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            MI->setDesc(TII->get(AMDGPU::CF_ALU));
+            CfCount++;
+            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+          } else
+            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
         case AMDGPU::CF_ALU:
           I = MI;
           AluClauses.push_back(MakeALUClause(MBB, I));
-        case AMDGPU::EG_ExportBuf:
-        case AMDGPU::EG_ExportSwz:
-        case AMDGPU::R600_ExportBuf:
-        case AMDGPU::R600_ExportSwz:
-        case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
-        case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
           CfCount++;
           break;
         case AMDGPU::WHILELOOP: {
-          CurrentStack+=4;
-          MaxStack = std::max(MaxStack, CurrentStack);
+          CFStack.pushLoop();
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_WHILE_LOOP))
               .addImm(1);
@@ -388,7 +550,7 @@ public:
           break;
         }
         case AMDGPU::ENDLOOP: {
-          CurrentStack-=4;
+          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
@@ -400,6 +562,7 @@ public:
           break;
         }
         case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(nullptr);
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_JUMP))
               .addImm(0)
@@ -417,7 +580,7 @@ public:
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_ELSE))
               .addImm(0)
-              .addImm(1);
+              .addImm(0);
           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
           IfThenElseStack.push_back(MIb);
           MI->eraseFromParent();
@@ -425,32 +588,32 @@ public:
           break;
         }
         case AMDGPU::ENDIF: {
-          CurrentStack--;
+          CFStack.popBranch();
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
+            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+                getHWInstrDesc(CF_POP))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            (void)MIb;
+            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+            CfCount++;
+          }
+
           MachineInstr *IfOrElseInst = IfThenElseStack.back();
           IfThenElseStack.pop_back();
-          CounterPropagateAddr(IfOrElseInst, CfCount + 1);
-          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
-              getHWInstrDesc(CF_POP))
-              .addImm(CfCount + 1)
-              .addImm(1);
-          (void)MIb;
-          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          CounterPropagateAddr(IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
           MI->eraseFromParent();
-          CfCount++;
           break;
         }
-        case AMDGPU::PREDICATED_BREAK: {
-          CurrentStack--;
-          CfCount += 3;
-          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
-              .addImm(CfCount)
-              .addImm(1);
+        case AMDGPU::BREAK: {
+          CfCount ++;
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_LOOP_BREAK))
               .addImm(0);
-          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
-              .addImm(CfCount)
-              .addImm(1);
           LoopStack.back().second.insert(MIb);
           MI->eraseFromParent();
           break;
         }
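
// Illustrative sketch (editorial, not part of the patch): the new ENDIF
// lowering above.  If the instruction preceding the ENDIF is an ALU clause
// (LastAlu.back() != nullptr), the pop is deferred and later folded into
// that clause as CF_ALU_POP_AFTER; otherwise a standalone CF_POP spends one
// control-flow slot.  Standalone C++ with made-up types.
#include <vector>

struct Inst { bool PopAfter; };

static unsigned lowerEndIf(Inst *LastAlu, std::vector<Inst *> &ToPopAfter,
                           unsigned CfCount) {
  if (LastAlu) {
    ToPopAfter.push_back(LastAlu); // fold the pop into the clause afterwards
    return CfCount;                // no control-flow slot used now
  }
  return CfCount + 1;              // standalone CF_POP takes a slot
}

int main() {
  std::vector<Inst *> ToPopAfter;
  Inst Alu = { false };
  unsigned CfCount = lowerEndIf(&Alu, ToPopAfter, 10); // folded: still 10
  CfCount = lowerEndIf(nullptr, ToPopAfter, CfCount);  // standalone: 11
  for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i)
    ToPopAfter[i]->PopAfter = true; // CF_ALU becomes CF_ALU_POP_AFTER
  return (CfCount == 11 && Alu.PopAfter) ? 0 : 1;
}
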
@@ -478,16 +641,35 @@ public:
             EmitALUClause(I, AluClauses[i], CfCount);
           }
         default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
           break;
         }
       }
-      MFI->StackSize = getHWStackSize(MaxStack, HasPush);
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
+      MFI->StackSize = CFStack.MaxStackSize;
     }
 
     return false;
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
 };
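
// Illustrative sketch (editorial, not part of the patch): the accounting
// that replaces getHWStackSize().  A loop push takes a whole entry, an IF
// push takes sub-entries (plain SUB_ENTRY size 1 assumed here; the first
// non-WQM push can cost 2-3, see CFStack::getSubEntrySize()), and the
// high-water mark is what ends up in MFI->StackSize.  Standalone C++.
#include <algorithm>
#include <cassert>

int main() {
  unsigned Entries = 0, SubEntries = 0, MaxSize = 0;
  auto Update = [&] {
    // Same rounding as CFStack::updateMaxStackSize().
    MaxSize = std::max(MaxSize, Entries + (SubEntries + 3) / 4);
  };
  Entries++;    Update(); // WHILELOOP -> pushLoop()
  SubEntries++; Update(); // IF        -> pushBranch(), one sub-entry
  SubEntries++; Update(); // nested IF -> pushBranch()
  SubEntries--;           // ENDIF     -> popBranch()
  SubEntries--;           // ENDIF
  Entries--;              // ENDLOOP   -> popLoop()
  assert(MaxSize == 2);   // 1 loop entry + ceil(2 / 4) = 2
  return 0;
}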