1 //===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
/// \file
/// This pass turns all control-flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
13 //===----------------------------------------------------------------------===//
15 #define DEBUG_TYPE "r600cf"
16 #include "llvm/Support/Debug.h"
18 #include "R600Defines.h"
19 #include "R600InstrInfo.h"
20 #include "R600MachineFunctionInfo.h"
21 #include "R600RegisterInfo.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/Support/raw_ostream.h"
36 FIRST_NON_WQM_PUSH = 2,
37 FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
40 const AMDGPUSubtarget &ST;
41 std::vector<StackItem> BranchStack;
42 std::vector<StackItem> LoopStack;
43 unsigned MaxStackSize;
44 unsigned CurrentEntries;
45 unsigned CurrentSubEntries;
  // Tracks hardware control-flow stack usage while walking a function, so
  // that the required STACK_SIZE can be reported in R600MachineFunctionInfo.
  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }
52 unsigned getLoopDepth();
53 bool branchStackContains(CFStack::StackItem);
54 bool requiresWorkAroundForInst(unsigned Opcode);
55 unsigned getSubEntrySize(CFStack::StackItem Item);
56 void updateMaxStackSize();
57 void pushBranch(unsigned Opcode, bool isWQM = false);
63 unsigned CFStack::getLoopDepth() {
64 return LoopStack.size();
67 bool CFStack::branchStackContains(CFStack::StackItem Item) {
68 for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
69 E = BranchStack.end(); I != E; ++I) {
76 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
80 case CFStack::FIRST_NON_WQM_PUSH:
81 assert(!ST.hasCaymanISA());
82 if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
83 // +1 For the push operation.
84 // +2 Extra space required.
87 // Some documentation says that this is not necessary on Evergreen,
88 // but experimentation has show that we need to allocate 1 extra
89 // sub-entry for the first non-WQM push.
90 // +1 For the push operation.
91 // +1 Extra space required.
94 case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
95 assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
96 // +1 For the push operation.
97 // +1 Extra space required.
99 case CFStack::SUB_ENTRY:
104 void CFStack::updateMaxStackSize() {
105 unsigned CurrentStackSize = CurrentEntries +
106 (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
107 MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
110 void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
111 CFStack::StackItem Item = CFStack::ENTRY;
113 case AMDGPU::CF_PUSH_EG:
114 case AMDGPU::CF_ALU_PUSH_BEFORE:
116 if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
117 Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI
119 // CFStack::getSubEntrySize()
120 else if (CurrentEntries > 0 &&
121 ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
122 !ST.hasCaymanISA() &&
123 !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
124 Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
126 Item = CFStack::SUB_ENTRY;
128 Item = CFStack::ENTRY;
131 BranchStack.push_back(Item);
132 if (Item == CFStack::ENTRY)
135 CurrentSubEntries += getSubEntrySize(Item);
136 updateMaxStackSize();
139 void CFStack::pushLoop() {
140 LoopStack.push_back(CFStack::ENTRY);
142 updateMaxStackSize();
145 void CFStack::popBranch() {
146 CFStack::StackItem Top = BranchStack.back();
147 if (Top == CFStack::ENTRY)
150 CurrentSubEntries-= getSubEntrySize(Top);
151 BranchStack.pop_back();
154 void CFStack::popLoop() {
156 LoopStack.pop_back();
159 class R600ControlFlowFinalizer : public MachineFunctionPass {
162 typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
164 enum ControlFlowInstruction {
179 const R600InstrInfo *TII;
180 const R600RegisterInfo *TRI;
181 unsigned MaxFetchInst;
182 const AMDGPUSubtarget &ST;
184 bool IsTrivialInst(MachineInstr *MI) const {
185 switch (MI->getOpcode()) {
194 const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
196 bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
199 Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
202 Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
205 Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
208 Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
211 Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
214 Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
216 case CF_LOOP_CONTINUE:
217 Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
220 Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
223 Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
226 Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
229 if (ST.hasCaymanISA()) {
230 Opcode = AMDGPU::CF_END_CM;
233 Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
236 assert (Opcode && "No opcode selected");
237 return TII->get(Opcode);
240 bool isCompatibleWithClause(const MachineInstr *MI,
241 std::set<unsigned> &DstRegs) const {
242 unsigned DstMI, SrcMI;
243 for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
244 E = MI->operands_end(); I != E; ++I) {
245 const MachineOperand &MO = *I;
249 unsigned Reg = MO.getReg();
250 if (AMDGPU::R600_Reg128RegClass.contains(Reg))
253 DstMI = TRI->getMatchingSuperReg(Reg,
254 TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
255 &AMDGPU::R600_Reg128RegClass);
258 unsigned Reg = MO.getReg();
259 if (AMDGPU::R600_Reg128RegClass.contains(Reg))
262 SrcMI = TRI->getMatchingSuperReg(Reg,
263 TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
264 &AMDGPU::R600_Reg128RegClass);
267 if ((DstRegs.find(SrcMI) == DstRegs.end())) {
268 DstRegs.insert(DstMI);
275 MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
277 MachineBasicBlock::iterator ClauseHead = I;
278 std::vector<MachineInstr *> ClauseContent;
279 unsigned AluInstCount = 0;
280 bool IsTex = TII->usesTextureCache(ClauseHead);
281 std::set<unsigned> DstRegs;
282 for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
283 if (IsTrivialInst(I))
285 if (AluInstCount >= MaxFetchInst)
287 if ((IsTex && !TII->usesTextureCache(I)) ||
288 (!IsTex && !TII->usesVertexCache(I)))
290 if (!isCompatibleWithClause(I, DstRegs))
293 ClauseContent.push_back(I);
295 MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
296 getHWInstrDesc(IsTex?CF_TC:CF_VC))
298 .addImm(AluInstCount - 1); // COUNT
299 return ClauseFile(MIb, ClauseContent);
302 void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
303 static const unsigned LiteralRegs[] = {
304 AMDGPU::ALU_LITERAL_X,
305 AMDGPU::ALU_LITERAL_Y,
306 AMDGPU::ALU_LITERAL_Z,
307 AMDGPU::ALU_LITERAL_W
309 const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
311 for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
312 if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
314 int64_t Imm = Srcs[i].second;
315 std::vector<int64_t>::iterator It =
316 std::find(Lits.begin(), Lits.end(), Imm);
317 if (It != Lits.end()) {
318 unsigned Index = It - Lits.begin();
319 Srcs[i].first->setReg(LiteralRegs[Index]);
321 assert(Lits.size() < 4 && "Too many literals in Instruction Group");
322 Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
328 MachineBasicBlock::iterator insertLiterals(
329 MachineBasicBlock::iterator InsertPos,
330 const std::vector<unsigned> &Literals) const {
331 MachineBasicBlock *MBB = InsertPos->getParent();
332 for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
333 unsigned LiteralPair0 = Literals[i];
334 unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
335 InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
336 TII->get(AMDGPU::LITERALS))
337 .addImm(LiteralPair0)
338 .addImm(LiteralPair1);
344 MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
346 MachineBasicBlock::iterator ClauseHead = I;
347 std::vector<MachineInstr *> ClauseContent;
349 for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
350 if (IsTrivialInst(I)) {
354 if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
356 std::vector<int64_t> Literals;
358 MachineInstr *DeleteMI = I;
359 MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
360 while (++BI != E && BI->isBundledWithPred()) {
361 BI->unbundleFromPred();
362 for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
363 MachineOperand &MO = BI->getOperand(i);
364 if (MO.isReg() && MO.isInternalRead())
365 MO.setIsInternalRead(false);
367 getLiteral(BI, Literals);
368 ClauseContent.push_back(BI);
371 DeleteMI->eraseFromParent();
373 getLiteral(I, Literals);
374 ClauseContent.push_back(I);
377 for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
378 unsigned literal0 = Literals[i];
379 unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
380 MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
381 TII->get(AMDGPU::LITERALS))
384 ClauseContent.push_back(MILit);
387 assert(ClauseContent.size() < 128 && "ALU clause is too big");
388 ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
389 return ClauseFile(ClauseHead, ClauseContent);
393 EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
395 CounterPropagateAddr(Clause.first, CfCount);
396 MachineBasicBlock *BB = Clause.first->getParent();
397 BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
399 for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
400 BB->splice(InsertPos, BB, Clause.second[i]);
402 CfCount += 2 * Clause.second.size();
406 EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
408 Clause.first->getOperand(0).setImm(0);
409 CounterPropagateAddr(Clause.first, CfCount);
410 MachineBasicBlock *BB = Clause.first->getParent();
411 BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
413 for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
414 BB->splice(InsertPos, BB, Clause.second[i]);
416 CfCount += Clause.second.size();
419 void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
420 MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
422 void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
424 for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
426 MachineInstr *MI = *It;
427 CounterPropagateAddr(MI, Addr);
432 R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
434 ST(tm.getSubtarget<AMDGPUSubtarget>()) {
435 const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
436 MaxFetchInst = ST.getTexVTXClauseSize();
439 virtual bool runOnMachineFunction(MachineFunction &MF) {
440 TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
441 TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
442 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
444 CFStack CFStack(ST, MFI->ShaderType);
445 for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
447 MachineBasicBlock &MBB = *MB;
448 unsigned CfCount = 0;
449 std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
450 std::vector<MachineInstr * > IfThenElseStack;
451 if (MFI->ShaderType == 1) {
452 BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
453 getHWInstrDesc(CF_CALL_FS));
456 std::vector<ClauseFile> FetchClauses, AluClauses;
457 std::vector<MachineInstr *> LastAlu(1);
458 std::vector<MachineInstr *> ToPopAfter;
460 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
462 if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
463 DEBUG(dbgs() << CfCount << ":"; I->dump(););
464 FetchClauses.push_back(MakeFetchClause(MBB, I));
469 MachineBasicBlock::iterator MI = I;
470 if (MI->getOpcode() != AMDGPU::ENDIF)
472 if (MI->getOpcode() == AMDGPU::CF_ALU)
475 switch (MI->getOpcode()) {
476 case AMDGPU::CF_ALU_PUSH_BEFORE:
477 if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
478 BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
481 MI->setDesc(TII->get(AMDGPU::CF_ALU));
483 CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
485 CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
489 AluClauses.push_back(MakeALUClause(MBB, I));
490 DEBUG(dbgs() << CfCount << ":"; MI->dump(););
493 case AMDGPU::WHILELOOP: {
495 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
496 getHWInstrDesc(CF_WHILE_LOOP))
498 std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
499 std::set<MachineInstr *>());
500 Pair.second.insert(MIb);
501 LoopStack.push_back(Pair);
502 MI->eraseFromParent();
506 case AMDGPU::ENDLOOP: {
508 std::pair<unsigned, std::set<MachineInstr *> > Pair =
510 LoopStack.pop_back();
511 CounterPropagateAddr(Pair.second, CfCount);
512 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
513 .addImm(Pair.first + 1);
514 MI->eraseFromParent();
518 case AMDGPU::IF_PREDICATE_SET: {
519 LastAlu.push_back(0);
520 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
521 getHWInstrDesc(CF_JUMP))
524 IfThenElseStack.push_back(MIb);
525 DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
526 MI->eraseFromParent();
531 MachineInstr * JumpInst = IfThenElseStack.back();
532 IfThenElseStack.pop_back();
533 CounterPropagateAddr(JumpInst, CfCount);
534 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
535 getHWInstrDesc(CF_ELSE))
538 DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
539 IfThenElseStack.push_back(MIb);
540 MI->eraseFromParent();
544 case AMDGPU::ENDIF: {
546 if (LastAlu.back()) {
547 ToPopAfter.push_back(LastAlu.back());
549 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
550 getHWInstrDesc(CF_POP))
554 DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
558 MachineInstr *IfOrElseInst = IfThenElseStack.back();
559 IfThenElseStack.pop_back();
560 CounterPropagateAddr(IfOrElseInst, CfCount);
561 IfOrElseInst->getOperand(1).setImm(1);
563 MI->eraseFromParent();
566 case AMDGPU::BREAK: {
568 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
569 getHWInstrDesc(CF_LOOP_BREAK))
571 LoopStack.back().second.insert(MIb);
572 MI->eraseFromParent();
575 case AMDGPU::CONTINUE: {
576 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
577 getHWInstrDesc(CF_LOOP_CONTINUE))
579 LoopStack.back().second.insert(MIb);
580 MI->eraseFromParent();
584 case AMDGPU::RETURN: {
585 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
587 MI->eraseFromParent();
589 BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
592 for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
593 EmitFetchClause(I, FetchClauses[i], CfCount);
594 for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
595 EmitALUClause(I, AluClauses[i], CfCount);
598 if (TII->isExport(MI->getOpcode())) {
599 DEBUG(dbgs() << CfCount << ":"; MI->dump(););
605 for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
606 MachineInstr *Alu = ToPopAfter[i];
607 BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
608 TII->get(AMDGPU::CF_ALU_POP_AFTER))
609 .addImm(Alu->getOperand(0).getImm())
610 .addImm(Alu->getOperand(1).getImm())
611 .addImm(Alu->getOperand(2).getImm())
612 .addImm(Alu->getOperand(3).getImm())
613 .addImm(Alu->getOperand(4).getImm())
614 .addImm(Alu->getOperand(5).getImm())
615 .addImm(Alu->getOperand(6).getImm())
616 .addImm(Alu->getOperand(7).getImm())
617 .addImm(Alu->getOperand(8).getImm());
618 Alu->eraseFromParent();
620 MFI->StackSize = CFStack.MaxStackSize;
626 const char *getPassName() const {
627 return "R600 Control Flow Finalizer Pass";
631 char R600ControlFlowFinalizer::ID = 0;
633 } // end anonymous namespace
636 llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
637 return new R600ControlFlowFinalizer(TM);