R600: Refactor stack size calculation
lib/Target/R600/R600ControlFlowFinalizer.cpp
//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their addresses on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "r600cf"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <set>
#include <vector>

using namespace llvm;

namespace {

struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget &ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch(Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST.hasCaymanISA());
    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

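// Entries and sub-entries are tracked separately: up to four sub-entries are
// packed into one full stack entry, so, for example, CurrentEntries = 2 with
// CurrentSubEntries = 5 needs 2 + ceil(5/4) = 4 entries in total.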
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

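// Account for a branch push. A WQM (whole quad mode) push always costs a
// full entry; non-WQM pushes cost sub-entries, with the first one padded as
// described in getSubEntrySize().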
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST.hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI,
                                             // see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST.hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

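  // Select the native opcode for a control-flow pseudo instruction based on
  // the subtarget generation (R600 vs. Evergreen encodings; Cayman only
  // differs for CF_END).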
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

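  // Check whether MI can join the clause being formed: reject it if it reads
  // a 128-bit register that a previous clause instruction has written, and
  // record its own destination super-register in DstRegs otherwise.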
  bool isCompatibleWithClause(const MachineInstr *MI,
      std::set<unsigned> &DstRegs) const {
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
        E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

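  // Greedily grow a fetch clause: take consecutive non-trivial instructions
  // that hit the same cache (texture or vertex), up to MaxFetchInst of them,
  // and prepend a CF_TC/CF_VC clause head whose COUNT field is size - 1.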
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }

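  // Redirect each ALU_LITERAL_X source of MI to the literal slot (X/Y/Z/W)
  // that holds its immediate, appending new values to Lits; an instruction
  // group can hold at most four literals.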
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

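  // Gather the instructions following an ALU clause head, unbundling
  // instruction groups and materializing their immediates as LITERALS pseudo
  // instructions, then write the clause size into the head's operand 7 (its
  // instruction count).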
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(literal0)
            .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }

  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
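    // CfCount is measured in 64-bit CF words; each fetch instruction is 128
    // bits wide and therefore advances the counter by two.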
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
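    // Operand 0 of the clause head is its ADDR field (see the "// ADDR"
    // operand in MakeFetchClause); clear any placeholder so that
    // CounterPropagateAddr() below stores exactly the final address.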
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

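  // Branch and loop CF instructions are emitted with placeholder addresses;
  // once the target address is known, add it to the stored immediate.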
  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }
  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, unsigned Addr)
      const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
        E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
      TII(0), TRI(0), ST(tm.getSubtarget<AMDGPUSubtarget>()) {
    MaxFetchInst = ST.getTexVTXClauseSize();
  }

  virtual bool runOnMachineFunction(MachineFunction &MF) {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(
        MF.getTarget().getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->ShaderType);
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->ShaderType == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
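      // LastAlu tracks the trailing CF_ALU at each if-nesting depth; ENDIF
      // uses it to fold its POP into that clause (CF_ALU_POP_AFTER below)
      // instead of emitting a separate POP instruction.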
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = 0;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // FALLTHROUGH: the push variant is also emitted as an ALU clause.
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(0);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            // Keep the address of the following clauses even: fetch clause
            // instructions are 128 bits wide, twice the size of a CF word.
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
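      // Rewrite the CF_ALU instructions recorded by ENDIF as CF_ALU_POP_AFTER,
      // copying over all nine immediate operands, so the pop happens as part
      // of the ALU clause itself.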
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}