Setting the default value (fixes CRT assertions about uninitialized variable use in isCompatibleWithClause).
[oota-llvm.git] / lib / Target / R600 / R600ControlFlowFinalizer.cpp
1 //===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// This pass compute turns all control flow pseudo instructions into native one
12 /// computing their address on the fly ; it also sets STACK_SIZE info.
13 //===----------------------------------------------------------------------===//
14
15 #define DEBUG_TYPE "r600cf"
16 #include "llvm/Support/Debug.h"
17 #include "llvm/Support/raw_ostream.h"
18
19 #include "AMDGPU.h"
20 #include "R600Defines.h"
21 #include "R600InstrInfo.h"
22 #include "R600MachineFunctionInfo.h"
23 #include "R600RegisterInfo.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27
28 namespace llvm {
29
30 class R600ControlFlowFinalizer : public MachineFunctionPass {
31
32 private:
33   typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
34
35   enum ControlFlowInstruction {
36     CF_TC,
37     CF_VC,
38     CF_CALL_FS,
39     CF_WHILE_LOOP,
40     CF_END_LOOP,
41     CF_LOOP_BREAK,
42     CF_LOOP_CONTINUE,
43     CF_JUMP,
44     CF_ELSE,
45     CF_POP,
46     CF_END
47   };
48
49   static char ID;
50   const R600InstrInfo *TII;
51   const R600RegisterInfo &TRI;
52   unsigned MaxFetchInst;
53   const AMDGPUSubtarget &ST;
54
55   bool IsTrivialInst(MachineInstr *MI) const {
56     switch (MI->getOpcode()) {
57     case AMDGPU::KILL:
58     case AMDGPU::RETURN:
59       return true;
60     default:
61       return false;
62     }
63   }
64
65   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
66     unsigned Opcode = 0;
67     bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
68     switch (CFI) {
69     case CF_TC:
70       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
71       break;
72     case CF_VC:
73       Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
74       break;
75     case CF_CALL_FS:
76       Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
77       break;
78     case CF_WHILE_LOOP:
79       Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
80       break;
81     case CF_END_LOOP:
82       Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
83       break;
84     case CF_LOOP_BREAK:
85       Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
86       break;
87     case CF_LOOP_CONTINUE:
88       Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
89       break;
90     case CF_JUMP:
91       Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
92       break;
93     case CF_ELSE:
94       Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
95       break;
96     case CF_POP:
97       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
98       break;
99     case CF_END:
100       if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
101         Opcode = AMDGPU::CF_END_CM;
102         break;
103       }
104       Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
105       break;
106     }
107     assert (Opcode && "No opcode selected");
108     return TII->get(Opcode);
109   }
110
111   bool isCompatibleWithClause(const MachineInstr *MI,
112   std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
113     unsigned DstMI, SrcMI;
114     for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
115         E = MI->operands_end(); I != E; ++I) {
116       const MachineOperand &MO = *I;
117       if (!MO.isReg())
118         continue;
119       if (MO.isDef())
120         DstMI = MO.getReg();
121       if (MO.isUse()) {
122         unsigned Reg = MO.getReg();
123         if (AMDGPU::R600_Reg128RegClass.contains(Reg))
124           SrcMI = Reg;
125         else
126           SrcMI = TRI.getMatchingSuperReg(Reg,
127               TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
128               &AMDGPU::R600_Reg128RegClass);
129       }
130     }
131     if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
132         (SrcRegs.find(DstMI) == SrcRegs.end())) {
133       SrcRegs.insert(SrcMI);
134       DstRegs.insert(DstMI);
135       return true;
136     } else
137       return false;
138   }
139
140   ClauseFile
141   MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
142       const {
143     MachineBasicBlock::iterator ClauseHead = I;
144     std::vector<MachineInstr *> ClauseContent;
145     unsigned AluInstCount = 0;
146     bool IsTex = TII->usesTextureCache(ClauseHead);
147     std::set<unsigned> DstRegs, SrcRegs;
148     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
149       if (IsTrivialInst(I))
150         continue;
151       if (AluInstCount >= MaxFetchInst)
152         break;
153       if ((IsTex && !TII->usesTextureCache(I)) ||
154           (!IsTex && !TII->usesVertexCache(I)))
155         break;
156       if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
157         break;
158       AluInstCount ++;
159       ClauseContent.push_back(I);
160     }
161     MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
162         getHWInstrDesc(IsTex?CF_TC:CF_VC))
163         .addImm(0) // ADDR
164         .addImm(AluInstCount - 1); // COUNT
165     return ClauseFile(MIb, ClauseContent);
166   }
167
168   void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
169     unsigned LiteralRegs[] = {
170       AMDGPU::ALU_LITERAL_X,
171       AMDGPU::ALU_LITERAL_Y,
172       AMDGPU::ALU_LITERAL_Z,
173       AMDGPU::ALU_LITERAL_W
174     };
175     const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
176         TII->getSrcs(MI);
177     for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
178       if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
179         continue;
180       int64_t Imm = Srcs[i].second;
181       std::vector<int64_t>::iterator It =
182           std::find(Lits.begin(), Lits.end(), Imm);
183       if (It != Lits.end()) {
184         unsigned Index = It - Lits.begin();
185         Srcs[i].first->setReg(LiteralRegs[Index]);
186       } else {
187         assert(Lits.size() < 4 && "Too many literals in Instruction Group");
188         Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
189         Lits.push_back(Imm);
190       }
191     }
192   }
193
194   MachineBasicBlock::iterator insertLiterals(
195       MachineBasicBlock::iterator InsertPos,
196       const std::vector<unsigned> &Literals) const {
197     MachineBasicBlock *MBB = InsertPos->getParent();
198     for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
199       unsigned LiteralPair0 = Literals[i];
200       unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
201       InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
202           TII->get(AMDGPU::LITERALS))
203           .addImm(LiteralPair0)
204           .addImm(LiteralPair1);
205     }
206     return InsertPos;
207   }
208
209   ClauseFile
210   MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
211       const {
212     MachineBasicBlock::iterator ClauseHead = I;
213     std::vector<MachineInstr *> ClauseContent;
214     I++;
215     for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
216       if (IsTrivialInst(I)) {
217         ++I;
218         continue;
219       }
220       if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
221         break;
222       std::vector<int64_t> Literals;
223       if (I->isBundle()) {
224         MachineInstr *DeleteMI = I;
225         MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
226         while (++BI != E && BI->isBundledWithPred()) {
227           BI->unbundleFromPred();
228           for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
229             MachineOperand &MO = BI->getOperand(i);
230             if (MO.isReg() && MO.isInternalRead())
231               MO.setIsInternalRead(false);
232           }
233           getLiteral(BI, Literals);
234           ClauseContent.push_back(BI);
235         }
236         I = BI;
237         DeleteMI->eraseFromParent();
238       } else {
239         getLiteral(I, Literals);
240         ClauseContent.push_back(I);
241         I++;
242       }
243       for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
244         unsigned literal0 = Literals[i];
245         unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
246         MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
247             TII->get(AMDGPU::LITERALS))
248             .addImm(literal0)
249             .addImm(literal2);
250         ClauseContent.push_back(MILit);
251       }
252     }
253     ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
254     return ClauseFile(ClauseHead, ClauseContent);
255   }
256
257   void
258   EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
259       unsigned &CfCount) {
260     CounterPropagateAddr(Clause.first, CfCount);
261     MachineBasicBlock *BB = Clause.first->getParent();
262     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
263         .addImm(CfCount);
264     for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
265       BB->splice(InsertPos, BB, Clause.second[i]);
266     }
267     CfCount += 2 * Clause.second.size();
268   }
269
270   void
271   EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
272       unsigned &CfCount) {
273     CounterPropagateAddr(Clause.first, CfCount);
274     MachineBasicBlock *BB = Clause.first->getParent();
275     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
276         .addImm(CfCount);
277     for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
278       BB->splice(InsertPos, BB, Clause.second[i]);
279     }
280     CfCount += Clause.second.size();
281   }
282
283   void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
284     MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
285   }
286   void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
287       const {
288     for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
289         It != E; ++It) {
290       MachineInstr *MI = *It;
291       CounterPropagateAddr(MI, Addr);
292     }
293   }
294
295   unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
296     switch (ST.device()->getGeneration()) {
297     case AMDGPUDeviceInfo::HD4XXX:
298       if (hasPush)
299         StackSubEntry += 2;
300       break;
301     case AMDGPUDeviceInfo::HD5XXX:
302       if (hasPush)
303         StackSubEntry ++;
304     case AMDGPUDeviceInfo::HD6XXX:
305       StackSubEntry += 2;
306       break;
307     }
308     return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
309   }
310
311 public:
312   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
313     TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
314     TRI(TII->getRegisterInfo()),
315     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
316       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
317       MaxFetchInst = ST.getTexVTXClauseSize();
318   }
319
320   virtual bool runOnMachineFunction(MachineFunction &MF) {
321     unsigned MaxStack = 0;
322     unsigned CurrentStack = 0;
323     bool HasPush = false;
324     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
325         ++MB) {
326       MachineBasicBlock &MBB = *MB;
327       unsigned CfCount = 0;
328       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
329       std::vector<MachineInstr * > IfThenElseStack;
330       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
331       if (MFI->ShaderType == 1) {
332         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
333             getHWInstrDesc(CF_CALL_FS));
334         CfCount++;
335       }
336       std::vector<ClauseFile> FetchClauses, AluClauses;
337       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
338           I != E;) {
339         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
340           DEBUG(dbgs() << CfCount << ":"; I->dump(););
341           FetchClauses.push_back(MakeFetchClause(MBB, I));
342           CfCount++;
343           continue;
344         }
345
346         MachineBasicBlock::iterator MI = I;
347         I++;
348         switch (MI->getOpcode()) {
349         case AMDGPU::CF_ALU_PUSH_BEFORE:
350           CurrentStack++;
351           MaxStack = std::max(MaxStack, CurrentStack);
352           HasPush = true;
353         case AMDGPU::CF_ALU:
354           I = MI;
355           AluClauses.push_back(MakeALUClause(MBB, I));
356         case AMDGPU::EG_ExportBuf:
357         case AMDGPU::EG_ExportSwz:
358         case AMDGPU::R600_ExportBuf:
359         case AMDGPU::R600_ExportSwz:
360         case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
361         case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
362           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
363           CfCount++;
364           break;
365         case AMDGPU::WHILELOOP: {
366           CurrentStack+=4;
367           MaxStack = std::max(MaxStack, CurrentStack);
368           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
369               getHWInstrDesc(CF_WHILE_LOOP))
370               .addImm(1);
371           std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
372               std::set<MachineInstr *>());
373           Pair.second.insert(MIb);
374           LoopStack.push_back(Pair);
375           MI->eraseFromParent();
376           CfCount++;
377           break;
378         }
379         case AMDGPU::ENDLOOP: {
380           CurrentStack-=4;
381           std::pair<unsigned, std::set<MachineInstr *> > Pair =
382               LoopStack.back();
383           LoopStack.pop_back();
384           CounterPropagateAddr(Pair.second, CfCount);
385           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
386               .addImm(Pair.first + 1);
387           MI->eraseFromParent();
388           CfCount++;
389           break;
390         }
391         case AMDGPU::IF_PREDICATE_SET: {
392           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
393               getHWInstrDesc(CF_JUMP))
394               .addImm(0)
395               .addImm(0);
396           IfThenElseStack.push_back(MIb);
397           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
398           MI->eraseFromParent();
399           CfCount++;
400           break;
401         }
402         case AMDGPU::ELSE: {
403           MachineInstr * JumpInst = IfThenElseStack.back();
404           IfThenElseStack.pop_back();
405           CounterPropagateAddr(JumpInst, CfCount);
406           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
407               getHWInstrDesc(CF_ELSE))
408               .addImm(0)
409               .addImm(1);
410           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
411           IfThenElseStack.push_back(MIb);
412           MI->eraseFromParent();
413           CfCount++;
414           break;
415         }
416         case AMDGPU::ENDIF: {
417           CurrentStack--;
418           MachineInstr *IfOrElseInst = IfThenElseStack.back();
419           IfThenElseStack.pop_back();
420           CounterPropagateAddr(IfOrElseInst, CfCount + 1);
421           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
422               getHWInstrDesc(CF_POP))
423               .addImm(CfCount + 1)
424               .addImm(1);
425           (void)MIb;
426           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
427           MI->eraseFromParent();
428           CfCount++;
429           break;
430         }
431         case AMDGPU::PREDICATED_BREAK: {
432           CurrentStack--;
433           CfCount += 3;
434           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
435               .addImm(CfCount)
436               .addImm(1);
437           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
438               getHWInstrDesc(CF_LOOP_BREAK))
439               .addImm(0);
440           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
441               .addImm(CfCount)
442               .addImm(1);
443           LoopStack.back().second.insert(MIb);
444           MI->eraseFromParent();
445           break;
446         }
447         case AMDGPU::CONTINUE: {
448           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
449               getHWInstrDesc(CF_LOOP_CONTINUE))
450               .addImm(0);
451           LoopStack.back().second.insert(MIb);
452           MI->eraseFromParent();
453           CfCount++;
454           break;
455         }
456         case AMDGPU::RETURN: {
457           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
458           CfCount++;
459           MI->eraseFromParent();
460           if (CfCount % 2) {
461             BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
462             CfCount++;
463           }
464           for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
465             EmitFetchClause(I, FetchClauses[i], CfCount);
466           for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
467             EmitALUClause(I, AluClauses[i], CfCount);
468         }
469         default:
470           break;
471         }
472       }
473       MFI->StackSize = getHWStackSize(MaxStack, HasPush);
474     }
475
476     return false;
477   }
478
479   const char *getPassName() const {
480     return "R600 Control Flow Finalizer Pass";
481   }
482 };
483
484 char R600ControlFlowFinalizer::ID = 0;
485
486 }
487
488
489 llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
490   return new R600ControlFlowFinalizer(TM);
491 }