R600: Take inner dependency into tex/vtx clauses
[oota-llvm.git] / lib / Target / R600 / R600ControlFlowFinalizer.cpp
1 //===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// This pass turns all control flow pseudo instructions into native ones,
12 /// computing their addresses on the fly; it also sets STACK_SIZE info.
13 //===----------------------------------------------------------------------===//
14
15 #define DEBUG_TYPE "r600cf"
16 #include "llvm/Support/Debug.h"
17 #include "llvm/Support/raw_ostream.h"
18
19 #include "AMDGPU.h"
20 #include "R600Defines.h"
21 #include "R600InstrInfo.h"
22 #include "R600MachineFunctionInfo.h"
23 #include "R600RegisterInfo.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27
28 namespace llvm {
29
30 class R600ControlFlowFinalizer : public MachineFunctionPass {
31
32 private:
33   typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
34
35   enum ControlFlowInstruction {
36     CF_TC,
37     CF_VC,
38     CF_CALL_FS,
39     CF_WHILE_LOOP,
40     CF_END_LOOP,
41     CF_LOOP_BREAK,
42     CF_LOOP_CONTINUE,
43     CF_JUMP,
44     CF_ELSE,
45     CF_POP,
46     CF_END
47   };
48
49   static char ID;
50   const R600InstrInfo *TII;
51   const R600RegisterInfo &TRI;
52   unsigned MaxFetchInst;
53   const AMDGPUSubtarget &ST;
54
55   bool IsTrivialInst(MachineInstr *MI) const {
56     switch (MI->getOpcode()) {
57     case AMDGPU::KILL:
58     case AMDGPU::RETURN:
59       return true;
60     default:
61       return false;
62     }
63   }
64
65   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
66     unsigned Opcode = 0;
67     bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
68     switch (CFI) {
69     case CF_TC:
70       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
71       break;
72     case CF_VC:
73       Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
74       break;
75     case CF_CALL_FS:
76       Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
77       break;
78     case CF_WHILE_LOOP:
79       Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
80       break;
81     case CF_END_LOOP:
82       Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
83       break;
84     case CF_LOOP_BREAK:
85       Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
86       break;
87     case CF_LOOP_CONTINUE:
88       Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
89       break;
90     case CF_JUMP:
91       Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
92       break;
93     case CF_ELSE:
94       Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
95       break;
96     case CF_POP:
97       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
98       break;
99     case CF_END:
100       if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
101         Opcode = AMDGPU::CF_END_CM;
102         break;
103       }
104       Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
105       break;
106     }
107     assert (Opcode && "No opcode selected");
108     return TII->get(Opcode);
109   }
110
111   bool isCompatibleWithClause(const MachineInstr *MI,
112   std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
113     unsigned DstMI, SrcMI;
114     for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
115         E = MI->operands_end(); I != E; ++I) {
116       const MachineOperand &MO = *I;
117       if (!MO.isReg())
118         continue;
119       if (MO.isDef())
120         DstMI = MO.getReg();
121       if (MO.isUse()) {
122         unsigned Reg = MO.getReg();
123         if (AMDGPU::R600_Reg128RegClass.contains(Reg))
124           SrcMI = Reg;
125         else
126           SrcMI = TRI.getMatchingSuperReg(Reg,
127               TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
128               &AMDGPU::R600_Reg128RegClass);
129       }
130     }
131     if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
132         (SrcRegs.find(DstMI) == SrcRegs.end())) {
133       SrcRegs.insert(SrcMI);
134       DstRegs.insert(DstMI);
135       return true;
136     } else
137       return false;
138   }
139
140   ClauseFile
141   MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
142       const {
143     MachineBasicBlock::iterator ClauseHead = I;
144     std::vector<MachineInstr *> ClauseContent;
145     unsigned AluInstCount = 0;
146     bool IsTex = TII->usesTextureCache(ClauseHead);
147     std::set<unsigned> DstRegs, SrcRegs;
148     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
149       if (IsTrivialInst(I))
150         continue;
151       if (AluInstCount > MaxFetchInst)
152         break;
153       if ((IsTex && !TII->usesTextureCache(I)) ||
154           (!IsTex && !TII->usesVertexCache(I)))
155         break;
156       if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
157         break;
158       AluInstCount ++;
159       ClauseContent.push_back(I);
160     }
161     MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
162         getHWInstrDesc(IsTex?CF_TC:CF_VC))
163         .addImm(0) // ADDR
164         .addImm(AluInstCount - 1); // COUNT
165     return ClauseFile(MIb, ClauseContent);
166   }
167
168   void
169   EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
170       unsigned &CfCount) {
171     CounterPropagateAddr(Clause.first, CfCount);
172     MachineBasicBlock *BB = Clause.first->getParent();
173     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
174         .addImm(CfCount);
175     for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
176       BB->splice(InsertPos, BB, Clause.second[i]);
177     }
178     CfCount += 2 * Clause.second.size();
179   }
180
181   void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
182     MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
183   }
184   void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
185       const {
186     for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
187         It != E; ++It) {
188       MachineInstr *MI = *It;
189       CounterPropagateAddr(MI, Addr);
190     }
191   }
192
193   unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
194     switch (ST.device()->getGeneration()) {
195     case AMDGPUDeviceInfo::HD4XXX:
196       if (hasPush)
197         StackSubEntry += 2;
198       break;
199     case AMDGPUDeviceInfo::HD5XXX:
200       if (hasPush)
201         StackSubEntry ++;
202     case AMDGPUDeviceInfo::HD6XXX:
203       StackSubEntry += 2;
204       break;
205     }
206     return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
207   }
208
209 public:
210   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
211     TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
212     TRI(TII->getRegisterInfo()),
213     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
214       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
215       if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
216         MaxFetchInst = 8;
217       else
218         MaxFetchInst = 16;
219   }
220
221   virtual bool runOnMachineFunction(MachineFunction &MF) {
222     unsigned MaxStack = 0;
223     unsigned CurrentStack = 0;
224     bool hasPush;
225     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
226         ++MB) {
227       MachineBasicBlock &MBB = *MB;
228       unsigned CfCount = 0;
229       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
230       std::vector<MachineInstr * > IfThenElseStack;
231       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
232       if (MFI->ShaderType == 1) {
233         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
234             getHWInstrDesc(CF_CALL_FS));
235         CfCount++;
236       }
237       std::vector<ClauseFile> FetchClauses;
238       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
239           I != E;) {
240         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
241           DEBUG(dbgs() << CfCount << ":"; I->dump(););
242           FetchClauses.push_back(MakeFetchClause(MBB, I));
243           CfCount++;
244           continue;
245         }
246
247         MachineBasicBlock::iterator MI = I;
248         I++;
249         switch (MI->getOpcode()) {
250         case AMDGPU::CF_ALU_PUSH_BEFORE:
251           CurrentStack++;
252           MaxStack = std::max(MaxStack, CurrentStack);
253           hasPush = true;
254         case AMDGPU::CF_ALU:
255         case AMDGPU::EG_ExportBuf:
256         case AMDGPU::EG_ExportSwz:
257         case AMDGPU::R600_ExportBuf:
258         case AMDGPU::R600_ExportSwz:
259         case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
260         case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
261           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
262           CfCount++;
263           break;
264         case AMDGPU::WHILELOOP: {
265           CurrentStack+=4;
266           MaxStack = std::max(MaxStack, CurrentStack);
267           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
268               getHWInstrDesc(CF_WHILE_LOOP))
269               .addImm(1);
270           std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
271               std::set<MachineInstr *>());
272           Pair.second.insert(MIb);
273           LoopStack.push_back(Pair);
274           MI->eraseFromParent();
275           CfCount++;
276           break;
277         }
278         case AMDGPU::ENDLOOP: {
279           CurrentStack-=4;
280           std::pair<unsigned, std::set<MachineInstr *> > Pair =
281               LoopStack.back();
282           LoopStack.pop_back();
283           CounterPropagateAddr(Pair.second, CfCount);
284           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
285               .addImm(Pair.first + 1);
286           MI->eraseFromParent();
287           CfCount++;
288           break;
289         }
290         case AMDGPU::IF_PREDICATE_SET: {
291           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
292               getHWInstrDesc(CF_JUMP))
293               .addImm(0)
294               .addImm(0);
295           IfThenElseStack.push_back(MIb);
296           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
297           MI->eraseFromParent();
298           CfCount++;
299           break;
300         }
301         case AMDGPU::ELSE: {
302           MachineInstr * JumpInst = IfThenElseStack.back();
303           IfThenElseStack.pop_back();
304           CounterPropagateAddr(JumpInst, CfCount);
305           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
306               getHWInstrDesc(CF_ELSE))
307               .addImm(0)
308               .addImm(1);
309           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
310           IfThenElseStack.push_back(MIb);
311           MI->eraseFromParent();
312           CfCount++;
313           break;
314         }
315         case AMDGPU::ENDIF: {
316           CurrentStack--;
317           MachineInstr *IfOrElseInst = IfThenElseStack.back();
318           IfThenElseStack.pop_back();
319           CounterPropagateAddr(IfOrElseInst, CfCount + 1);
320           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
321               getHWInstrDesc(CF_POP))
322               .addImm(CfCount + 1)
323               .addImm(1);
324           (void)MIb;
325           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
326           MI->eraseFromParent();
327           CfCount++;
328           break;
329         }
330         case AMDGPU::PREDICATED_BREAK: {
331           CurrentStack--;
332           CfCount += 3;
333           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
334               .addImm(CfCount)
335               .addImm(1);
336           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
337               getHWInstrDesc(CF_LOOP_BREAK))
338               .addImm(0);
339           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
340               .addImm(CfCount)
341               .addImm(1);
342           LoopStack.back().second.insert(MIb);
343           MI->eraseFromParent();
344           break;
345         }
346         case AMDGPU::CONTINUE: {
347           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
348               getHWInstrDesc(CF_LOOP_CONTINUE))
349               .addImm(0);
350           LoopStack.back().second.insert(MIb);
351           MI->eraseFromParent();
352           CfCount++;
353           break;
354         }
355         case AMDGPU::RETURN: {
356           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
357           CfCount++;
358           MI->eraseFromParent();
359           if (CfCount % 2) {
360             BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
361             CfCount++;
362           }
363           for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
364             EmitFetchClause(I, FetchClauses[i], CfCount);
365         }
366         default:
367           break;
368         }
369       }
370       MFI->StackSize = getHWStackSize(MaxStack, hasPush);
371     }
372
373     return false;
374   }
375
376   const char *getPassName() const {
377     return "R600 Control Flow Finalizer Pass";
378   }
379 };
380
381 char R600ControlFlowFinalizer::ID = 0;
382
383 }
384
385
386 llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
387   return new R600ControlFlowFinalizer(TM);
388 }