X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FSILowerControlFlow.cpp;h=aaf91ea828168d1c7a433993ab27c87c98e5fdb6;hb=8d6f6a27c706852f54373b90752a66bacdb2172d;hp=1b0dbcc45fb3deaf0c0f6be95695da1d05b3dadd;hpb=9262a64b307621a046ef5728d90bef4921b46108;p=oota-llvm.git diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 1b0dbcc45fb..aaf91ea8281 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -49,12 +49,15 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" using namespace llvm; @@ -66,7 +69,8 @@ private: static const unsigned SkipThreshold = 12; static char ID; - const TargetInstrInfo *TII; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); @@ -84,13 +88,17 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + void LoadM0(MachineInstr &MI, MachineInstr *MovRel); + void IndirectSrc(MachineInstr &MI); + void IndirectDst(MachineInstr &MI); + public: SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Lower control flow instructions"; } @@ -140,7 +148,9 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - if (!shouldSkip(&MBB, &MBB.getParent()->back())) + if (MBB.getParent()->getInfo()->getShaderType() != + ShaderType::PIXEL || + !shouldSkip(&MBB, &MBB.getParent()->back())) return; MachineBasicBlock::iterator Insert = &MI; @@ -158,10 +168,10 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { .addImm(0) .addImm(1) .addImm(1) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0); + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); // ... and terminate wavefront BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); @@ -191,7 +201,8 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) { unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) .addReg(Src); // Saved EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) @@ -277,45 +288,173 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) { } void SILowerControlFlowPass::Branch(MachineInstr &MI) { - MachineBasicBlock *Next = MI.getParent()->getNextNode(); - MachineBasicBlock *Target = MI.getOperand(0).getMBB(); - if (Target == Next) + if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) MI.eraseFromParent(); - else - assert(0); + + // If these aren't equal, this is probably an infinite loop. } void SILowerControlFlowPass::Kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif + + // Clear this thread from the exec mask if the operand is negative + if ((Op.isImm() || Op.isFPImm())) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.isImm() ? (Op.getImm() & 0x80000000) : + Op.getFPImm()->isNegative()) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + .addImm(0) + .addOperand(Op); + } + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + MBB.insert(I, MovRel); + } else { + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VReg_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); - // Kill is only allowed in pixel shaders - assert(MBB.getParent()->getInfo()->ShaderType == - ShaderType::PIXEL); + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); - // Clear this pixel from the exec mask if the operand is negative - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) - .addReg(AMDGPU::SREG_LIT_0) - .addOperand(MI.getOperand(0)); + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + } MI.eraseFromParent(); } +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + unsigned Off = MI.getOperand(4).getImm(); + unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0); + if (!SubReg) + SubReg = Vec; + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(SubReg + Off) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0); + if (!SubReg) + SubReg = Dst; + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(SubReg + Off, RegState::Define) + .addReg(Val) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel); +} + bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + SIMachineFunctionInfo *MFI = MF.getInfo(); bool HaveKill = false; + bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); - I != MBB.end(); I = Next) { + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); - Next = llvm::next(I); MachineInstr &MI = *I; + if (TII->isDS(MI.getOpcode())) + NeedWQM = true; + + // Flat uses m0 in case it needs to access LDS. + if (TII->isFLAT(MI.getOpcode())) + NeedFlat = true; + switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: @@ -363,9 +502,72 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::S_BRANCH: Branch(MI); break; + + case AMDGPU::SI_INDIRECT_SRC: + IndirectSrc(MI); + break; + + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + IndirectDst(MI); + break; + + case AMDGPU::V_INTERP_P1_F32: + case AMDGPU::V_INTERP_P2_F32: + case AMDGPU::V_INTERP_MOV_F32: + NeedWQM = true; + break; } } } + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC).addReg(AMDGPU::EXEC); + } + + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + return true; }