From 328080423746398da1c44e679df6f9010374296a Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 21 Jul 2014 15:45:01 +0000 Subject: [PATCH] R600/SI: Use scratch memory for large private arrays git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213551 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 24 ++++- lib/Target/R600/AMDGPUAsmPrinter.h | 2 + lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 107 +++++++++++++++++++++- lib/Target/R600/AMDGPUISelLowering.h | 15 +-- lib/Target/R600/AMDGPUInstructions.td | 26 ++++++ lib/Target/R600/AMDGPURegisterInfo.h | 2 +- lib/Target/R600/AMDGPUTargetMachine.cpp | 2 +- lib/Target/R600/SIDefines.h | 4 + lib/Target/R600/SIISelLowering.cpp | 94 +++++++++++++++++-- lib/Target/R600/SIISelLowering.h | 1 + lib/Target/R600/SIInstrInfo.cpp | 21 ++++- lib/Target/R600/SIInstrInfo.h | 5 +- lib/Target/R600/SIInstrInfo.td | 57 ++++++++---- lib/Target/R600/SIInstructions.td | 86 ++++++++++++----- lib/Target/R600/SIMachineFunctionInfo.cpp | 3 +- lib/Target/R600/SIMachineFunctionInfo.h | 1 + lib/Target/R600/SIRegisterInfo.cpp | 48 +++++++++- lib/Target/R600/SIRegisterInfo.h | 19 ++++ test/CodeGen/R600/array-ptr-calc-i32.ll | 9 +- test/CodeGen/R600/gv-const-addrspace.ll | 19 ++++ test/CodeGen/R600/indirect-private-64.ll | 40 ++++---- test/CodeGen/R600/private-memory.ll | 16 ++-- test/CodeGen/R600/work-item-intrinsics.ll | 10 +- 23 files changed, 507 insertions(+), 104 deletions(-) diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 257f72e5ce6..73faaa18358 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -25,6 +25,7 @@ #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -141,6 +142,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), false); + OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo(); OutStreamer.emitRawComment( @@ -332,6 +335,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Do not clamp NAN to 0. ProgInfo.DX10Clamp = 0; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + ProgInfo.CodeLen = CodeSize; } @@ -361,6 +367,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, unsigned LDSBlocks = RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // scratch memory used per thread. 
+ unsigned ScratchBlocks = + RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); @@ -377,7 +392,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + const uint32_t ComputePGMRSrc2 = + S_00B84C_LDS_SIZE(LDSBlocks) | + S_00B02C_SCRATCH_EN(ScratchBlocks > 0); + + OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + + OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); } else { OutStreamer.EmitIntValue(RsrcReg, 4); OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index fc2d58915e7..19907cfd013 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -32,6 +32,7 @@ private: DX10Clamp(0), DebugMode(0), IEEEMode(0), + ScratchSize(0), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. @@ -43,6 +44,7 @@ private: uint32_t DX10Clamp; uint32_t DebugMode; uint32_t IEEEMode; + uint32_t ScratchSize; // Bonus information for debugging. uint64_t CodeLen; diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index b4d79e5754e..cc17b7ec618 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -16,9 +16,13 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "SIDefines.h" #include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -85,7 +89,13 @@ private: bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, - SDValue &ImmOffset) const; + SDValue &ImmOffset) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; SDNode *SelectADD_SUB_I64(SDNode *N); SDNode *SelectDIV_SCALE(SDNode *N); @@ -730,6 +740,10 @@ static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { Ptr), 0); } +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &ImmOffset) const { @@ -740,7 +754,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast(N1); - if (isUInt<12>(C1->getZExtValue())) { + if (isLegalMUBUFImmOffset(C1)) { if (N0.getOpcode() == ISD::ADD) { // (add (add N2, N3), C1) @@ -776,6 +790,95 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, return true; } +/// \brief 
Return a resource descriptor with the 'Add TID' bit enabled
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to the
+/// resource pointer.
+static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+
+  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+                  0xffffffff;
+
+  SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+  SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+  SDValue DataLo = DAG->getTargetConstant(
+      Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
+  SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
+
+  const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
+  return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
+                                     MVT::v4i32, Ops), 0);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+                                            SDValue &VAddr, SDValue &SOffset,
+                                            SDValue &ImmOffset) const {
+
+  SDLoc DL(Addr);
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned ScratchPtrReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+  unsigned ScratchOffsetReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+  Rsrc = buildScratchRSRC(CurDAG, DL,
+                          CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                              MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
+  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+
+  // (add n0, c1)
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    if (isLegalMUBUFImmOffset(C1)) {
+      VAddr = Addr.getOperand(0);
+      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+      return true;
+    }
+  }
+
+  // (add FI, n0)
+  if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+      isa<FrameIndexSDNode>(Addr.getOperand(0))) {
+    VAddr = Addr.getOperand(1);
+    ImmOffset = Addr.getOperand(0);
+    return true;
+  }
+
+  // (FI)
+  if (isa<FrameIndexSDNode>(Addr)) {
+    VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+                                           CurDAG->getConstant(0, MVT::i32)), 0);
+    ImmOffset = Addr;
+    return true;
+  }
+
+  // (node)
+  VAddr = Addr;
+  ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset, SDValue &Offen,
+                                           SDValue &Idxen, SDValue &GLC,
+                                           SDValue &SLC, SDValue &TFE) const {
+
+  GLC = CurDAG->getTargetConstant(0, MVT::i1);
+  SLC = CurDAG->getTargetConstant(0, MVT::i1);
+  TFE = CurDAG->getTargetConstant(0, MVT::i1);
+
+  Idxen = CurDAG->getTargetConstant(0, MVT::i1);
+  Offen = CurDAG->getTargetConstant(1, MVT::i1);
+
+  return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 0865645e077..624d4e0c196 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -71,13 +71,6 @@ protected:
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
   static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
 
-  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
-  /// MachineFunction.
-  ///
-  /// \returns a RegisterSDNode representing Reg.
-  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
-                                       const TargetRegisterClass *RC,
-                                       unsigned Reg, EVT VT) const;
   virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                                      SelectionDAG &DAG) const;
   /// \brief Split a vector load into multiple scalar loads.
@@ -160,6 +153,14 @@ public:
                                      SDValue Op,
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
+
+  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+  /// MachineFunction.
+  ///
+  /// \returns a RegisterSDNode representing Reg.
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
 };
 
 namespace AMDGPUISD {
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index b4b351c7f96..cd3560378e5 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -41,6 +41,8 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
 def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
+let OperandType = "OPERAND_IMMEDIATE" in {
+
 def u32imm : Operand <i32> {
   let PrintMethod = "printU32ImmOperand";
 }
@@ -53,6 +55,8 @@ def u8imm : Operand <i8> {
   let PrintMethod = "printU8ImmOperand";
 }
 
+} // End OperandType = "OPERAND_IMMEDIATE"
+
 //===--------------------------------------------------------------------===//
 // Custom Operands
 //===--------------------------------------------------------------------===//
@@ -136,6 +140,28 @@ def COND_NULL : PatLeaf <
 // Load/Store Pattern Fragments
 //===----------------------------------------------------------------------===//
 
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def extloadi8_private : PrivateLoad <extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+def extloadi16_private : PrivateLoad <extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
 def global_store : PatFrag<(ops node:$val, node:$ptr),
     (store node:$val, node:$ptr), [{
         return isGlobalStore(dyn_cast<StoreSDNode>(N));
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 4731595d4f7..46aa7a17dfc 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   unsigned getSubRegFromChannel(unsigned Channel) const;
 
   const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
-  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;
   unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 6a78b177e96..23beb2576ac 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -52,7 +52,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
   std::string Ret = "e-p:32:32";
 
   if (ST.is64bit()) {
-    // 32-bit private, local, and region pointers. 64-bit global and constant.
+    // 32-bit local and region pointers. 64-bit private, global, and constant.
     Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
   }
 
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 4d31a118593..b7e7a2d000b 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -32,6 +32,7 @@ enum {
 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
 #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
 #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
+#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
 #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
 
 #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
@@ -85,4 +86,7 @@ enum {
 #define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
 #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
 
+#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
+#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
 #endif // SIDEFINES_H_
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index b2a8f1a9cb4..540b3c74d33 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
+    Info->NumUserSGPRs = 4;
     CCInfo.AllocateReg(AMDGPU::SGPR0);
     CCInfo.AllocateReg(AMDGPU::SGPR1);
+    CCInfo.AllocateReg(AMDGPU::SGPR2);
+    CCInfo.AllocateReg(AMDGPU::SGPR3);
     MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
   }
 
   if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_BUFFER_RSRC: {
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned Args[4];
+    for (unsigned i = 0, e = 4; i < e; ++i) {
+      MachineOperand &Arg = MI->getOperand(i + 1);
+
+      if (Arg.isReg()) {
+        Args[i] = Arg.getReg();
+        continue;
+      }
+
+      assert(Arg.isImm());
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
+              .addImm(Arg.getImm());
+      Args[i] = Reg;
+    }
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+            SuperReg)
+            .addReg(Args[0])
+            .addImm(AMDGPU::sub0)
+            .addReg(Args[1])
+            .addImm(AMDGPU::sub1)
+            .addReg(Args[2])
+            .addImm(AMDGPU::sub2)
+            .addReg(Args[3])
+            .addImm(AMDGPU::sub3);
+    MI->eraseFromParent();
+    break;
+  }
   case AMDGPU::V_SUB_F64: {
     unsigned DestReg = MI->getOperand(0).getReg();
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
@@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     EVT VT = Op.getValueType();
     SDLoc DL(Op);
-    //XXX: Hardcoded we only use two to store the pointer to the parameters.
- unsigned NumUserSGPRs = 2; switch (IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case Intrinsic::r600_read_ngroups_x: @@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); + AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT); + AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT); + AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, AMDGPU::VGPR0, VT); @@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { return nullptr; } +SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + FrameIndexSDNode *FINode = cast(Op); + unsigned FrameIndex = FINode->getIndex(); + + CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32); + + return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast(Op); + // Vector private memory loads have already been split, and + // all the rest of private memory loads are legal. + if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG); if (Lowered.getNode()) return Lowered; @@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { VT.getVectorElementType() == MVT::i32) return SDValue(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + } + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); if (Ret.getNode()) return Ret; @@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, // This is a conservative aproach. It is possible that we can't determine the // correct register class and copy too often, but better safe than sorry. - SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); - SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), - Operand.getValueType(), Operand, RC); + + SDNode *Node; + // We can't use COPY_TO_REGCLASS with FrameIndex arguments. + if (isa(Operand)) { + unsigned Opcode = Operand.getValueType() == MVT::i32 ? 
+ AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(), + Operand); + } else { + SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); + Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), + Operand.getValueType(), Operand, RC); + } Operand = SDValue(Node, 0); } @@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); } continue; + } else { + // If it's not a VSrc or SSrc operand check if we have a GlobalAddress. + // These will be lowered to immediates, so we will need to insert a MOV. + if (isa(Ops[i])) { + SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(), + Operand.getValueType(), Operand); + Ops[i] = SDValue(Node, 0); + } } if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index 6c2c00989ff..d106d4abb18 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -27,6 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 15c9a5e1cdc..59f10b6ded1 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -561,6 +561,21 @@ static bool compareMachineOp(const MachineOperand &Op0, } } +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isFPImm()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + return RI.regClassCanUseImmediate(OpInfo.RegClass); +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -589,7 +604,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } break; case MCOI::OPERAND_IMMEDIATE: - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) { + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. 
+ if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() && + !MI->getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index cae6ffff5c4..e8b6b6d69f8 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -106,6 +106,9 @@ public: bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const override; @@ -181,7 +184,7 @@ namespace AMDGPU { int getMCOpcode(uint16_t Opcode, unsigned Gen); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - + const uint64_t RSRC_TID_ENABLE = 1LL << 55; } // End namespace AMDGPU diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index c94cce7bc8b..52049b00f54 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -163,7 +163,9 @@ def sopp_brtarget : Operand { // Complex patterns //===----------------------------------------------------------------------===// +def MUBUFAddr32 : ComplexPattern; def MUBUFAddr64 : ComplexPattern; +def MUBUFScratch : ComplexPattern; //===----------------------------------------------------------------------===// // SI assembler operands @@ -605,12 +607,12 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass, asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; } - let offen = 1, idxen = 0, offset = 0 in { + let offen = 1, idxen = 0 in { def _OFFEN : MUBUF ; + asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>; } let offen = 0, idxen = 1 in { @@ -640,25 +642,40 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass, } } -class MUBUF_Store_Helper op, string name, RegisterClass vdataClass, - ValueType store_vt, SDPatternOperator st> : - MUBUF { +multiclass MUBUF_Store_Helper op, string name, RegisterClass vdataClass, + ValueType store_vt, SDPatternOperator st> { - let mayLoad = 0; - let mayStore = 1; + def "" : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, + u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc, + i1imm:$tfe), + name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe", + [] + > { + let addr64 = 0; + } - // Encoding - let offen = 0; - let idxen = 0; - let glc = 0; - let addr64 = 1; - let lds = 0; - let slc = 0; - let tfe = 0; - let soffset = 128; // ZERO + def _ADDR64 : MUBUF < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), + name#" $vdata, $srsrc + $vaddr + $offset", + [(st store_vt:$vdata, + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let offen = 0; + let idxen = 0; + let glc = 0; + let addr64 = 1; + let lds = 0; + let slc = 0; + let tfe = 0; + let soffset = 128; // ZERO + } } class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 291b5376849..aecd847a2ba 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -872,23 +872,23 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load >; -def 
BUFFER_STORE_BYTE : MUBUF_Store_Helper < +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global >; -def BUFFER_STORE_SHORT : MUBUF_Store_Helper < +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global >; -def BUFFER_STORE_DWORD : MUBUF_Store_Helper < +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store >; -def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store >; -def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store >; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; @@ -1667,6 +1667,12 @@ def SI_ADDR64_RSRC : InstSI < "", [] >; +def SI_BUFFER_RSRC : InstSI < + (outs SReg_128:$srsrc), + (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi), + "", [] +>; + def V_SUB_F64 : InstSI < (outs VReg_64:$dst), (ins VReg_64:$src0, VReg_64:$src1), @@ -2410,7 +2416,7 @@ def : Ext32Pat ; // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range @@ -2599,22 +2605,30 @@ multiclass MUBUFLoad_Pattern ; + } -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; + +class MUBUFScratchLoadPat : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword ; @@ -2660,6 +2674,34 @@ defm : MUBUF_Load_Dword ; +class MUBUFScratchStorePat : Pat < + (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset, i1imm:$offen, i1imm:$idxen, + i1imm:$glc, i1imm:$slc, i1imm:$tfe)), + (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen, + $glc, $slc, $tfe) +>; + +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; + +/* +class MUBUFStore_Pattern : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; + +*/ + //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index e2df950fd27..c53a7e10d54 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ 
b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -27,7 +27,8 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), PSInputAddr(0), - SpillTracker() { } + SpillTracker(), + NumUserSGPRs(0) { } static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 96e619bde8d..9684d285cec 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -59,6 +59,7 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); unsigned PSInputAddr; struct RegSpillTracker SpillTracker; + unsigned NumUserSGPRs; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index d0b677a41f2..009f24fd63c 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -16,6 +16,10 @@ #include "SIRegisterInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; @@ -27,8 +31,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); - TII->reserveIndirectRegisters(Reserved, MF); return Reserved; } @@ -37,6 +39,30 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return RC->getNumRegs(); } +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false); + } +} + const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { @@ -141,3 +167,21 @@ bool SIRegisterInfo::regClassCanUseImmediate( const TargetRegisterClass *RC) const { return regClassCanUseImmediate(RC->getID()); } + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + return 
AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index c9305fbad40..5d0235c0f42 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + /// \brief get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; @@ -68,6 +74,19 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns True if operands defined with this register class can accept /// inline immediates. bool regClassCanUseImmediate(const TargetRegisterClass *RC) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + }; } // End namespace llvm diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll index 2ddc61f3972..a2b69782351 100644 --- a/test/CodeGen/R600/array-ptr-calc-i32.ll +++ b/test/CodeGen/R600/array-ptr-calc-i32.ll @@ -11,15 +11,18 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; SI-LABEL: @test_private_array_ptr_calc: -; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]] - -; SI-ALLOCA: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]] +; FIXME: We end up with zero argument for ADD, because +; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index +; with the appropriate offset. We should fold this into the store. +; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]] ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this ; alloca to a vector. 
It currently fails because it does not know how ; to interpret: ; getelementptr [4 x i32]* %alloca, i32 1, i32 %b +; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]] ; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]] define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %alloca = alloca [4 x i32], i32 4, align 16 diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll index 074d9087ee6..e0ac317f998 100644 --- a/test/CodeGen/R600/gv-const-addrspace.ll +++ b/test/CodeGen/R600/gv-const-addrspace.ll @@ -76,3 +76,22 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 ret void } + +define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +entry: + %0 = icmp eq i32 0, %a + br i1 %0, label %if, label %else + +if: + %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %2 = load float addrspace(2)* %1 + store float %2, float addrspace(1)* %out + br label %endif + +else: + store float 1.0, float addrspace(1)* %out + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll index 2f628458387..00331e6696d 100644 --- a/test/CodeGen/R600/indirect-private-64.ll +++ b/test/CodeGen/R600/indirect-private-64.ll @@ -6,10 +6,10 @@ declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind ; SI-LABEL: @private_access_f64_alloca: -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 +; SI-ALLOCA: BUFFER_STORE_DWORDX2 +; FIXME: We should be able to use BUFFER_LOAD_DWORDX2 +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD ; SI-PROMOTE: DS_WRITE_B64 ; SI-PROMOTE: DS_READ_B64 @@ -26,10 +26,12 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-LABEL: @private_access_v2f64_alloca: -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 +; SI-ALLOCA: BUFFER_STORE_DWORDX4 +; FIXME: We should be able to use BUFFER_LOAD_DWORDX4 +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD ; SI-PROMOTE: DS_WRITE_B32 ; SI-PROMOTE: DS_WRITE_B32 @@ -52,10 +54,10 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out ; SI-LABEL: @private_access_i64_alloca: -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 +; SI-ALLOCA: BUFFER_STORE_DWORDX2 +; FIXME: We should be able to use BUFFER_LOAD_DWORDX2 +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD ; SI-PROMOTE: DS_WRITE_B64 ; SI-PROMOTE: DS_READ_B64 @@ -72,14 +74,12 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-LABEL: @private_access_v2i64_alloca: -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELD_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 -; SI-ALLOCA: V_MOVRELS_B32_e32 +; SI-ALLOCA: BUFFER_STORE_DWORDX4 +; FIXME: We should be able to use BUFFER_LOAD_DWORDX4 +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD +; SI-ALLOCA: BUFFER_LOAD_DWORD ; 
SI-PROMOTE: DS_WRITE_B32
 ; SI-PROMOTE: DS_WRITE_B32
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 1f34b115c70..3ce8c2cb03d 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -16,12 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI-PROMOTE: DS_READ_B32
 ; SI-PROMOTE: DS_READ_B32
-; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
-; SI-ALLOCA: V_MOVRELD
-; SI-ALLOCA: S_CBRANCH
-; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
-; SI-ALLOCA: V_MOVRELD
-; SI-ALLOCA: S_CBRANCH
+; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
 
 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -120,7 +116,9 @@ for.end:
 
 ; R600: MOVA_INT
 
-; SI-PROMOTE: V_MOVRELS_B32_e32
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i16]
@@ -139,8 +137,8 @@ entry:
 
 ; R600: MOVA_INT
 
-; SI: V_OR_B32_e32 v{{[0-9]}}, 0x100
-; SI: V_MOVRELS_B32_e32
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i8]
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 90079b005bb..01236590742 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -127,12 +127,12 @@ entry:
   ret void
 }
 
-; The tgid values are stored in ss offset by the number of user ss.
-; Currently we always use exactly 2 user ss for the pointer to the
+; The tgid values are stored in sgprs offset by the number of user sgprs.
+; Currently we always use exactly 2 user sgprs for the pointer to the
 ; kernel arguments, but this may change in the future.
 
 ; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
@@ -142,7 +142,7 @@ entry:
 }
 
 ; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
@@ -152,7 +152,7 @@ entry:
 }
 
 ; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
-- 
2.34.1
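
Note on the scratch sizing in EmitProgramInfoSI: KernelInfo.ScratchSize is bytes per thread, while COMPUTE_TMPRING_SIZE.WAVESIZE is programmed per wave in 1024-byte (256-dword) granules. The standalone C++ sketch below mirrors that arithmetic outside the LLVM tree; computeScratchBlocks and roundUpToAlignment are hypothetical stand-ins for the code above and for llvm::RoundUpToAlignment.

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::RoundUpToAlignment (power-of-two Align only).
static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// Bytes of scratch per thread -> per-wave block count, as in
// EmitProgramInfoSI: scale by the wavefront size, round up to a
// 1024-byte block, then convert to blocks.
static uint32_t computeScratchBlocks(uint64_t ScratchSizePerThread,
                                     unsigned WavefrontSize) {
  const unsigned ScratchAlignShift = 10; // 1 << 10 == 1024 bytes
  return roundUpToAlignment(ScratchSizePerThread * WavefrontSize,
                            1ull << ScratchAlignShift) >> ScratchAlignShift;
}

int main() {
  // 68 bytes per thread on a 64-wide wave: 4352 bytes, rounded up to 5 blocks.
  assert(computeScratchBlocks(68, 64) == 5);
  // The result must fit the 13-bit field masked by S_00B860_WAVESIZE.
  assert(computeScratchBlocks(68, 64) <= 0x1FFF);
  return 0;
}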
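The resource descriptor built by buildScratchRSRC can also be checked on the host. This sketch assembles the same four dwords from the two constants the patch adds to SIInstrInfo.h; buildRsrcWords and the example base address are invented for illustration. Per the comment on buildScratchRSRC, the stride lives in bits [61:48] of the full descriptor, i.e. in the high bits of the 64-bit base pointer, so it travels through the pointer half untouched here.

#include <cassert>
#include <cstdint>

// Constants as defined in SIInstrInfo.h by this patch.
static const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL;
static const uint64_t RSRC_TID_ENABLE  = 1ULL << 55;

// Upper half of the descriptor: num_records = ~0 lands in dword 2, the
// data format and the 'Add TID' bit in dword 3, matching buildScratchRSRC.
static void buildRsrcWords(uint64_t ScratchBase, uint32_t Words[4]) {
  const uint64_t Hi = RSRC_DATA_FORMAT | RSRC_TID_ENABLE | 0xffffffffULL;
  Words[0] = uint32_t(ScratchBase);        // base pointer, low dword
  Words[1] = uint32_t(ScratchBase >> 32);  // base pointer high + stride bits
  Words[2] = uint32_t(Hi);                 // num_records == 0xffffffff
  Words[3] = uint32_t(Hi >> 32);           // data format | ADD_TID enable
}

int main() {
  uint32_t W[4];
  buildRsrcWords(0x123456789abcULL, W); // hypothetical scratch base
  assert(W[2] == 0xffffffffu);
  // ADD_TID (bit 55 of the upper half) becomes bit 23 of dword 3.
  assert(W[3] == ((0xf00000000000ULL >> 32) | (1u << 23)));
  return 0;
}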
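SelectMUBUFScratch's offset folding reduces, for a plain scalar address, to one rule: a base-plus-constant address keeps the constant in the instruction's 12-bit immediate field only when it fits, otherwise the whole sum stays in the per-thread VGPR offset. The sketch below is a simplified model under that assumption; the real selector works on SelectionDAG nodes and handles frame indices separately, and splitScratchAddress is a hypothetical name.

#include <cassert>
#include <cstdint>

// Mirrors isLegalMUBUFImmOffset: MUBUF immediate offsets are unsigned 12-bit.
static bool isLegalMUBUFImmOffset(uint64_t Imm) {
  return Imm < (1u << 12);
}

struct ScratchAddress {
  uint32_t VAddr;     // per-thread offset carried in the VGPR operand
  uint16_t ImmOffset; // immediate offset field of the instruction
};

static ScratchAddress splitScratchAddress(uint32_t Base, uint32_t Offset) {
  if (isLegalMUBUFImmOffset(Offset))
    return { Base, uint16_t(Offset) }; // (add n0, c1) with a legal offset
  return { Base + Offset, 0 };         // too wide: keep it in the address
}

int main() {
  assert(splitScratchAddress(0x100, 16).ImmOffset == 16);
  assert(splitScratchAddress(0x100, 4096).ImmOffset == 0); // needs 13 bits
  assert(splitScratchAddress(0x100, 4096).VAddr == 0x100 + 4096);
  return 0;
}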
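Finally, the new SIRegisterInfo::eliminateFrameIndex path makes one decision: rewrite the frame index to the object's byte offset when the operand slot accepts an immediate, otherwise materialize the offset into a scavenged VGPR with V_MOV_B32 first. A toy model of that decision follows; the types, the scavenger stub, and resolveFrameIndex are invented for the sketch and stand in for MachineOperand, RegScavenger, and the pass logic.

#include <cstdint>
#include <cstdio>

enum class OperandSlot { Immediate, VGPR };

struct ResolvedOperand {
  bool IsImm;
  int64_t Imm;  // valid when IsImm
  unsigned Reg; // valid when !IsImm
};

static unsigned scavengeVGPR() { return 42; } // stand-in for RegScavenger

static ResolvedOperand resolveFrameIndex(OperandSlot Slot, int64_t Offset) {
  if (Slot == OperandSlot::Immediate)
    return { true, Offset, 0 };       // FIOp.ChangeToImmediate(Offset)
  unsigned Tmp = scavengeVGPR();
  std::printf("V_MOV_B32_e32 v%u, %lld\n", Tmp, (long long)Offset);
  return { false, 0, Tmp };           // FIOp.ChangeToRegister(Tmp, false)
}

int main() {
  resolveFrameIndex(OperandSlot::Immediate, 16); // folds into the instruction
  resolveFrameIndex(OperandSlot::VGPR, 16);      // prints the V_MOV it needs
  return 0;
}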