From e5fcc0dee4b41658986047f346201ad98757e7d5 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Mon, 3 Jun 2013 17:40:18 +0000
Subject: [PATCH] R600/SI: Add support for work item and work group intrinsics

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183138 91177308-0d34-0410-b5e6-96231b3b80d8
---
Notes (text between the "---" line and the first "diff --git" line is not
part of the commit message):

 * SITargetLowering::LowerParameter() emits a zero-extending load of type VT
   from the constant address space at (SGPR0_SGPR1 + Offset), where
   SGPR0_SGPR1 is the live-in register pair holding the input buffer pointer.
 * Byte offsets read for the size intrinsics: ngroups.{x,y,z} = 0/4/8,
   global.size.{x,y,z} = 12/16/20, local.size.{x,y,z} = 24/28/32.
   Kernel arguments are therefore loaded from 36 + VA.getLocMemOffset().
 * tgid.{x,y,z} are read from SReg_32 registers NumUserSGPRs + 0/1/2, with
   NumUserSGPRs hard-coded to 2; tidig.{x,y,z} are read from VGPR0/1/2.
 * AMDGPUTargetLowering::CreateLiveInRegister() becomes virtual; the SI
   override calls it and returns a CopyFromReg (chained to the entry node) of
   the register it produced.
 * NOTE(review): this copy of the patch had lost its line breaks and the
   explicit template arguments of the two llvm::cast calls.  Both were
   restored (cast<ConstantSDNode>, cast<RegisterSDNode>).  Leading whitespace
   was reconstructed and may not match the tree byte-for-byte -- apply with
   "git apply --ignore-whitespace" if context lines fail to match.

 lib/Target/R600/AMDGPUISelLowering.h      |   5 +-
 lib/Target/R600/SIISelLowering.cpp        |  94 ++++++++--
 lib/Target/R600/SIISelLowering.h          |   4 +
 test/CodeGen/R600/work-item-intrinsics.ll | 211 ++++++++++++++++++++++
 4 files changed, 299 insertions(+), 15 deletions(-)
 create mode 100644 test/CodeGen/R600/work-item-intrinsics.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index b6547ea62f6..69a0ac95b29 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -33,8 +33,9 @@ protected:
   /// MachineFunction.
   ///
   /// \returns a RegisterSDNode representing Reg.
-  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
-                               unsigned Reg, EVT VT) const;
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
 
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 65d5479adcb..5dca0ca00c8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -76,6 +76,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
 
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
   setTargetDAGCombine(ISD::SELECT_CC);
 
   setTargetDAGCombine(ISD::SETCC);
@@ -83,6 +85,23 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setSchedulingPreference(Sched::RegPressure);
 }
 
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
+                                         SDLoc DL, SDValue Chain,
+                                         unsigned Offset) const {
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                        AMDGPUAS::CONSTANT_ADDRESS);
+  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
+  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
+                         MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
+  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                            DAG.getConstant(Offset, MVT::i64));
+  return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
+                        MachinePointerInfo(UndefValue::get(PtrTy)),
+                        VT, false, false, ArgVT.getSizeInBits() >> 3);
+
+}
+
 SDValue SITargetLowering::LowerFormalArguments(
                                       SDValue Chain,
                                       CallingConv::ID CallConv,
@@ -153,12 +172,11 @@ SDValue SITargetLowering::LowerFormalArguments(
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
-  unsigned ArgReg = 0;
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
   if (Info->ShaderType == ShaderType::COMPUTE) {
     CCInfo.AllocateReg(AMDGPU::SGPR0);
     CCInfo.AllocateReg(AMDGPU::SGPR1);
-    ArgReg = MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
   }
 
   AnalyzeFormalArguments(CCInfo, Splits);
@@ -175,17 +193,10 @@ SDValue SITargetLowering::LowerFormalArguments(
     EVT VT = VA.getLocVT();
 
     if (VA.isMemLoc()) {
-      assert(ArgReg);
-      PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                            AMDGPUAS::CONSTANT_ADDRESS);
-      EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
-      SDValue BasePtr = DAG.getCopyFromReg(DAG.getRoot(), DL,
-                                           ArgReg, MVT::i64);
-      SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
-                               DAG.getConstant(VA.getLocMemOffset(), MVT::i64));
-      SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), Ptr,
-                                   MachinePointerInfo(UndefValue::get(PtrTy)),
-                                   VA.getValVT(), false, false, ArgVT.getSizeInBits() >> 3);
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes.
+      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+                                   36 + VA.getLocMemOffset());
       InVals.push_back(Arg);
       continue;
     }
@@ -293,6 +304,54 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    SDLoc DL(Op);
+    //XXX: Hardcoded we only use two to store the pointer to the parameters.
+    unsigned NumUserSGPRs = 2;
+    switch (IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case Intrinsic::r600_read_ngroups_x:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+    case Intrinsic::r600_read_ngroups_y:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+    case Intrinsic::r600_read_ngroups_z:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+    case Intrinsic::r600_read_global_size_x:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+    case Intrinsic::r600_read_global_size_y:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+    case Intrinsic::r600_read_global_size_z:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+    case Intrinsic::r600_read_local_size_x:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+    case Intrinsic::r600_read_local_size_y:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+    case Intrinsic::r600_read_local_size_z:
+      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+    case Intrinsic::r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+    case Intrinsic::r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+    case Intrinsic::r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+    case Intrinsic::r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR0, VT);
+    case Intrinsic::r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR1, VT);
+    case Intrinsic::r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR2, VT);
+
+    }
+  }
   }
   return SDValue();
 }
@@ -933,3 +992,12 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
     }
   }
 }
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                               const TargetRegisterClass *RC,
+                                               unsigned Reg, EVT VT) const {
+  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
+                            cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 5288a40f5b6..9b263b9c2c5 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,6 +24,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
   const SIInstrInfo * TII;
   const TargetRegisterInfo * TRI;
 
+  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
+                         SDValue Chain, unsigned Offset) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -59,6 +61,8 @@ public:
                                              SDNode *Node) const;
 
   int32_t analyzeImmediate(const SDNode *N) const;
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                               unsigned Reg, EVT VT) const;
 };
 
 } // End namespace llvm
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
new file mode 100644
index 00000000000..46e3e54b9a2
--- /dev/null
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -0,0 +1,211 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+
+; R600-CHECK: @ngroups_x
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 0
+; SI-CHECK: @ngroups_x
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 0
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @ngroups_x (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.ngroups.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @ngroups_y
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 4
+; SI-CHECK: @ngroups_y
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 1
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @ngroups_y (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.ngroups.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @ngroups_z
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 8
+; SI-CHECK: @ngroups_z
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @ngroups_z (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.ngroups.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @global_size_x
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 12
+; SI-CHECK: @global_size_x
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @global_size_x (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.global.size.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @global_size_y
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 16
+; SI-CHECK: @global_size_y
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @global_size_y (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.global.size.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @global_size_z
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 20
+; SI-CHECK: @global_size_z
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 5
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @global_size_z (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.global.size.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @local_size_x
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 24
+; SI-CHECK: @local_size_x
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 6
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @local_size_x (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @local_size_y
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 28
+; SI-CHECK: @local_size_y
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 7
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @local_size_y (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @local_size_z
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
+; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 32
+; SI-CHECK: @local_size_z
+; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 8
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @local_size_z (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; The tgid values are stored in SGPRs offset by the number of user SGPRs.
+; Currently we always use exactly 2 user SGPRs for the pointer to the
+; kernel arguments, but this may change in the future.
+
+; SI-CHECK: @tgid_x
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR2
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @tgid_x (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tgid.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK: @tgid_y
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR3
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @tgid_y (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tgid.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK: @tgid_z
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR4
+; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+define void @tgid_z (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tgid.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK: @tidig_x
+; SI-CHECK: BUFFER_STORE_DWORD VGPR0
+define void @tidig_x (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK: @tidig_y
+; SI-CHECK: BUFFER_STORE_DWORD VGPR1
+define void @tidig_y (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-CHECK: @tidig_z
+; SI-CHECK: BUFFER_STORE_DWORD VGPR2
+define void @tidig_z (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.ngroups.x() #0
+declare i32 @llvm.r600.read.ngroups.y() #0
+declare i32 @llvm.r600.read.ngroups.z() #0
+
+declare i32 @llvm.r600.read.global.size.x() #0
+declare i32 @llvm.r600.read.global.size.y() #0
+declare i32 @llvm.r600.read.global.size.z() #0
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+declare i32 @llvm.r600.read.tgid.x() #0
+declare i32 @llvm.r600.read.tgid.y() #0
+declare i32 @llvm.r600.read.tgid.z() #0
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.r600.read.tidig.z() #0
+
+attributes #0 = { readnone }
-- 
2.34.1