From e3d4cbc7d25061441adafa47450a31571c87bf85 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 28 Jun 2013 15:47:08 +0000 Subject: [PATCH] R600: Add local memory support via LDS Reviewed-by: Vincent Lejeune git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185162 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 6 ++ lib/Target/R600/AMDGPUISelLowering.cpp | 23 +++++++ lib/Target/R600/AMDGPUISelLowering.h | 3 + lib/Target/R600/AMDGPUInstructions.td | 9 +++ lib/Target/R600/AMDGPUMachineFunction.cpp | 1 + lib/Target/R600/AMDGPUMachineFunction.h | 2 + lib/Target/R600/AMDILISelDAGToDAG.cpp | 15 +++-- lib/Target/R600/R600Defines.h | 6 +- lib/Target/R600/R600ISelLowering.cpp | 18 ++++- lib/Target/R600/R600InstrFormats.td | 43 ++++++++++-- lib/Target/R600/R600InstrInfo.cpp | 46 ++++++++++--- lib/Target/R600/R600InstrInfo.h | 11 +++ lib/Target/R600/R600Instructions.td | 75 +++++++++++++++++++++ lib/Target/R600/R600MachineScheduler.cpp | 12 +++- lib/Target/R600/R600Packetizer.cpp | 3 + lib/Target/R600/R600RegisterInfo.td | 3 +- lib/Target/R600/R600Schedule.td | 2 + test/CodeGen/R600/local-memory.ll | 82 +++++++++++++++++++++++ 18 files changed, 336 insertions(+), 24 deletions(-) create mode 100644 test/CodeGen/R600/local-memory.ll diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index f720c7ecb63..996d2a61221 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -130,6 +131,11 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { S_STACK_SIZE(MFI->StackSize), 4); OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if 
(MFI->ShaderType == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + } } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) { diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 6d7359053dc..4019a1f02ef 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" using namespace llvm; @@ -71,6 +72,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIVREM, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + int types[] = { (int)MVT::v2i32, (int)MVT::v4i32 @@ -138,6 +141,26 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) return Op; } +SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, + SDValue Op, + SelectionDAG &DAG) const { + + const DataLayout *TD = getTargetMachine().getDataLayout(); + GlobalAddressSDNode *G = cast(Op); + // XXX: What does the value of G->getOffset() mean? + assert(G->getOffset() == 0 && + "Do not know what to do with an non-zero offset"); + + unsigned Offset = MFI->LDSSize; + const GlobalValue *GV = G->getGlobal(); + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + + // XXX: Account for alignment? 
+ MFI->LDSSize += Size; + + return DAG.getConstant(Offset, MVT::i32); +} + SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 69a0ac95b29..d739a013242 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -20,6 +20,7 @@ namespace llvm { +class AMDGPUMachineFunction; class MachineRegisterInfo; class AMDGPUTargetLowering : public TargetLowering { @@ -36,6 +37,8 @@ protected: virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 29df37499a2..234bb994f37 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -94,6 +94,15 @@ def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast(N)); }]>; +def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def local_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 04610254029..9a1e3447e7e 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -10,6 +10,7 @@ const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType"; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo() { ShaderType = 
ShaderType::COMPUTE; + LDSSize = 0; AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index 21c8c51dae4..5d5df1249b4 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -23,6 +23,8 @@ private: public: AMDGPUMachineFunction(const MachineFunction &MF); unsigned ShaderType; + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; }; } diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 9f077b9db58..e79ab3c0db9 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -282,11 +282,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), AMDGPU::OpName::literal); - assert(ImmIdx != -1); + if (ImmIdx == -1) { + continue; + } - // subtract one from ImmIdx, because the DST operand is usually index - // 0 for MachineInstrs, but we have no DST in the Ops vector. - ImmIdx--; + if (TII->getOperandIdx(Use->getMachineOpcode(), + AMDGPU::OpName::dst) != -1) { + // subtract one from ImmIdx, because the DST operand is usually index + // 0 for MachineInstrs, but we have no DST in the Ops vector. + ImmIdx--; + } // Check that we aren't already using an immediate. 
// XXX: It's possible for an instruction to have more than one @@ -336,7 +341,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } if (Result && Result->isMachineOpcode() && !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) - && TII->isALUInstr(Result->getMachineOpcode())) { + && TII->hasInstrModifiers(Result->getMachineOpcode())) { // Fold FNEG/FABS/CONST_ADDRESS // TODO: Isel can generate multiple MachineInst, we need to recursively // parse Result diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 6bcf8aefa74..90fc29ce14d 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -42,7 +42,9 @@ namespace R600_InstFlag { OP2 = (1 << 11), VTX_INST = (1 << 12), TEX_INST = (1 << 13), - ALU_INST = (1 << 14) + ALU_INST = (1 << 14), + LDS_1A = (1 << 15), + LDS_1A1D = (1 << 16) }; } @@ -162,4 +164,6 @@ namespace OpName { #define R_028878_SQ_PGM_RESOURCES_GS 0x028878 #define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 +#define R_0288E8_SQ_LDS_ALLOC 0x0288E8 + #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index b898af13b7a..ce2aa9208f8 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -138,6 +138,19 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } + case AMDGPU::LDS_READ_RET: { + MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(MI->getOpcode()), + AMDGPU::OQAP); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, + MI->getOperand(0).getReg(), + AMDGPU::OQAP); + break; + } + case AMDGPU::MOV_IMM_F32: TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), MI->getOperand(1).getFPImm()->getValueAPF() @@ -456,6 +469,8 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( 
//===----------------------------------------------------------------------===// SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -463,14 +478,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { case AMDGPUIntrinsic::AMDGPU_store_output: { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); MFI->LiveOuts.push_back(Reg); diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td index 2c98fb90db5..2d72404702f 100644 --- a/lib/Target/R600/R600InstrFormats.td +++ b/lib/Target/R600/R600InstrFormats.td @@ -23,6 +23,8 @@ class InstR600 pattern, bits<2> FlagOperandIdx = 0; bit Op1 = 0; bit Op2 = 0; + bit LDS_1A = 0; + bit LDS_1A1D = 0; bit HasNativeOperands = 0; bit VTXInst = 0; bit TEXInst = 0; @@ -49,21 +51,21 @@ class InstR600 pattern, let TSFlags{12} = VTXInst; let TSFlags{13} = TEXInst; let TSFlags{14} = ALUInst; + let TSFlags{15} = LDS_1A; + let TSFlags{16} = LDS_1A1D; } //===----------------------------------------------------------------------===// // ALU instructions //===----------------------------------------------------------------------===// -class R600ALU_Word0 { +class R600_ALU_LDS_Word0 { field bits<32> 
Word0; bits<11> src0; - bits<1> src0_neg; bits<1> src0_rel; bits<11> src1; bits<1> src1_rel; - bits<1> src1_neg; bits<3> index_mode = 0; bits<2> pred_sel; bits<1> last; @@ -76,16 +78,23 @@ class R600ALU_Word0 { let Word0{8-0} = src0_sel; let Word0{9} = src0_rel; let Word0{11-10} = src0_chan; - let Word0{12} = src0_neg; let Word0{21-13} = src1_sel; let Word0{22} = src1_rel; let Word0{24-23} = src1_chan; - let Word0{25} = src1_neg; let Word0{28-26} = index_mode; let Word0{30-29} = pred_sel; let Word0{31} = last; } +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + class R600ALU_Word1 { field bits<32> Word1; @@ -138,6 +147,30 @@ class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ let Word1{17-13} = alu_inst; } +class R600LDS_Word1 { + field bits<32> Word1; + + bits<11> src2; + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + bits<1> src2_rel; + // offset specifies the stride offset to the second set of data to be read + // from. This is a dword offset. + bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle; + bits<6> lds_op; + bits<2> dst_chan = 0; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{30-29} = dst_chan; +} + + /* XXX: R600 subtarget uses a slightly different encoding than the other subtargets. 
We currently handle this in R600MCCodeEmitter, but we may diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 3b1a24015a8..f05390ec48c 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -136,6 +136,21 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { return (TargetFlags & R600_InstFlag::ALU_INST); } +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::LDS_1A) | + (TargetFlags & R600_InstFlag::LDS_1A1D)); +} + bool R600InstrInfo::isTransOnly(unsigned Opcode) const { return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY); } @@ -245,6 +260,9 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI, unsigned Reg = Srcs[i].first->getReg(); unsigned Index = RI.getEncodingValue(Reg) & 0xff; unsigned Chan = RI.getHWRegChan(Reg); + if (Reg == AMDGPU::OQAP) { + Result.push_back(std::pair(Index, 0)); + } if (Index > 127) { Result.push_back(DummyPair); continue; @@ -287,10 +305,11 @@ Swizzle(std::vector > Src, return Src; } -static bool -isLegal(const std::vector > > &IGSrcs, - const std::vector &Swz, - unsigned CheckedSize) { +bool +R600InstrInfo::isLegal( + const std::vector > > &IGSrcs, + const std::vector &Swz, + unsigned CheckedSize) const { int Vector[4][3]; memset(Vector, -1, sizeof(Vector)); for (unsigned i = 0; i < CheckedSize; i++) { @@ -300,6 +319,16 @@ isLegal(const std::vector > > &IGSrcs, const std::pair &Src = Srcs[j]; if (Src.first < 0) continue; + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Swz[i] != R600InstrInfo::ALU_VEC_012 && + Swz[i] != R600InstrInfo::ALU_VEC_021) { + // The value from output queue A (denoted by register 
OQAP) can + // only be fetched during the first cycle. + return false; + } + // OQAP does not count towards the normal read port restrictions + continue; + } if (Vector[Src.second][j] < 0) Vector[Src.second][j] = Src.first; if (Vector[Src.second][j] != Src.first) @@ -309,10 +338,11 @@ isLegal(const std::vector > > &IGSrcs, return true; } -static bool recursiveFitsFPLimitation( -const std::vector > > &IGSrcs, -std::vector &SwzCandidate, -unsigned Depth = 0) { +bool +R600InstrInfo::recursiveFitsFPLimitation( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + unsigned Depth) const { if (!isLegal(IGSrcs, SwzCandidate, Depth)) return false; if (IGSrcs.size() == Depth) diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 3c2e50be662..a3752889369 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -63,6 +63,8 @@ namespace llvm { /// \returns true if this \p Opcode represents an ALU instruction. bool isALUInstr(unsigned Opcode) const; + bool hasInstrModifiers(unsigned Opcode) const; + bool isLDSInstr(unsigned Opcode) const; bool isTransOnly(unsigned Opcode) const; bool isTransOnly(const MachineInstr *MI) const; @@ -82,6 +84,15 @@ namespace llvm { SmallVector, 3> getSrcs(MachineInstr *MI) const; + bool isLegal( + const std::vector > > &IGSrcs, + const std::vector &Swz, + unsigned CheckedSize) const; + bool recursiveFitsFPLimitation( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + unsigned Depth = 0) const; + /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 /// returns true and the first (in lexical order) BankSwizzle affectation /// starting from the one already provided in the Instruction Group MIs that diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index f42501a2fd0..fd585f8b7e9 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1529,6 +1529,81 @@ def GROUP_BARRIER : 
InstR600 < let ALUInst = 1; } +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS op, dag outs, dag ins, string asm, + list pattern = []> : + + InstR600 , + R600_ALU_LDS_Word0, + R600LDS_Word1 { + + bits<6> offset = 0; + let lds_op = op; + + let Word1{27} = offset{0}; + let Word1{12} = offset{1}; + let Word1{28} = offset{2}; + let Word1{31} = offset{3}; + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; + let HasNativeOperands = 1; + let UseNamedOperandTable = 1; +} + +class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last OQAP, $src0$src0_rel $pred_sel", + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let Defs = [OQAP]; + let usesCustomInserter = 1; + let LDS_1A = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A1D lds_op, string name, list pattern> : + R600_LDS < + lds_op, + (outs), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel", + pattern + > { + + let src2 = 0; + let src2_rel = 0; + let LDS_1A1D = 1; +} + +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] +>; + +def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE", + [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] +>; + // TRUNC is used for the FLT_TO_INT instructions to work around a // perceived problem where the rounding modes are applied differently // depending on the instruction and the 
slot they are in. diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index acc1b4d6ee3..7e28f9dde47 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -278,6 +278,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { return AluT_XYZW; } + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } + // Is the result already assigned to a channel ? unsigned DestSubReg = MI->getOperand(0).getSubReg(); switch (DestSubReg) { @@ -371,14 +375,18 @@ void R600SchedStrategy::PrepareNextSlot() { } void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - unsigned DestReg = MI->getOperand(0).getReg(); + int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (DstIndex == -1) { + return; + } + unsigned DestReg = MI->getOperand(DstIndex).getReg(); // PressureRegister crashes if an operand is def and used in the same inst // and we try to constraint its regclass for (MachineInstr::mop_iterator It = MI->operands_begin(), E = MI->operands_end(); It != E; ++It) { MachineOperand &MO = *It; if (MO.isReg() && !MO.isDef() && - MO.getReg() == MI->getOperand(0).getReg()) + MO.getReg() == DestReg) return; } // Constrains the regclass of DestReg to assign it to Slot diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 4c72d229675..6fc15deb5c9 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -92,6 +92,9 @@ private: Result[Dst] = AMDGPU::PV_X; continue; } + if (Dst == AMDGPU::OQAP) { + continue; + } unsigned PVReg = 0; switch (TRI.getHWRegChan(Dst)) { case 0: diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index a8b9b70322c..60a93e3f6c2 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -101,6 +101,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : 
R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; def AR_X : R600Reg<"AR.x", 0>; +def OQAP : R600Reg<"OQAP", 221>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 480))>; @@ -170,7 +171,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_ArrayBase, R600_Addr, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM + ALU_CONST, ALU_PARAM, OQAP )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td index 207233d7e76..df62bf85c0a 100644 --- a/lib/Target/R600/R600Schedule.td +++ b/lib/Target/R600/R600Schedule.td @@ -23,6 +23,7 @@ def TRANS : FuncUnit; def AnyALU : InstrItinClass; def VecALU : InstrItinClass; def TransALU : InstrItinClass; +def XALU : InstrItinClass; def R600_VLIW5_Itin : ProcessorItineraries < [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], @@ -31,6 +32,7 @@ def R600_VLIW5_Itin : ProcessorItineraries < InstrItinData]>, InstrItinData]>, InstrItinData]>, + InstrItinData]>, InstrItinData]> ] >; diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll new file mode 100644 index 00000000000..0ff38483087 --- /dev/null +++ b/test/CodeGen/R600/local-memory.ll @@ -0,0 +1,82 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + + +@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4 + +; CHECK: @local_memory + +; Check that the LDS size is emitted correctly +; CHECK: .long 166120 +; CHECK-NEXT: .long 16 + +; CHECK: LDS_WRITE + +; GROUP_BARRIER must be the last instruction in a clause +; CHECK: GROUP_BARRIER +; CHECK-NEXT: ALU clause + +; CHECK: LDS_READ_RET + +define void @local_memory(i32 addrspace(1)* %out) { +entry: + %y.i = call i32 @llvm.r600.read.tidig.x() #0 + %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 + %add = add nsw i32 %y.i, 1 + %cmp = icmp eq i32 %add, 16 + %.add = select i1 %cmp, i32 0, i32 %add + call void @llvm.AMDGPU.barrier.local() + %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add + %0 = load i32 addrspace(3)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 +@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 + +; CHECK: @local_memory_two_objects + +; Check that the LDS size is emitted correctly +; CHECK: .long 166120 +; CHECK-NEXT: .long 8 + +; Make sure the lds writes are using different addresses. +; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]] +; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]] + +; GROUP_BARRIER must be the last instruction in a clause +; CHECK: GROUP_BARRIER +; CHECK-NEXT: ALU clause + +; Make sure the lds reads are using different addresses.
+; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] +; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] + +define void @local_memory_two_objects(i32 addrspace(1)* %out) { +entry: + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i + store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 + %mul = shl nsw i32 %x.i, 1 + %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i + store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 + %sub = sub nsw i32 3, %x.i + call void @llvm.AMDGPU.barrier.local() + %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub + %0 = load i32 addrspace(3)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i + store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub + %1 = load i32 addrspace(3)* %arrayidx4, align 4 + %add = add nsw i32 %x.i, 4 + %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() + +attributes #0 = { readnone } -- 2.34.1