X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FR600%2FSIISelLowering.cpp;h=9844151cab59b5b9b902d556ddf18b481a8d9de9;hp=40f082723afeaae5fd0de0e063dff00fe39da438;hb=c848b1bbcf88ab5d8318d990612fb1fda206ea3d;hpb=68db37b952be497c94c7aa98cf26f3baadb5afd3 diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 40f082723af..9844151cab5 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "AMDILIntrinsicInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -24,34 +25,25 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" -const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM) { - addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::v1i32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::VSrc_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VSrc_128RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -61,15 +53,55 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : computeRegisterProperties(); + // Condition Codes + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETULE, MVT::f64, Expand); + setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); - - setOperationAction(ISD::BITCAST, MVT::i128, Legal); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + // We need to custom lower loads/stores from private memory + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + + setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Promote); + AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -79,23 +111,106 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + MVT VecTypes[] = { + MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 + }; + + const size_t NumVecTypes = array_lengthof(VecTypes); + for (unsigned Type = 0; Type < NumVecTypes; ++Type) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch(Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::CONCAT_VECTORS: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + break; + default: + setOperationAction(Op, VecTypes[Type], Expand); + break; + } + } + } - setTargetDAGCombine(ISD::SELECT_CC); + for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) { + MVT::SimpleValueType VT = static_cast(I); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setSchedulingPreference(Sched::RegPressure); @@ -106,27 +221,69 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : //===----------------------------------------------------------------------===// bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, bool *IsFast) const { + if (IsFast) + *IsFast = false; + // XXX: This depends on the address space and also we may want to revist // the alignment values we specify in the DataLayout. + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + if (!VT.isSimple() || VT == MVT::Other) + return false; + + // XXX - CI changes say "Support for unaligned memory accesses" but I don't + // see what for specifically. The wording everywhere else seems to be the + // same. + + // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have + // no alignment restrictions. + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { + // Using any pair of GPRs should be the same as any other pair. + if (IsFast) + *IsFast = true; + return VT.bitsGE(MVT::i64); + } + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. + if (IsFast) + *IsFast = true; return VT.bitsGT(MVT::i32); } +bool SITargetLowering::shouldSplitVectorType(EVT VT) const { + return VT.getScalarType().bitsLE(MVT::i16); +} -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, +bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + return TII->isInlineConstant(Imm); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, - unsigned Offset) const { + unsigned Offset, bool Signed) const { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), AMDGPUAS::CONSTANT_ADDRESS); - EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits()); SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getLoad(VT, DL, Chain, Ptr, - MachinePointerInfo(UndefValue::get(PtrTy)), - false, false, false, ArgVT.getSizeInBits() >> 3); + return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, + MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, + false, false, MemVT.getSizeInBits() >> 3); } @@ -153,7 +310,8 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { + if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() && + !Arg.Flags.isByVal()) { assert((PSInputNum <= 15) && "Too many PS inputs!"); @@ -184,7 +342,7 @@ SDValue SITargetLowering::LowerFormalArguments( NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else { + } else if (Info->ShaderType != ShaderType::COMPUTE) { Splits.push_back(Arg); } } @@ -207,6 +365,11 @@ SDValue SITargetLowering::LowerFormalArguments( MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); } + if (Info->ShaderType == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + AnalyzeFormalArguments(CCInfo, Splits); for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { @@ -221,10 +384,13 @@ SDValue SITargetLowering::LowerFormalArguments( EVT VT = VA.getLocVT(); if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; // The first 36 bytes of the input buffer contains information about // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset()); + SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), + 36 + VA.getLocMemOffset(), + Ins[i].Flags.isSExt()); InVals.push_back(Arg); continue; } @@ -289,16 +455,16 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( static_cast(getTargetMachine().getInstrInfo()); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); - unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); + unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); + unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) .addOperand(MI->getOperand(1)); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) .addImm(0); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) - .addImm(RSRC_DATA_FORMAT >> 32); + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) .addReg(SubRegHiLo) .addImm(AMDGPU::sub0) @@ -327,6 +493,19 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } + case AMDGPU::SI_RegisterStorePseudo: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), + Reg); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) + MIB.addOperand(MI->getOperand(i)); + + MI->eraseFromParent(); + } } return BB; } @@ -370,8 +549,28 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: { + LoadSDNode *Load = dyn_cast(Op); + if (Op.getValueType().isVector() && + (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || + (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + Op.getValueType().getVectorNumElements() > 4))) { + SDValue MergedValues[2] = { + SplitVectorLoad(Op, DAG), + Load->getChain() + }; + return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); + } else { + return LowerLOAD(Op, DAG); + } + } + + case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::ANY_EXTEND: // Fall-through case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { @@ -384,23 +583,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); @@ -421,13 +620,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { AMDGPU::VGPR2, VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops [] = { - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2) }; - MachineMemOperand *MMO = new MachineMemOperand(MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getSizeInBits() / 8, 4); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, 2, VT, MMO); } @@ -441,11 +641,48 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } + + case ISD::INTRINSIC_VOID: + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDLoc DL(Op); + SDValue Ops [] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getSizeInBits() / 8, 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, + sizeof(Ops)/sizeof(Ops[0]), VT, MMO); + } + default: + break; + } } return SDValue(); } @@ -463,7 +700,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { if (I->getOpcode() == Opcode) return *I; } - return 0; + return nullptr; } /// This transforms the control flow intrinsics to get the branch destination as @@ -475,7 +712,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); - SDNode *BR = 0; + SDNode *BR = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine @@ -508,7 +745,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // build the new intrinsic call SDNode *Result = DAG.getNode( Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + DAG.getVTList(Res), Ops.data(), Ops.size()).getNode(); if (BR) { // Give the branch instruction our target @@ -544,18 +781,47 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } -SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, - SelectionDAG &DAG) const { +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast(Op); + SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + SDValue MergedValues[2]; + MergedValues[1] = Load->getChain(); + if (Ret.getNode()) { + MergedValues[0] = Ret; + return DAG.getMergeValues(MergedValues, 2, DL); + } - if (Op.getValueType() == MVT::i128) { - return Op; + if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); } - assert(Op.getOpcode() == ISD::UNDEF); + EVT MemVT = Load->getMemoryVT(); + + assert(!MemVT.isVector() && "Private loads should be scalarized"); + assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int"); + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, MVT::i32)); + Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Load->getChain(), Ptr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + if (MemVT.getSizeInBits() == 64) { + SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(1, MVT::i32)); + + SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Load->getChain(), IncPtr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper); + } + + MergedValues[0] = Ret; + return DAG.getMergeValues(MergedValues, 2, DL); - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, - DAG.getConstant(0, MVT::i64), - DAG.getConstant(0, MVT::i64)); } SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, @@ -563,10 +829,37 @@ SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, SelectionDAG &DAG) const { return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), - ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(3), Op.getOperand(4)); } +SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i64) + return SDValue(); + + SDLoc DL(Op); + SDValue Cond = Op.getOperand(0); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue One = DAG.getConstant(1, MVT::i32); + + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); + + SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); + SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); + + SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); + + SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); + SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); + + SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); + + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); +} + SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -601,6 +894,87 @@ SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); } +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast(Op); + EVT VT = Store->getMemoryVT(); + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return SplitVectorStore(Op, DAG); + + if (VT == MVT::i1) + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + + if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + return SDValue(); + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(), + DAG.getConstant(2, MVT::i32)); + SDValue Chain = Store->getChain(); + SmallVector Values; + + if (Store->isTruncatingStore()) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Store->getBasePtr(), + DAG.getConstant(0, MVT::i32)); + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(), + DAG.getConstant(0x3, MVT::i32)); + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, MVT::i32)); + SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(), + DAG.getConstant(Mask, MVT::i32)); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32, + DAG.getConstant(32, MVT::i32), ShiftAmt); + SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32, + DAG.getConstant(Mask, MVT::i32), + RotrAmt); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + + Values.push_back(Dst); + } else if (VT == MVT::i64) { + for (unsigned i = 0; i < 2; ++i) { + Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + Store->getValue(), DAG.getConstant(i, MVT::i32))); + } + } else if (VT == MVT::i128) { + for (unsigned i = 0; i < 2; ++i) { + for (unsigned j = 0; j < 2; ++j) { + Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, + Store->getValue(), DAG.getConstant(i, MVT::i32)), + DAG.getConstant(j, MVT::i32))); + } + } + } else { + Values.push_back(Store->getValue()); + } + + for (unsigned i = 0; i < Values.size(); ++i) { + SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, + Ptr, DAG.getConstant(i, MVT::i32)); + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Values[i], PartPtr, + DAG.getTargetConstant(0, MVT::i32)); + } + return Chain; +} + + SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -610,8 +984,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, return SDValue(); } - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() != MVT::i32) + Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); } //===----------------------------------------------------------------------===// @@ -625,9 +1003,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: break; + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SELECT_CC: { - N->dump(); ConstantSDNode *True, *False; // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) if ((True = dyn_cast(N->getOperand(2))) @@ -645,7 +1022,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue Arg0 = N->getOperand(0); SDValue Arg1 = N->getOperand(1); SDValue CC = N->getOperand(2); - ConstantSDNode * C = NULL; + ConstantSDNode * C = nullptr; ISD::CondCode CCOp = dyn_cast(CC)->get(); // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) @@ -692,9 +1069,11 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { return -1; } Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast(N)) { + if (N->getValueType(0) != MVT::f32) + return -1; Imm.F = Node->getValueAPF().convertToFloat(); - else + } else return -1; // It isn't an immediate if ((Imm.I >= -16 && Imm.I <= 64) || @@ -714,7 +1093,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, MachineSDNode *Mov = dyn_cast(Operand); const SIInstrInfo *TII = static_cast(getTargetMachine().getInstrInfo()); - if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + if (!Mov || !TII->isMov(Mov->getMachineOpcode())) return false; const SDValue &Op = Mov->getOperand(0); @@ -761,7 +1140,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( } return TRI.getPhysRegClass(Reg); } - default: return NULL; + default: return nullptr; } } const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); @@ -821,7 +1200,7 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, else return; - // Nothing todo if they fit naturaly + // Nothing to do if they fit naturally if (fitsRegClass(DAG, Operand, RegClass)) return; @@ -831,8 +1210,8 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, return; } - // This is a conservative aproach, it is possible that we can't determine - // the correct register class and copy too often, but better save than sorry. + // This is a conservative aproach. It is possible that we can't determine the + // correct register class and copy too often, but better safe than sorry. SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Operand.getValueType(), Operand, RC); @@ -865,14 +1244,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // Commuted opcode if available int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); + const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); assert(!DescRev || DescRev->getNumDefs() == NumDefs); assert(!DescRev || DescRev->getNumOperands() == NumOps); // e64 version if available, -1 otherwise int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); assert(!DescE64 || DescE64->getNumDefs() == NumDefs); assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); @@ -942,12 +1321,10 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, fitsRegClass(DAG, Ops[1], OtherRegClass))) { // Swap commutable operands - SDValue Tmp = Ops[1]; - Ops[1] = Ops[0]; - Ops[0] = Tmp; + std::swap(Ops[0], Ops[1]); Desc = DescRev; - DescRev = 0; + DescRev = nullptr; continue; } } @@ -968,7 +1345,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, Immediate = -1; Promote2e64 = true; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } } @@ -1009,7 +1386,9 @@ static unsigned SubIdx2Lane(unsigned Idx) { void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { SDNode *Users[4] = { }; - unsigned Writemask = 0, Lane = 0; + unsigned Lane = 0; + unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned NewDmask = 0; // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); @@ -1020,29 +1399,42 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) return; + // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Note that subregs are packed, i.e. Lane==0 is the first bit set + // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit + // set, etc. Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + // Set which texture component corresponds to the lane. + unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { + assert(Dmask); + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } + // Abort if we have more than one user per component if (Users[Lane]) return; Users[Lane] = *I; - Writemask |= 1 << Lane; + NewDmask |= 1 << Comp; } - // Abort if all components are used - if (Writemask == 0xf) + // Abort if there's no change + if (NewDmask == OldDmask) return; // Adjust the writemask in the node std::vector Ops; - Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32)); + Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(Node->getOperand(i)); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); // If we only got one lane, replace it with a copy - if (Writemask == (1U << Lane)) { + // (if NewDmask has only one bit set...) + if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Users[Lane]->getValueType(0), @@ -1106,6 +1498,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MRI.setRegClass(VReg, RC); }