setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
return false;
}
-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+ // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+ // additionally can do r + r + i with addr64. 32-bit has more addressing
+ // mode options. Depending on the resource constant, it can also do
+ // (i64 r0) + (i32 r1) * (i14 i).
+ //
+ // Private arrays end up using a scratch buffer most of the time, so also
+ // assume those use MUBUF instructions. Scratch loads / stores are currently
+ // implemented as mubuf instructions with offen bit set, so slightly
+ // different than the normal addr64.
+ if (!isUInt<12>(AM.BaseOffs))
+ return false;
+
+ // FIXME: Since we can split immediate into soffset and immediate offset,
+ // would it make sense to allow any immediate?
+
+ switch (AM.Scale) {
+ case 0: // r + i or just i, depending on HasBaseReg.
+ return true;
+ case 1:
+ return true; // We have r + r or r + i.
+ case 2:
+ if (AM.HasBaseReg) {
+ // Reject 2 * r + r.
+ return false;
+ }
+
+ // Allow 2 * r as r + r
+ // Or 2 * r + i is allowed as r + r + i.
+ return true;
+ default: // Don't allow n * r
+ return false;
+ }
+}
+
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
- // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
- // additionally can do r + r + i with addr64. 32-bit has more addressing
- // mode options. Depending on the resource constant, it can also do
- // (i64 r0) + (i32 r1) * (i14 i).
- //
- // SMRD instructions have an 8-bit, dword offset.
- //
- // Assume nonunifom access, since the address space isn't enough to know
- // what instruction we will use, and since we don't know if this is a load
- // or store and scalar stores are only available on VI.
- //
- // We also know if we are doing an extload, we can't do a scalar load.
- //
- // Private arrays end up using a scratch buffer most of the time, so also
- // assume those use MUBUF instructions. Scratch loads / stores are currently
- // implemented as mubuf instructions with offen bit set, so slightly
- // different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
- return false;
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Assume the we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
- // FIXME: Since we can split immediate into soffset and immediate offset,
- // would it make sense to allow any immediate?
+ return isLegalMUBUFAddressingMode(AM);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // If the offset isn't a multiple of 4, it probably isn't going to be
+ // correctly aligned.
+ if (AM.BaseOffs % 4 != 0)
+ return isLegalMUBUFAddressingMode(AM);
+
+ // There are no SMRD extloads, so if we have to do a small type access we
+ // will use a MUBUF load.
+ // FIXME?: We also need to do this if unaligned, but we don't know the
+ // alignment here.
+ if (DL.getTypeStoreSize(Ty) < 4)
+ return isLegalMUBUFAddressingMode(AM);
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // SMRD instructions have an 8-bit, dword offset on SI.
+ if (!isUInt<8>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ // On CI+, this can also be a 32-bit literal constant offset. If it fits
+ // in 8-bits, it can use a smaller encoding.
+ if (!isUInt<32>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+ if (!isUInt<20>(AM.BaseOffs))
+ return false;
+ } else
+ llvm_unreachable("unhandled generation");
- switch (AM.Scale) {
- case 0: // r + i or just i, depending on HasBaseReg.
+ if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
- case 1:
- return true; // We have r + r or r + i.
- case 2:
- if (AM.HasBaseReg) {
- // Reject 2 * r + r.
- return false;
- }
- // Allow 2 * r as r + r
- // Or 2 * r + i is allowed as r + r + i.
+ if (AM.Scale == 1 && AM.HasBaseReg)
return true;
- default: // Don't allow n * r
- return false;
- }
+
+ return false;
}
+
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ return isLegalMUBUFAddressingMode(AM);
+
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
return false;
}
- case AMDGPUAS::FLAT_ADDRESS: {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
- }
+ case AMDGPUAS::FLAT_ADDRESS:
+ return isLegalFlatAddressingMode(AM);
+
default:
llvm_unreachable("unhandled address space");
}
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
- return Align % 4 == 0;
+ bool AlignedBy4 = (Align % 4 == 0);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+ return AlignedBy4;
}
// Smaller than dword value must be aligned.
return TII->isInlineConstant(Imm);
}
-static EVT toIntegerVT(EVT VT) {
- if (VT.isVector())
- return VT.changeVectorElementTypeToInteger();
- return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
- const DataLayout *DL = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
- SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL->getABITypeAlignment(Ty);
-
- if (VT != MemVT && VT.isFloatingPoint()) {
- // Do an integer load and convert.
- // FIXME: This is mostly because load legalization after type legalization
- // doesn't handle FP extloads.
- assert(VT.getScalarType() == MVT::f32 &&
- MemVT.getScalarType() == MVT::f16);
-
- EVT IVT = toIntegerVT(VT);
- EVT MemIVT = toIntegerVT(MemVT);
- SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
- IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
- return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
- }
+ unsigned Align = DL.getABITypeAlignment(Ty);
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (MemVT.isFloatingPoint())
+ ExtTy = ISD::EXTLOAD;
+
return DAG.getLoad(ISD::UNINDEXED, ExtTy,
VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
false, // isVolatile
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- assert(CallConv == CallingConv::C);
+ // FIXME: We currently assume all calling conventions are kernels.
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
assert((PSInputNum <= 15) && "Too many PS inputs!");
if (!Arg.Used) {
- // We can savely skip PS inputs
+ // We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
- // NOT four or eigth.
+ // NOT four or eight.
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
}
// The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
+ // The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
if (Subtarget->isAmdHsaOS())
Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
AnalyzeFormalArguments(CCInfo, Splits);
+ SmallVector<SDValue, 16> Chains;
+
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
&AMDGPU::SReg_64RegClass);
Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
- InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
continue;
}
for (unsigned j = 1; j != NumElements; ++j) {
Reg = ArgLocs[ArgIdx++].getLocReg();
Reg = MF.addLiveIn(Reg, RC);
- Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
}
// Fill up the missing vector elements
}
if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
+ unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
- return Chain;
+
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
return true;
}
-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}
-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
return MVT::i32;
}
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If
+ // the high bit of a frame index offset were to be set, this would mean
+ // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+ // scratch buffer, with 64 being the number of threads per wave.
+ //
+ // If we know the machine uses less than 128GB of scratch, then we can
+ // amrk the high bit of the FrameIndex node as known zero,
+ // which is important, because it means in most situations we can
+ // prove that values derived from FrameIndex nodes are non-negative.
+ // This enables us to take advantage of more addressing modes when
+ // accessing scratch buffers, since for scratch reads/writes, the register
+ // offset must always be positive.
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ if (Subtarget->enableHugeScratchBuffer())
+ return TFI;
+
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
SDLoc DL(Op);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Should this propagate fast-math-flags?
+
switch (IntrinsicID) {
case Intrinsic::r600_read_ngroups_x:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::AMDGPU_read_workdim:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
- false);
+ getImplicitParameterOffset(MFI, GRID_DIM), false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
}
return SDValue();
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // TODO: Should this propagate fast-math-flags?
+
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
DAG.getConstantFP(0.5/M_PI, DL,
unsigned AS = Load->getAddressSpace();
unsigned Align = Load->getAlignment();
Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
// Don't try to replace the load if we have to expand it due to alignment
// problems. Otherwise we will end up scalarizing the load, and trying to
}
}
+static bool isFrameIndexOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::AssertZext)
+ Op = Op.getOperand(0);
+
+ return isa<FrameIndexSDNode>(Op);
+}
+
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
- if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
TII->legalizeOperands(MI);
- if (TII->isMIMG(MI->getOpcode())) {
+ if (TII->isMIMG(*MI)) {
unsigned VReg = MI->getOperand(0).getReg();
unsigned Writemask = MI->getOperand(1).getImm();
unsigned BitsSet = 0;
SDLoc DL,
SDValue Ptr) const {
const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
- // XXX - Workaround for moveToVALU not handling different register class
- // inserts for REG_SEQUENCE.
-
- // Build the half of the subregister with the constants.
- const SDValue Ops0[] = {
- DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, Ops0), 0);
-
- // Combine the constants and the pointer.
- const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
- SubRegHi,
- DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
- };
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
- const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+ };
-#endif
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to
+/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr,
SDValue Ptr) const {
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
- 0xffffffff; // Size
- return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
+ return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
}
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,