#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
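+ // v2i64 / v2f64 are 128 bits wide, so they share the 128-bit scalar register
+ // class with v4i32.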
+ addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
setOperationAction(ISD::LOAD, MVT::i1, Custom);
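+ // Loads and stores of v2i64 are promoted to the already-legal v4i32 forms.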
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+       MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch(Op) {
case ISD::LOAD:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
}
}
+ // Most operations are naturally 32-bit vector operations. We only support
+ // load and store of i64 vectors, so promote v2i64 vector operations to
+ // v4i32.
+ for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+ }
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
return false;
}
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+ // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+ // additionally can do r + r + i with addr64. 32-bit has more addressing
+ // mode options. Depending on the resource constant, it can also do
+ // (i64 r0) + (i32 r1) * (i14 i).
+ //
+ // Private arrays end up using a scratch buffer most of the time, so also
+ // assume those use MUBUF instructions. Scratch loads / stores are currently
+ // implemented as mubuf instructions with offen bit set, so slightly
+ // different than the normal addr64.
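+ // A 12-bit unsigned immediate covers byte offsets 0..4095.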
+ if (!isUInt<12>(AM.BaseOffs))
+ return false;
+
+ // FIXME: Since we can split immediate into soffset and immediate offset,
+ // would it make sense to allow any immediate?
+
+ switch (AM.Scale) {
+ case 0: // r + i or just i, depending on HasBaseReg.
+ return true;
+ case 1:
+ return true; // We have r + r or r + i.
+ case 2:
+ if (AM.HasBaseReg) {
+ // Reject 2 * r + r.
+ return false;
+ }
+
+ // Allow 2 * r as r + r
+ // Or 2 * r + i is allowed as r + r + i.
+ return true;
+ default: // Don't allow n * r
+ return false;
+ }
+}
+
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
return false;
switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
- // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
- // additionally can do r + r + i with addr64. 32-bit has more addressing
- // mode options. Depending on the resource constant, it can also do
- // (i64 r0) + (i32 r1) * (i14 i).
- //
- // SMRD instructions have an 8-bit, dword offset.
- //
- // Assume nonunifom access, since the address space isn't enough to know
- // what instruction we will use, and since we don't know if this is a load
- // or store and scalar stores are only available on VI.
- //
- // We also know if we are doing an extload, we can't do a scalar load.
- //
- // Private arrays end up using a scratch buffer most of the time, so also
- // assume those use MUBUF instructions. Scratch loads / stores are currently
- // implemented as mubuf instructions with offen bit set, so slightly
- // different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
- return false;
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Assume we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffers < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
- // FIXME: Since we can split immediate into soffset and immediate offset,
- // would it make sense to allow any immediate?
+ return isLegalMUBUFAddressingMode(AM);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // If the offset isn't a multiple of 4, it probably isn't going to be
+ // correctly aligned.
+ if (AM.BaseOffs % 4 != 0)
+ return isLegalMUBUFAddressingMode(AM);
+
+ // There are no SMRD extloads, so if we have to do a small type access we
+ // will use a MUBUF load.
+ // FIXME?: We also need to do this if unaligned, but we don't know the
+ // alignment here.
+ if (DL.getTypeStoreSize(Ty) < 4)
+ return isLegalMUBUFAddressingMode(AM);
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // SMRD instructions have an 8-bit, dword offset on SI.
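+ // An 8-bit dword offset reaches byte offsets up to 255 * 4 = 1020.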
+ if (!isUInt<8>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ // On CI+, this can also be a 32-bit literal constant offset. If it fits
+ // in 8-bits, it can use a smaller encoding.
+ if (!isUInt<32>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // On VI, these use the SMEM format and the offset is 20-bit in bytes.
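+ // A 20-bit unsigned offset covers byte offsets 0..1048575.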
+ if (!isUInt<20>(AM.BaseOffs))
+ return false;
+ } else
+ llvm_unreachable("unhandled generation");
- switch (AM.Scale) {
- case 0: // r + i or just i, depending on HasBaseReg.
+ if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
- case 1:
- return true; // We have r + r or r + i.
- case 2:
- if (AM.HasBaseReg) {
- // Reject 2 * r + r.
- return false;
- }
- // Allow 2 * r as r + r
- // Or 2 * r + i is allowed as r + r + i.
+ if (AM.Scale == 1 && AM.HasBaseReg)
return true;
- default: // Don't allow n * r
- return false;
- }
+
+ return false;
}
+
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ return isLegalMUBUFAddressingMode(AM);
+
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
return false;
}
- case AMDGPUAS::FLAT_ADDRESS: {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
- }
+ case AMDGPUAS::FLAT_ADDRESS:
+ return isLegalFlatAddressingMode(AM);
+
default:
llvm_unreachable("unhandled address space");
}
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
- return Align % 4 == 0;
+ bool AlignedBy4 = (Align % 4 == 0);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+ return AlignedBy4;
}
// Smaller than dword value must be aligned.
return TII->isInlineConstant(Imm);
}
-static EVT toIntegerVT(EVT VT) {
- if (VT.isVector())
- return VT.changeVectorElementTypeToInteger();
- return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(
+     MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
unsigned Align = DL.getABITypeAlignment(Ty);
- if (VT != MemVT && VT.isFloatingPoint()) {
- // Do an integer load and convert.
- // FIXME: This is mostly because load legalization after type legalization
- // doesn't handle FP extloads.
- assert(VT.getScalarType() == MVT::f32 &&
- MemVT.getScalarType() == MVT::f16);
-
- EVT IVT = toIntegerVT(VT);
- EVT MemIVT = toIntegerVT(MemVT);
- SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
- IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
- SDValue Ops[] = {
- DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
- Load.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, SL);
- }
-
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
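+ // Extending FP loads have no signed/unsigned distinction, so they must use
+ // plain EXTLOAD rather than SEXTLOAD/ZEXTLOAD.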
+ if (MemVT.isFloatingPoint())
+ ExtTy = ISD::EXTLOAD;
+
return DAG.getLoad(ISD::UNINDEXED, ExtTy,
VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
false, // isVolatile
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return SDValue();
+ }
- assert(CallConv == CallingConv::C);
+ // FIXME: We currently assume all calling conventions are kernels.
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
assert((PSInputNum <= 15) && "Too many PS inputs!");
if (!Arg.Used) {
- // We can savely skip PS inputs
+ // We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
- // NOT four or eigth.
+ // NOT four or eight.
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- // The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
- if (Info->getShaderType() == ShaderType::COMPUTE) {
- if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
- else
- Info->NumUserSGPRs = 4;
-
- unsigned InputPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
- unsigned InputPtrRegLo =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned InputPtrRegHi =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchPtrRegLo =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned ScratchPtrRegHi =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- CCInfo.AllocateReg(InputPtrRegLo);
- CCInfo.AllocateReg(InputPtrRegHi);
- CCInfo.AllocateReg(ScratchPtrRegLo);
- CCInfo.AllocateReg(ScratchPtrRegHi);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
- MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
- }
-
if (Info->getShaderType() == ShaderType::COMPUTE) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
}
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
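+ // User SGPR inputs are allocated in the order they are added here, which
+ // matches the HSA user SGPR layout.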
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
Offset, Ins[i].Flags.isSExt());
Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
InVals.push_back(Val);
}
- if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
- AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
- Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("work group id x is always enabled");
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg
+ = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+ // Now that we've figured out where the scratch register inputs are, see if
+ // we should reserve the arguments and use them directly.
+
+ bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+ if (ST.isAmdHsaOS()) {
+ // TODO: Assume we will spill without optimizations.
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the HSA ABI, this will be the first 4 user SGPR
+ // inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ // We tentatively reserve the last registers (skipping the last two
+ // which may contain VCC). After register allocation, we'll replace
+ // these with the ones immediately after those which were really
+ // allocated. In the prologue, copies will be inserted from the argument
+ // to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
+
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("workitem id x should always be enabled");
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
if (Chains.empty())
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If
+ // the high bit of a frame index offset were to be set, this would mean
+ // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+ // scratch buffer, with 64 being the number of threads per wave.
+ //
+ // If we know the machine uses less than 128GB of scratch, then we can
+ // mark the high bit of the FrameIndex node as known zero,
+ // which is important, because it means in most situations we can
+ // prove that values derived from FrameIndex nodes are non-negative.
+ // This enables us to take advantage of more addressing modes when
+ // accessing scratch buffers, since for scratch reads/writes, the register
+ // offset must always be positive.
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ if (Subtarget->enableHugeScratchBuffer())
+ return TFI;
+
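+ // AssertZext to a 31-bit type records that bit 31 is known zero.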
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
// a glue result.
}
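+// Load a 32-bit kernel argument and assert that the bits above VT's width are
+// zero, so later combines can drop redundant zero-extensions and masks.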
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+ SDValue Op,
+ MVT VT,
+ unsigned Offset) const {
+ SDLoc SL(Op);
+ SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
+ // The local size values will have the high 16 bits as zero.
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+ DAG.getValueType(VT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Should this propagate fast-math-flags?
+
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
+
case Intrinsic::r600_read_ngroups_x:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
-
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::AMDGPU_read_workdim:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- getImplicitParameterOffset(MFI, GRID_DIM), false);
-
+ // Really only 2 bits.
+ return lowerImplicitZextParam(DAG, Op, MVT::i8,
+ getImplicitParameterOffset(MFI, GRID_DIM));
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
DAG.getConstant(2, DL, MVT::i32), // P0
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case AMDGPUIntrinsic::SI_packf16:
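+ // A pack of two undefined inputs is itself undefined; fold it to undef.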
+ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+ return DAG.getUNDEF(MVT::i32);
+ return Op;
case AMDGPUIntrinsic::SI_fs_interp: {
SDValue IJ = Op.getOperand(4);
SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
"Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned NumElements = Op.getValueType().getVectorNumElements();
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+
switch (Load->getAddressSpace()) {
default: break;
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
+ if (NumElements >= 8)
+ return SplitVectorLoad(Op, DAG);
+
// v4 loads are supported for private and global memory.
if (NumElements <= 4)
break;
// fall-through
case AMDGPUAS::LOCAL_ADDRESS:
- return ScalarizeVectorLoad(Op, DAG);
+ // If properly aligned, splitting might let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
}
}
if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
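+ // Mark the new multiply as unsafe-algebra so it keeps the fast-math
+ // freedom of the division it replaces.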
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
}
return SDValue();
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // TODO: Should this propagate fast-math-flags?
+
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
DAG.getConstantFP(0.5/M_PI, DL,
}
}
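+// LowerFrameIndex may wrap the TargetFrameIndex in an AssertZext, so look
+// through that wrapper before testing for a frame index node.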
+static bool isFrameIndexOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::AssertZext)
+ Op = Op.getOperand(0);
+
+ return isa<FrameIndexSDNode>(Op);
+}
+
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
- if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- TII->legalizeOperands(MI);
- if (TII->isMIMG(MI->getOpcode())) {
+ if (TII->isVOP3(MI->getOpcode())) {
+ // Make sure constant bus requirements are respected.
+ TII->legalizeOperandsVOP3(MRI, MI);
+ return;
+ }
+
+ if (TII->isMIMG(*MI)) {
unsigned VReg = MI->getOperand(0).getReg();
unsigned Writemask = MI->getOperand(1).getImm();
unsigned BitsSet = 0;
SDLoc DL,
SDValue Ptr) const {
const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
- // XXX - Workaround for moveToVALU not handling different register class
- // inserts for REG_SEQUENCE.
-
- // Build the half of the subregister with the constants.
- const SDValue Ops0[] = {
- DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, Ops0), 0);
-
- // Combine the constants and the pointer.
- const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
- SubRegHi,
- DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
- };
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
- const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+ };
-#endif
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to
+/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
- 0xffffffff; // Size
-
- return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {