setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
return false;
}
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Assume the we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
+ // fall-through
case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
return false;
}
- case AMDGPUAS::FLAT_ADDRESS: {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
- }
+ case AMDGPUAS::FLAT_ADDRESS:
+ return isLegalFlatAddressingMode(AM);
+
default:
llvm_unreachable("unhandled address space");
}
true, // isNonTemporal
true, // isInvariant
Align); // Alignment
- return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
+ Load.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
AnalyzeFormalArguments(CCInfo, Splits);
+ SmallVector<SDValue, 16> Chains;
+
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
&AMDGPU::SReg_64RegClass);
Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
- InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
continue;
}
for (unsigned j = 1; j != NumElements; ++j) {
Reg = ArgLocs[ArgIdx++].getLocReg();
Reg = MF.addLiveIn(Reg, RC);
- Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
}
// Fill up the missing vector elements
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
- return Chain;
+
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If
+ // the high bit of a frame index offset were to be set, this would mean
+ // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+ // scratch buffer, with 64 being the number of threads per wave.
+ //
+ // If we know the machine uses less than 128GB of scratch, then we can
+ // amrk the high bit of the FrameIndex node as known zero,
+ // which is important, because it means in most situations we can
+ // prove that values derived from FrameIndex nodes are non-negative.
+ // This enables us to take advantage of more addressing modes when
+ // accessing scratch buffers, since for scratch reads/writes, the register
+ // offset must always be positive.
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ if (Subtarget->enableHugeScratchBuffer())
+ return TFI;
+
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
}
}
+static bool isFrameIndexOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::AssertZext)
+ Op = Op.getOperand(0);
+
+ return isa<FrameIndexSDNode>(Op);
+}
+
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
- if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}