setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+ // spaces, so it is custom lowered to handle those where it isn't.
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
switch (MI->getOpcode()) {
default:
- if (TII->isLDSInstr(MI->getOpcode()) &&
- TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) {
+ // Replace LDS_*_RET instruction that don't have any uses with the
+ // equivalent LDS_*_NORET instruction.
+ if (TII->isLDSRetInstr(MI->getOpcode())) {
int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) {
- NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()),
- AMDGPU::OQAP);
- TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- AMDGPU::OQAP);
- } else {
- NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
- }
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+ return BB;
+
+ NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+ TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
NewMI.addOperand(MI->getOperand(i));
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+ unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
.addOperand(MI->getOperand(0))
// Instruction is left unmodified if its not the last one of its type
bool isLastInstructionOfItsType = true;
unsigned InstExportType = MI->getOperand(1).getImm();
- for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
+ for (MachineBasicBlock::iterator NextExportInst = std::next(I),
EndBlock = BB->end(); NextExportInst != EndBlock;
- NextExportInst = llvm::next(NextExportInst)) {
+ NextExportInst = std::next(NextExportInst)) {
if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
unsigned CurrentInstExportType = NextExportInst->getOperand(1)
}
}
}
- bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
+ bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
if (!EOP && !isLastInstructionOfItsType)
return BB;
unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
DL, MVT::f32, SDValue(interp, 0));
}
-
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
RegisterJNode, RegisterINode);
return SDValue(interp, slot % 2);
}
+ case AMDGPUIntrinsic::R600_interp_xy:
+ case AMDGPUIntrinsic::R600_interp_zw: {
+ int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ MachineSDNode *interp;
+ SDValue RegisterINode = Op.getOperand(2);
+ SDValue RegisterJNode = Op.getOperand(3);
+
+ if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+ RegisterJNode, RegisterINode);
+ else
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+ RegisterJNode, RegisterINode);
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
+ SDValue(interp, 0), SDValue(interp, 1));
+ }
case AMDGPUIntrinsic::R600_tex:
case AMDGPUIntrinsic::R600_texc:
case AMDGPUIntrinsic::R600_txl:
false, false, false, 0);
}
-SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
- MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
-
- FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
- assert(FIN);
-
- unsigned FrameIndex = FIN->getIndex();
- unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
-}
-
bool R600TargetLowering::isZero(SDValue Op) const {
if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
return Cst->isNullValue();
HWFalse = DAG.getConstant(0, CompareVT);
}
else {
- assert(!"Unhandled value type in LowerSELECT_CC");
+ llvm_unreachable("Unhandled value type in LowerSELECT_CC");
}
// Lower this unsupported SELECT_CC into a combination of two supported
DAG.getCondCode(ISD::SETNE));
}
-/// LLVM generates byte-addresed pointers. For indirect addressing, we need to
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
Ptr, DAG.getConstant(2, MVT::i32)));
if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- assert(!"Truncated and indexed stores not supported yet");
+ llvm_unreachable("Truncated and indexed stores not supported yet");
} else {
Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
return SDValue();
}
+ SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+ if (Ret.getNode()) {
+ return Ret;
+ }
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
+ SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Ret.getNode()) {
+ SDValue Ops[2];
+ Ops[0] = Ret;
+ Ops[1] = Chain;
+ return DAG.getMergeValues(Ops, 2, DL);
+ }
+
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
}
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
- if (ConstantBlock > -1) {
+ if (ConstantBlock > -1 &&
+ ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+ (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
SDValue Result;
- if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
- dyn_cast<Constant>(LoadNode->getSrcValue()) ||
- dyn_cast<ConstantSDNode>(Ptr)) {
+ if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
+ isa<Constant>(LoadNode->getSrcValue()) ||
+ isa<ConstantSDNode>(Ptr)) {
SDValue Slots[4];
for (unsigned i = 0; i < 4; i++) {
// We want Const position encoded with the following formula :
}
Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
} else {
- // non constant ptr cant be folded, keeps it as a v4f32 load
+ // non-constant ptr can't be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
return DAG.getMergeValues(MergedValues, 2, DL);
}
- // For most operations returning SDValue() will result int he node being
- // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so
- // we need to manually expand loads that may be legal in some address spaces
- // and illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported
- // for compute shaders, since the data is sign extended when it is uploaded
- // to the buffer. Howerver SEXT loads from other addresspaces are not
- // supported, so we need to expand them here.
+ // For most operations returning SDValue() will result in the node being
+ // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+ // need to manually expand loads that may be legal in some address spaces and
+ // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+ // compute shaders, since the data is sign extended when it is uploaded to the
+ // buffer. However SEXT loads from other address spaces are not supported, so
+ // we need to expand them here.
if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
EVT MemVT = LoadNode->getMemoryVT();
assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
- AnalyzeFormalArguments(CCInfo, Ins);
+ SmallVector<ISD::InputArg, 8> LocalIns;
+
+ getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+ LocalIns);
+
+ AnalyzeFormalArguments(CCInfo, LocalIns);
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
CCValAssign &VA = ArgLocs[i];
- EVT VT = VA.getLocVT();
+ EVT VT = Ins[i].VT;
+ EVT MemVT = LocalIns[i].VT;
+
+ if (ShaderType != ShaderType::COMPUTE) {
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+ SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Register);
+ continue;
+ }
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
AMDGPUAS::CONSTANT_BUFFER_0);
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = DAG.getLoad(VT, DL, Chain,
- DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, false, 4); // 4 is the prefered alignment for
- // the CONSTANT memory space.
+ SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+ DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ MemVT, false, false, 4);
+ // 4 is the preferred alignment for
+ // the CONSTANT memory space.
InVals.push_back(Arg);
}
return Chain;
};
for (unsigned i = 0; i < 4; i++) {
+ if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+ // We mask write here to teach later passes that the ith element of this
+ // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+ // break false dependencies and additionnaly make assembly easier to read.
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
if (C->isZero()) {
RemapSwizzle[i] = 4; // SEL_0
VectorEntry.getOperand(3)
};
bool isUnmovable[4] = { false, false, false, false };
- for (unsigned i = 0; i < 4; i++)
+ for (unsigned i = 0; i < 4; i++) {
RemapSwizzle[i] = i;
+ if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+ ->getZExtValue();
+ if (i == Idx)
+ isUnmovable[Idx] = true;
+ }
+ }
for (unsigned i = 0; i < 4; i++) {
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
->getZExtValue();
- if (!isUnmovable[Idx]) {
- // Swap i and Idx
- std::swap(NewBldVec[Idx], NewBldVec[i]);
- std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
- }
- isUnmovable[Idx] = true;
+ if (isUnmovable[Idx])
+ continue;
+ // Swap i and Idx
+ std::swap(NewBldVec[Idx], NewBldVec[i]);
+ std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+ break;
}
}
break;
}
- // insert_vector_elt (build_vector elt0, …, eltN), NewEltIdx, idx
- // => build_vector elt0, …, NewEltIdx, …, eltN
+ // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+ // => build_vector elt0, ... , NewEltIdx, ... , eltN
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
std::vector<SDValue> Ops;
for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
I != E; ++I)
- Ops.push_back(*I);
+ Ops.push_back(*I);
if (Opcode == AMDGPU::DOT_4) {
int OperandIdx[] = {
TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
- };
+ };
int NegIdx[] = {
TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
std::vector<SDValue> Ops;
unsigned NumOp = Src.getNumOperands();
for(unsigned i = 0; i < NumOp; ++i)
- Ops.push_back(Src.getOperand(i));
+ Ops.push_back(Src.getOperand(i));
Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
Node->getVTList(), Ops);