X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FSIISelLowering.cpp;h=d5d2b68caf0a76784c5533778ab2e03dcc292458;hb=0f3982734058d9039d986e9d1dd3a879ff3512f0;hp=316567cef4656345fd2d04dd67305c9a27144cef;hpb=4e518fd941b119834b5764708fbabf41adc45040;p=oota-llvm.git diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 316567cef46..d5d2b68caf0 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -32,23 +32,17 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM) { addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass); - - addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); - - addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -62,6 +56,21 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : computeRegisterProperties(); + // Condition Codes + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETULE, MVT::f64, Expand); + setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -69,6 +78,32 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + + setOperationAction(ISD::BITCAST, MVT::i128, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + // We need to custom lower loads/stores from private memory + setOperationAction(ISD::LOAD, MVT::i32, 
Custom);
+  setOperationAction(ISD::LOAD, MVT::i64, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i64, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -78,13 +113,31 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
 
+  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
   setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
 
-  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
 
   setTargetDAGCombine(ISD::SELECT_CC);
 
@@ -101,24 +154,28 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                      bool *IsFast) const {
   // XXX: This depends on the address space and also we may want to revisit
   // the alignment values we specify in the DataLayout.
+ if (!VT.isSimple() || VT == MVT::Other) + return false; return VT.bitsGT(MVT::i32); } +bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const { + return VT.bitsLE(MVT::i16); +} -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, unsigned Offset) const { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), AMDGPUAS::CONSTANT_ADDRESS); - EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits()); SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr, - MachinePointerInfo(UndefValue::get(PtrTy)), - VT, false, false, ArgVT.getSizeInBits() >> 3); + return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, + MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, + false, false, MemVT.getSizeInBits() >> 3); } @@ -145,7 +202,8 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { + if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() && + !Arg.Flags.isByVal()) { assert((PSInputNum <= 15) && "Too many PS inputs!"); @@ -176,7 +234,7 @@ SDValue SITargetLowering::LowerFormalArguments( NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else { + } else if (Info->ShaderType != ShaderType::COMPUTE) { Splits.push_back(Arg); } } @@ -199,6 +257,11 @@ SDValue SITargetLowering::LowerFormalArguments( MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); } + if (Info->ShaderType == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + AnalyzeFormalArguments(CCInfo, Splits); for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { @@ -213,9 +276,11 @@ SDValue SITargetLowering::LowerFormalArguments( EVT VT = VA.getLocVT(); if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; // The first 36 bytes of the input buffer contains information about // thread group and global sizes. 
-      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
                                    36 + VA.getLocMemOffset());
       InVals.push_back(Arg);
       continue;
@@ -319,6 +384,19 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_RegisterStorePseudo: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+                Reg);
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      MIB.addOperand(MI->getOperand(i));
+
+    MI->eraseFromParent();
+  }
   }
   return BB;
 }
@@ -334,6 +412,24 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
   return MVT::i32;
 }
 
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return false; /* There is V_MAD_F32 for f32 */
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -343,9 +439,28 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::ADD: return LowerADD(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+    if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+        Op.getValueType().isVector()) {
+      SDValue MergedValues[2] = {
+        SplitVectorLoad(Op, DAG),
+        Load->getChain()
+      };
+      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+    } else {
+      return LowerLOAD(Op, DAG);
+    }
+  }
+
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::ANY_EXTEND: // Fall-through
+  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID =
@@ -357,23 +472,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     switch (IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case Intrinsic::r600_read_ngroups_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
    case 
Intrinsic::r600_read_global_size_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                                  AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
@@ -392,13 +507,102 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
-
+    case AMDGPUIntrinsic::SI_load_const: {
+      SDValue Ops [] = {
+        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(2)
+      };
+
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo(),
+          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+          VT.getSizeInBits() / 8, 4);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                     Op->getVTList(), Ops, 2, VT, MMO);
+    }
+    case AMDGPUIntrinsic::SI_sample:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampleb:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampled:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+    case AMDGPUIntrinsic::SI_samplel:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+    case AMDGPUIntrinsic::SI_vs_load_input:
+      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
    }
  }
+
+  case ISD::INTRINSIC_VOID:
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    switch (IntrinsicID) {
+      case AMDGPUIntrinsic::SI_tbuffer_store: {
+        SDLoc DL(Op);
+        SDValue Ops [] = {
+          Chain,
+          ResourceDescriptorToi128(Op.getOperand(2), DAG),
+          Op.getOperand(3),
+          Op.getOperand(4),
+          Op.getOperand(5),
+          Op.getOperand(6),
+          Op.getOperand(7),
+          Op.getOperand(8),
+          Op.getOperand(9),
+          Op.getOperand(10),
+          Op.getOperand(11),
+          Op.getOperand(12),
+          Op.getOperand(13),
+          Op.getOperand(14)
+        };
+        EVT VT = Op.getOperand(3).getValueType();
+
+        MachineMemOperand *MMO = MF.getMachineMemOperand(
+            MachinePointerInfo(),
+            MachineMemOperand::MOStore,
+            VT.getSizeInBits() / 8, 4);
+        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                       Op->getVTList(), Ops,
+                                       sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+      }
+      default:
+        break;
+    }
  }
  return SDValue();
}
 
+SDValue SITargetLowering::LowerADD(SDValue Op,
+                                   SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
+
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, 
One);
+
+  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
+
+  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
+  SDValue Carry = AddLo.getValue(1);
+  SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
+}
+
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
@@ -493,6 +697,53 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                                 Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue MergedValues[2] = {
+    Ret,
+    Load->getChain()
+  };
+  return DAG.getMergeValues(MergedValues, 2, DL);
+
+}
+
+SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+
+  if (Op.getValueType() == MVT::i128) {
+    return Op;
+  }
+
+  assert(Op.getOpcode() == ISD::UNDEF);
+
+  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
+                     DAG.getConstant(0, MVT::i64),
+                     DAG.getConstant(0, MVT::i64));
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2),
+                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
+                     Op.getOperand(4));
+}
+
 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -527,6 +778,69 @@ SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
   return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
 }
 
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode())
+    return Ret;
+
+  if (VT.isVector() && VT.getVectorNumElements() >= 8)
+    return SplitVectorStore(Op, DAG);
+
+  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Chain = Store->getChain();
+  SmallVector<SDValue, 8> Values;
+
+  if (VT == MVT::i64) {
+    for (unsigned i = 0; i < 2; ++i) {
+      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
+    }
+  } else if (VT == MVT::i128) {
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned j = 0; j < 2; ++j) {
+        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
+                         DAG.getConstant(j, MVT::i32)));
+      }
+    }
+  } else {
+    Values.push_back(Store->getValue());
+  }
+
+  for (unsigned i = 0; i < Values.size(); ++i) {
+    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
+                                  Ptr, DAG.getConstant(i, MVT::i32));
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, 
MVT::Other,
+                        Chain, Values[i], PartPtr,
+                        DAG.getTargetConstant(0, MVT::i32));
+  }
+  return Chain;
+}
+
+
+SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  if (VT != MVT::i64) {
+    return SDValue();
+  }
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
+                     DAG.getConstant(0, MVT::i32));
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
@@ -540,7 +854,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
     default: break;
     case ISD::SELECT_CC: {
-      N->dump();
       ConstantSDNode *True, *False;
       // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
       if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
@@ -658,43 +971,67 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
   return false;
 }
 
+const TargetRegisterClass *SITargetLowering::getRegClassForNode(
+                                   SelectionDAG &DAG, const SDValue &Op) const {
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  if (!Op->isMachineOpcode()) {
+    switch(Op->getOpcode()) {
+    case ISD::CopyFromReg: {
+      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return MRI.getRegClass(Reg);
+      }
+      return TRI.getPhysRegClass(Reg);
+    }
+    default: return NULL;
+    }
+  }
+  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
+  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
+  if (OpClassID != -1) {
+    return TRI.getRegClass(OpClassID);
+  }
+  switch(Op.getMachineOpcode()) {
+  case AMDGPU::COPY_TO_REGCLASS:
+    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
+    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+
+    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
+    // class, then the register class for the value could be either a
+    // VReg or an SReg. In order to get a more accurate answer, look at
+    // the register class of the value being copied.
+    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
+        OpClassID == AMDGPU::VSrc_64RegClassID) {
+      return getRegClassForNode(DAG, Op.getOperand(0));
+    }
+    return TRI.getRegClass(OpClassID);
+  case AMDGPU::EXTRACT_SUBREG: {
+    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    const TargetRegisterClass *SuperClass =
+      getRegClassForNode(DAG, Op.getOperand(0));
+    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
+  }
+  case AMDGPU::REG_SEQUENCE:
+    // Operand 0 is the register class id for REG_SEQUENCE instructions.
+    return TRI.getRegClass(
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
+  default:
+    return getRegClassFor(Op.getSimpleValueType());
+  }
+}
+
 /// \brief Does "Op" fit into register class "RegClass" ? 
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
-
-  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  SDNode *Node = Op.getNode();
-
-  const TargetRegisterClass *OpClass;
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
-    int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
-    if (OpClassID == -1) {
-      switch (MN->getMachineOpcode()) {
-      case AMDGPU::REG_SEQUENCE:
-        // Operand 0 is the register class id for REG_SEQUENCE instructions.
-        OpClass = TRI->getRegClass(
-          cast<ConstantSDNode>(MN->getOperand(0))->getZExtValue());
-        break;
-      default:
-        OpClass = getRegClassFor(Op.getSimpleValueType());
-        break;
-      }
-    } else {
-      OpClass = TRI->getRegClass(OpClassID);
-    }
-
-  } else if (Node->getOpcode() == ISD::CopyFromReg) {
-    RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
-    OpClass = MRI.getRegClass(Reg->getReg());
-
-  } else
+  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
+  if (!RC) {
     return false;
-
-  return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass);
+  }
+  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
 }
 
 /// \brief Make sure that we don't exceed the number of allowed scalars
@@ -720,8 +1057,8 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
     return;
   }
 
-  // This is a conservative aproach, it is possible that we can't determine
-  // the correct register class and copy too often, but better save than sorry.
+  // This is a conservative approach. It is possible that we can't determine the
+  // correct register class and copy too often, but better safe than sorry.
   SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
   SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                     Operand.getValueType(), Operand, RC);
@@ -898,7 +1235,9 @@ static unsigned SubIdx2Lane(unsigned Idx) {
 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                        SelectionDAG &DAG) const {
   SDNode *Users[4] = { };
-  unsigned Writemask = 0, Lane = 0;
+  unsigned Lane = 0;
+  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned NewDmask = 0;
 
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -909,29 +1248,42 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
       return;
 
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

+    // Set which texture component corresponds to the lane. 
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
     // Abort if we have more than one user per component
     if (Users[Lane])
       return;
 
     Users[Lane] = *I;
-    Writemask |= 1 << Lane;
+    NewDmask |= 1 << Comp;
   }
 
-  // Abort if all components are used
-  if (Writemask == 0xf)
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
     return;
 
   // Adjust the writemask in the node
   std::vector<SDValue> Ops;
-  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
     Ops.push_back(Node->getOperand(i));
   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
 
   // If we only got one lane, replace it with a copy
-  if (Writemask == (1U << Lane)) {
+  // (if NewDmask has only one bit set...)
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
     SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                       SDLoc(), Users[Lane]->getValueType(0),
@@ -962,9 +1314,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 /// \brief Fold the instructions after selecting them
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
   Node = AdjustRegClass(Node, DAG);
 
-  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+  if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
 
   return foldOperands(Node, DAG);
@@ -974,7 +1328,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!TII->isMIMG(MI->getOpcode()))
     return;
 
   unsigned VReg = MI->getOperand(0).getReg();
@@ -991,6 +1347,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
   case 3:  RC = &AMDGPU::VReg_96RegClass; break;
   }
 
+  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+  MI->setDesc(TII->get(NewOpcode));
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   MRI.setRegClass(VReg, RC);
 }
@@ -1003,20 +1361,6 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
 
   switch (N->getMachineOpcode()) {
   default: return N;
-  case AMDGPU::REG_SEQUENCE: {
-    // MVT::i128 only use SGPRs, so i128 REG_SEQUENCEs don't need to be
-    // rewritten.
-    if (N->getValueType(0) == MVT::i128) {
-      return N;
-    }
-    const SDValue Ops[] = {
-      DAG.getTargetConstant(AMDGPU::VReg_64RegClassID, MVT::i32),
-      N->getOperand(1) , N->getOperand(2),
-      N->getOperand(3), N->getOperand(4)
-    };
-    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::i64, Ops);
-  }
-
   case AMDGPU::S_LOAD_DWORD_IMM:
     NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
     // Fall-through
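
The i64 ADD lowering added above splits each operand into 32-bit halves, adds the low halves with ISD::ADDC, feeds the resulting carry into ISD::ADDE for the high halves, and reassembles the result with ISD::BUILD_PAIR. The standalone C++ sketch below emulates that carry-chain arithmetic on plain integers; it is illustrative only (addViaCarryChain is a hypothetical helper, not part of this patch, and the real LowerADD builds SelectionDAG nodes instead):

#include <cstdint>

static uint64_t addViaCarryChain(uint64_t LHS, uint64_t RHS) {
  // Split both operands into 32-bit halves (ISD::EXTRACT_ELEMENT).
  uint32_t Lo0 = uint32_t(LHS), Hi0 = uint32_t(LHS >> 32);
  uint32_t Lo1 = uint32_t(RHS), Hi1 = uint32_t(RHS >> 32);

  uint32_t AddLo = Lo0 + Lo1;             // ISD::ADDC
  uint32_t Carry = AddLo < Lo0 ? 1u : 0u; // the glued carry value
  uint32_t AddHi = Hi0 + Hi1 + Carry;     // ISD::ADDE

  return (uint64_t(AddHi) << 32) | AddLo; // ISD::BUILD_PAIR
}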
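
Likewise, the adjustWritemask() change relies on MIMG results being packed: subregister lane N holds the component selected by the (N+1)-th set bit of the old dmask, which is what the Comp loop computes. A small sketch of that mapping (laneToComponent is hypothetical and exists only to illustrate the loop; __builtin_ctz stands in for LLVM's countTrailingZeros):

#include <cassert>

// Returns the texture component (0=X, 1=Y, 2=Z, 3=W) that packed lane
// 'Lane' correspondsds to, given the instruction's current dmask.
static unsigned laneToComponent(unsigned OldDmask, unsigned Lane) {
  unsigned Comp = 0;
  for (unsigned i = 0, Dmask = OldDmask; i <= Lane; ++i) {
    assert(Dmask && "lane index exceeds number of set dmask bits");
    Comp = __builtin_ctz(Dmask); // index of the lowest set bit
    Dmask &= ~(1u << Comp);      // clear it so the next iteration advances
  }
  return Comp;
}

// Example: OldDmask = 0xA (binary 1010, Y and W written):
// lane 0 -> component 1 (Y), lane 1 -> component 3 (W).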