X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FR600%2FSIISelLowering.cpp;h=f4911169d2a9a0d46c8305dcfdb6d58b52610132;hp=e68804850785a5f046056b294ea8646380ea7d90;hb=1378871b1af83f2daa5ffa139c1dbc8132c46dbf;hpb=bd24b33e5782997dfa26b0debf934dd364756982

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index e6880485078..f4911169d2a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,8 +14,8 @@
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
@@ -24,26 +24,27 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/Function.h"
+#include "llvm/ADT/SmallString.h"

 using namespace llvm;

 SITargetLowering::SITargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM) {
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);

-  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

-  addRegisterClass(MVT::v4i32, &AMDGPU::VSrc_128RegClass);
-  addRegisterClass(MVT::v4f32, &AMDGPU::VSrc_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ADD, MVT::i32, Legal);
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);

   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -88,33 +91,29 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :

   // We need to custom lower loads/stores from private memory
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::i64, Custom);
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);

   setOperationAction(ISD::STORE, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i64, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);

+  setOperationAction(ISD::SELECT, MVT::f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Promote);
   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

-  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-
-  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

-  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -137,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
@@ -179,8 +179,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
   };

-  const size_t NumVecTypes = array_lengthof(VecTypes);
-  for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
+  for (MVT VT : VecTypes) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch(Op) {
       case ISD::LOAD:
@@ -194,7 +193,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
       case ISD::EXTRACT_SUBVECTOR:
         break;
       default:
-        setOperationAction(Op, VecTypes[Type], Expand);
+        setOperationAction(Op, VT, Expand);
         break;
       }
     }
@@ -214,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
   }

+  // FIXME: These should be removed and handled the same way as f32 fneg. Source
+  // modifiers also work for the double instructions.
+  setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  setOperationAction(ISD::FABS, MVT::f64, Expand);
+
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);

+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
   setSchedulingPreference(Sched::RegPressure);
 }

@@ -444,35 +450,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   return Chain;
 }

-/// Usually ISel will insert a copy between terminator instructions that output
-/// a value and the S_BRANCH* at the end of the block. This causes
-/// MachineBasicBlock::getFirstTerminator() to return the incorrect value,
-/// so we want to make sure there are no copies between terminators at the
-/// end of blocks.
-static void LowerTerminatorWithOutput(unsigned Opcode, MachineBasicBlock *BB,
-                                      MachineInstr *MI,
-                                      const TargetInstrInfo *TII,
-                                      MachineRegisterInfo &MRI) {
-  unsigned DstReg = MI->getOperand(0).getReg();
-  // Usually ISel will insert a copy between the SI_IF_NON_TERM instruction
-  // and the S_BRANCH* terminator.
-  // We want to replace SI_IF_NON_TERM with SI_IF, and we can't have any
-  // instructions between S_BRANCH* and SI_IF, since they are both terminators.
-  assert(MRI.hasOneUse(DstReg));
-  MachineOperand &Use = *MRI.use_begin(DstReg);
-  MachineInstr *UseMI = Use.getParent();
-  assert(UseMI->getOpcode() == AMDGPU::COPY);
-
-  MRI.replaceRegWith(UseMI->getOperand(0).getReg(), DstReg);
-  UseMI->eraseFromParent();
-  BuildMI(*BB, BB->getFirstTerminator(), MI->getDebugLoc(),
-          TII->get(Opcode))
-          .addOperand(MI->getOperand(0))
-          .addOperand(MI->getOperand(1))
-          .addOperand(MI->getOperand(2));
-  MI->eraseFromParent();
-}
-
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
@@ -510,25 +487,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
-  case AMDGPU::SI_IF_NON_TERM:
-    LowerTerminatorWithOutput(AMDGPU::SI_IF, BB, MI, TII, MRI);
-    break;
-  case AMDGPU::SI_ELSE_NON_TERM:
-    LowerTerminatorWithOutput(AMDGPU::SI_ELSE, BB, MI, TII, MRI);
-    break;
-  case AMDGPU::V_SUB_F64:
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
-            MI->getOperand(0).getReg())
-            .addReg(MI->getOperand(1).getReg())
-            .addReg(MI->getOperand(2).getReg())
-            .addImm(0)  /* src2 */
-            .addImm(0)  /* ABS */
-            .addImm(0)  /* CLAMP */
-            .addImm(0)  /* OMOD */
-            .addImm(2); /* NEG */
+  case AMDGPU::V_SUB_F64: {
+    unsigned DestReg = MI->getOperand(0).getReg();
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+      .addImm(0)  // SRC0 modifiers
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(1)  // SRC1 modifiers
+      .addReg(MI->getOperand(2).getReg())
+      .addImm(0)  // SRC2 modifiers
+      .addImm(0)  // src2
+      .addImm(0)  // CLAMP
+      .addImm(0); // OMOD
     MI->eraseFromParent();
     break;
-
+  }
   case AMDGPU::SI_RegisterStorePseudo: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -539,6 +511,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
       MIB.addOperand(MI->getOperand(i));

     MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FABS_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x7fffffff);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FNEG_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x80000000);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FCLAMP_SI: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+            MI->getOperand(0).getReg())
+      .addImm(0) // SRC0 modifiers
+      .addOperand(MI->getOperand(1))
+      .addImm(0) // SRC1 modifiers
+      .addImm(0) // SRC1
+      .addImm(1) // CLAMP
+      .addImm(0); // OMOD
+    MI->eraseFromParent();
   }
   }
   return BB;
 }
@@ -585,6 +601,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+    EVT VT = Op.getValueType();
+
+    // These loads are legal.
+    if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        VT.isVector() && VT.getVectorNumElements() == 2 &&
+        VT.getVectorElementType() == MVT::i32)
+      return SDValue();
+
     if (Op.getValueType().isVector() &&
         (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
          Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
@@ -601,11 +625,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   }
   case ISD::SELECT: return LowerSELECT(Op, DAG);
-  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
-  case ISD::ANY_EXTEND: // Fall-through
-  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID =
@@ -893,45 +913,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
 }

-SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue True = Op.getOperand(2);
-  SDValue False = Op.getOperand(3);
-  SDValue CC = Op.getOperand(4);
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-
-  // Possible Min/Max pattern
-  SDValue MinMax = LowerMinMax(Op, DAG);
-  if (MinMax.getNode()) {
-    return MinMax;
-  }
-
-  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
-  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
-}
-
-SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-
-  if (VT != MVT::i64) {
-    return SDValue();
-  }
-
-  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
-                           DAG.getConstant(31, MVT::i32));
-
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
-}
-
 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT VT = Store->getMemoryVT();

+  // These stores are legal.
+  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      VT.isVector() && VT.getVectorNumElements() == 2 &&
+      VT.getVectorElementType() == MVT::i32)
+    return SDValue();
+
   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   if (Ret.getNode())
     return Ret;
@@ -1007,27 +999,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   return Chain;
 }

+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//

-SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT != MVT::f32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
+    }
+  }

-  if (VT != MVT::i64) {
+  // We are primarily trying to catch operations on illegal vector types
+  // before they are expanded.
+  // For scalars, we can use the more flexible method of checking masked bits
+  // after legalization.
+  if (!DCI.isBeforeLegalize() ||
+      !SrcVT.isVector() ||
+      SrcVT.getVectorElementType() != MVT::i8) {
     return SDValue();
   }

-  SDValue Src = Op.getOperand(0);
-  if (Src.getValueType() != MVT::i32)
-    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+  assert(DCI.isBeforeLegalize() && "Unexpected legal type");

-  SDValue Zero = DAG.getConstant(0, MVT::i32);
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
-}
+  // Weird sized vectors are a pain to handle, but we know 3 is really the same
+  // size as 4.
+  unsigned NElts = SrcVT.getVectorNumElements();
+  if (!SrcVT.isSimple() && NElts != 3)
+    return SDValue();

-//===----------------------------------------------------------------------===//
-// Custom DAG optimizations
-//===----------------------------------------------------------------------===//
+  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+  // prevent a mess from expanding to v4i32 and repacking.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+    LoadSDNode *Load = cast<LoadSDNode>(Src);
+    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+                                     Load->getChain(),
+                                     Load->getBasePtr(),
+                                     LoadVT,
+                                     Load->getMemOperand());
+
+    // Make sure successors of the original load stay after it by updating
+    // them to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+    SmallVector<SDValue, 4> Elts;
+    if (RegVT.isVector())
+      DAG.ExtractVectorElements(NewLoad, Elts);
+    else
+      Elts.push_back(NewLoad);
+
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned EltIdx = 0;
+    for (SDValue Elt : Elts) {
+      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+      for (unsigned I = 0; I < ComponentsInElt; ++I) {
+        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+        DCI.AddToWorklist(Cvt.getNode());
+        Ops.push_back(Cvt);
+      }
+
+      ++EltIdx;
+    }
+
+    assert(Ops.size() == NElts);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+  }
+
+  return SDValue();
+}

 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
@@ -1070,8 +1134,34 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
       }
       break;
     }
+
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3: {
+    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+    SDValue Src = N->getOperand(0);
+    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
   }
-  return SDValue();
+
+  case ISD::UINT_TO_FP: {
+    return performUCharToFloatCombine(N, DCI);
+  }
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }

 /// \brief Test if RegClass is one of the VSrc classes
@@ -1285,14 +1375,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
   // e64 version if available, -1 otherwise
   int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
   const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+  int InputModifiers[3] = {0};

   assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
   bool HaveVSrc = false, HaveSSrc = false;

-  // First figure out what we alread have in this instruction
+  // First figure out what we already have in this instruction.
   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
        i != e && Op < NumOps; ++i, ++Op) {

@@ -1311,7 +1401,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
     }
   }

-  // If we neither have VSrc nor SSrc it makes no sense to continue
+  // If we neither have VSrc nor SSrc, it makes no sense to continue.
   if (!HaveVSrc && !HaveSSrc)
     return Node;

@@ -1327,17 +1417,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
     const SDValue &Operand = Node->getOperand(i);
     Ops.push_back(Operand);

-    // Already folded immediate ?
+    // Already folded immediate?
     if (isa<ConstantSDNode>(Operand.getNode()) ||
         isa<RegisterSDNode>(Operand.getNode()))
       continue;

-    // Is this a VSrc or SSrc operand ?
+    // Is this a VSrc or SSrc operand?
     unsigned RegClass = Desc->OpInfo[Op].RegClass;
     if (isVSrc(RegClass) || isSSrc(RegClass)) {
       // Try to fold the immediates
       if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
-        // Folding didn't worked, make sure we don't hit the SReg limit
+        // Folding didn't work, make sure we don't hit the SReg limit.
         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
       }
       continue;
     }
@@ -1362,8 +1452,10 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
       }
     }

-    if (DescE64 && !Immediate) {
+    if (Immediate)
+      continue;
+    if (DescE64) {
       // Test if it makes sense to switch to e64 encoding
       unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
       if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
@@ -1381,11 +1473,43 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
         DescE64 = nullptr;
       }
     }
+
+    if (!DescE64 && !Promote2e64)
+      continue;
+    if (!Operand.isMachineOpcode())
+      continue;
+    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 1;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
+    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 2;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
   }

   if (Promote2e64) {
+    std::vector<SDValue> OldOps(Ops);
+    Ops.clear();
+    for (unsigned i = 0; i < OldOps.size(); ++i) {
+      // src_modifier
+      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+      Ops.push_back(OldOps[i]);
+    }
     // Add the modifier flags while promoting
-    for (unsigned i = 0; i < 4; ++i)
+    for (unsigned i = 0; i < 2; ++i)
       Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
   }

@@ -1495,7 +1619,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     }
   }
 }

-/// \brief Fold the instructions after slecting them
+/// \brief Fold the instructions after selecting them.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
   const SIInstrInfo *TII =