diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0927c2f4fa5..390c4dae9b6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18,6 +18,7 @@
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -264,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // Without SSE, i64->f64 goes through memory.
       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
     }
-  }
+  } else if (!Subtarget->is64Bit())
+    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 
   // Scalar integer divide and remainder are lowered to use operations that
   // produce two results, to match the available instructions. This exposes
@@ -1333,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BR_CC,          MVT::i1,    Expand);
     setOperationAction(ISD::SETCC,          MVT::i1,    Custom);
+    setOperationAction(ISD::SETCCE,         MVT::i1,    Custom);
     setOperationAction(ISD::SELECT_CC,      MVT::i1,    Expand);
     setOperationAction(ISD::XOR,            MVT::i1,    Legal);
     setOperationAction(ISD::OR,             MVT::i1,    Legal);
@@ -2309,6 +2312,18 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                      DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   }
 
+  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (X86::GR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
   RetOps[0] = Chain;  // Update chain.
 
   // Add the flag if we have it.
@@ -3906,6 +3921,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
   case X86ISD::SHUFP:
+  case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
   case X86ISD::MOVLHPS:
   case X86ISD::MOVLHPD:
@@ -4156,6 +4172,35 @@ static bool hasFPCMov(unsigned X86CC) {
   }
 }
 
+
+bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  if (!IntrData)
+    return false;
+
+  switch (IntrData->Type) {
+  case LOADA:
+  case LOADU: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
 /// Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
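The LOADA/LOADU distinction in getTgtMemIntrinsic above only affects the alignment recorded on the memory operand. For illustration only (not part of the patch; these are the standard Clang/GCC AVX-512F intrinsics), the user-level forms of the two masked loads differ the same way:

    #include <immintrin.h>

    // Aligned form: assumes a 64-byte-aligned pointer, matching
    // Info.align = vector-size-in-bytes for LOADA.
    __m512 load_aligned(const float *p, __mmask16 m, __m512 pass) {
      return _mm512_mask_load_ps(pass, m, p);
    }

    // Unaligned form: any pointer, matching Info.align = 1 for LOADU.
    __m512 load_unaligned(const float *p, __mmask16 m, __m512 pass) {
      return _mm512_mask_loadu_ps(pass, m, p);
    }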
@@ -4556,6 +4601,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+    Result = DAG.getBitcast(CastVT, Result);
     Vec256 = DAG.getBitcast(CastVT, Vec256);
     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
     return DAG.getBitcast(ResultVT, Vec256);
@@ -4741,8 +4787,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
 /// uses one source. Note that this will set IsUnary for shuffles which use a
 /// single input multiple times, and in those cases it will
 /// adjust the mask to only have indices within that single input.
-/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
-static bool getTargetShuffleMask(SDNode *N, MVT VT,
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
   SDValue ImmN;
@@ -4759,6 +4804,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
+  case X86ISD::INSERTPS:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
   case X86ISD::UNPCKH:
     DecodeUNPCKHMask(VT, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
@@ -4851,8 +4901,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodePSHUFBMask(C, Mask);
-      if (Mask.empty())
-        return false;
       break;
     }
 
@@ -4870,11 +4918,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   case X86ISD::VPERM2X128:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
-    if (Mask.empty()) return false;
-    // Mask only contains negative index if an element is zero.
-    if (std::any_of(Mask.begin(), Mask.end(),
-                    [](int M){ return M == SM_SentinelZero; }))
-      return false;
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVSLDUP:
     DecodeMOVSLDUPMask(VT, Mask);
@@ -4948,8 +4992,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodeVPERMVMask(C, VT, Mask);
-      if (Mask.empty())
-        return false;
       break;
     }
     return false;
@@ -5000,8 +5042,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodeVPERMV3Mask(C, VT, Mask);
-      if (Mask.empty())
-        return false;
       break;
     }
     return false;
@@ -5009,6 +5049,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   default: llvm_unreachable("unknown target shuffle node");
   }
 
+  // Empty mask indicates the decode failed.
+  if (Mask.empty())
+    return false;
+
+  // Check if we're getting a shuffle mask with zero'd elements.
+  if (!AllowSentinelZero)
+    if (std::any_of(Mask.begin(), Mask.end(),
+                    [](int M){ return M == SM_SentinelZero; }))
+      return false;
+
   // If we have a fake unary shuffle, the shuffle mask is spread across two
   // inputs that are actually the same node. Re-map the mask to always point
   // into the first input.
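A minimal standalone mirror of the two post-decode checks the patch centralizes at the bottom of getTargetShuffleMask (sentinel values are assumed to match X86ShuffleDecode.h; this is not code from the patch):

    #include <algorithm>
    #include <vector>

    enum : int { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; // assumed values

    bool maskIsUsable(const std::vector<int> &Mask, bool AllowSentinelZero) {
      if (Mask.empty()) // an empty mask means the decode failed
        return false;
      // Callers that cannot handle zeroed lanes reject SM_SentinelZero here.
      return AllowSentinelZero ||
             std::none_of(Mask.begin(), Mask.end(),
                          [](int M) { return M == SM_SentinelZero; });
    }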
@@ -5047,19 +5097,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   // Recurse into target specific vector shuffles to find scalars.
   if (isTargetShuffle(Opcode)) {
     MVT ShufVT = V.getSimpleValueType();
-    unsigned NumElems = ShufVT.getVectorNumElements();
+    int NumElems = (int)ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
     bool IsUnary;
 
-    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+    if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary))
       return SDValue();
 
     int Elt = ShuffleMask[Index];
-    if (Elt < 0)
+    if (Elt == SM_SentinelUndef)
       return DAG.getUNDEF(ShufVT.getVectorElementType());
 
-    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
-                                         : N->getOperand(1);
+    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
+    SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1);
     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   }
@@ -8166,6 +8216,13 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
       return TruncBroadcast;
 
+  MVT BroadcastVT = VT;
+
+  // Peek through any bitcast (only useful for loads).
+  SDValue BC = V;
+  while (BC.getOpcode() == ISD::BITCAST)
+    BC = BC.getOperand(0);
+
   // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
@@ -8175,13 +8232,17 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
     // Only AVX2 has register broadcasts.
     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
       return SDValue();
-  } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+    if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
+      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+
     // If we are broadcasting a load that is only used by the shuffle
     // then we can reduce the vector load to the broadcasted scalar load.
-    LoadSDNode *Ld = cast<LoadSDNode>(V);
+    LoadSDNode *Ld = cast<LoadSDNode>(BC);
     SDValue BaseAddr = Ld->getOperand(1);
     EVT AddrVT = BaseAddr.getValueType();
-    EVT SVT = VT.getScalarType();
+    EVT SVT = BroadcastVT.getScalarType();
     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
     SDValue NewAddr = DAG.getNode(
         ISD::ADD, DL, AddrVT, BaseAddr,
@@ -8195,7 +8256,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
       return SDValue();
   }
 
-  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+  V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V);
+  return DAG.getBitcast(VT, V);
 }
 
 // Check for whether we can use INSERTPS to perform the shuffle. We only use
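The motivating case for the bitcast-peek above is a 64-bit splat from memory on 32-bit targets, where the element has to be loaded as f64 so a single broadcast load can be used. A user-level sketch of such a splat (illustrative only, not from the patch; compiles with -mavx2):

    #include <immintrin.h>

    // On i686 a splat of an in-memory 64-bit value can now be selected as a
    // single 64-bit broadcast load instead of scalarized 32-bit loads.
    __m256i splat_i64(const long long *p) {
      return _mm256_set1_epi64x(*p);
    }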
@@ -12475,8 +12537,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // location.
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
     SDValue Args[] = { Chain, Offset };
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+    Chain =
+        DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+                           DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
 
     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -12649,13 +12715,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
     return Op;
   }
 
+  SDValue ValueToStore = Op.getOperand(0);
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
   unsigned Size = SrcVT.getSizeInBits()/8;
   MachineFunction &MF = DAG.getMachineFunction();
   auto PtrVT = getPointerTy(MF.getDataLayout());
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   SDValue Chain = DAG.getStore(
-      DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
+      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
       false, false, 0);
   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
@@ -13028,7 +13102,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   }
 
   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+  SDValue ValueToStore = Op.getOperand(0);
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
                                StackSlot, MachinePointerInfo(),
                                false, false, 0);
   // For i64 source, we need to add the appropriate power of 2 if the input
@@ -14896,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
-  return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
-                     DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+  SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                              DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+  if (Op.getSimpleValueType() == MVT::i1)
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+  return SetCC;
 }
 
 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
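The ValueToStore bitcast in the two hunks above targets source like the following on 32-bit x86 with SSE2, where the i64 spill feeding FILD previously went through two 32-bit stores and could hit a store-forwarding stall on the 64-bit reload (illustrative source pattern only, not part of the patch):

    // The patch changes i686 codegen for these to a single 64-bit MOVQ store
    // feeding FILD instead of two 32-bit stores.
    double s64_to_f64(long long x) { return static_cast<double>(x); }
    double u64_to_f64(unsigned long long x) { return static_cast<double>(x); }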
@@ -17372,6 +17455,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   if (!IntrData) {
     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
       return MarkEHRegistrationNode(Op, DAG);
+    if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+      // We need a frame pointer because this will get lowered to a PUSH/POP
+      // sequence.
+      MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+      MFI->setHasCopyImplyingStackAdjustment(true);
+      // Don't do anything here, we will expand these intrinsics out later
+      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+      return SDValue();
+    }
     return SDValue();
   }
 
@@ -17476,7 +17571,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     return DAG.getMergeValues(Results, dl);
   }
   case COMPRESS_TO_MEM: {
-    SDLoc dl(Op);
     SDValue Mask = Op.getOperand(4);
     SDValue DataToCompress = Op.getOperand(3);
     SDValue Addr = Op.getOperand(2);
@@ -17502,7 +17596,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   case TRUNCATE_TO_MEM_VI32:
     return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
   case EXPAND_FROM_MEM: {
-    SDLoc dl(Op);
     SDValue Mask = Op.getOperand(4);
     SDValue PassThru = Op.getOperand(3);
     SDValue Addr = Op.getOperand(2);
@@ -17522,6 +17615,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                       Mask, PassThru, Subtarget, DAG), Chain};
     return DAG.getMergeValues(Results, dl);
   }
+  case LOADU:
+  case LOADA: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue PassThru = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+    MVT VT = Op.getSimpleValueType();
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+    if (isAllOnesConstant(Mask)) // return just a load
+      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
+                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD);
+  }
   }
 }
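Note the all-ones-mask fast path above: a fully enabled masked load is emitted as an ordinary vector load. In user-level terms (illustrative only; AVX-512F intrinsics assumed), these two should lower the same way once that fast path fires:

    #include <immintrin.h>

    // With mask 0xFFFF every lane is live, so the masked form can become the
    // same plain vector load as the unmasked one.
    __m512i full_mask(const int *p) { return _mm512_maskz_loadu_epi32(0xFFFF, p); }
    __m512i plain(const int *p)     { return _mm512_loadu_si512(p); }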
@@ -19501,24 +19613,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   MVT DstVT = Op.getSimpleValueType();
 
-  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+      SrcVT == MVT::i64) {
     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
     if (DstVT != MVT::f64)
       // This conversion needs to be expanded.
       return SDValue();
 
-    SDValue InVec = Op->getOperand(0);
-    SDLoc dl(Op);
-    unsigned NumElts = SrcVT.getVectorNumElements();
-    MVT SVT = SrcVT.getVectorElementType();
-
-    // Widen the vector in input in the case of MVT::v2i32.
-    // Example: from MVT::v2i32 to MVT::v4i32.
+    SDValue Op0 = Op->getOperand(0);
     SmallVector<SDValue, 16> Elts;
-    for (unsigned i = 0, e = NumElts; i != e; ++i)
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
-                                 DAG.getIntPtrConstant(i, dl)));
-
+    SDLoc dl(Op);
+    unsigned NumElts;
+    MVT SVT;
+    if (SrcVT.isVector()) {
+      NumElts = SrcVT.getVectorNumElements();
+      SVT = SrcVT.getVectorElementType();
+
+      // Widen the vector in input in the case of MVT::v2i32.
+      // Example: from MVT::v2i32 to MVT::v4i32.
+      for (unsigned i = 0, e = NumElts; i != e; ++i)
+        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+                                   DAG.getIntPtrConstant(i, dl)));
+    } else {
+      assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() &&
+             "Unexpected source type in LowerBITCAST");
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(0, dl)));
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(1, dl)));
+      NumElts = 2;
+      SVT = MVT::i32;
+    }
     // Explicitly mark the extra elements as Undef.
     Elts.append(NumElts, DAG.getUNDEF(SVT));
 
@@ -20674,6 +20799,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+  case X86ISD::VROTLI:             return "X86ISD::VROTLI";
+  case X86ISD::VROTRI:             return "X86ISD::VROTRI";
   case X86ISD::CMPP:               return "X86ISD::CMPP";
   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
@@ -21144,6 +21271,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  // insert input VAL into EAX
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+      .addReg(MI->getOperand(0).getReg());
+  // insert zero to ECX
+  BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+      .addReg(X86::ECX)
+      .addReg(X86::ECX);
+  // insert zero to EDX
+  BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
+      .addReg(X86::EDX)
+      .addReg(X86::EDX);
+  // insert WRPKRU instruction
+  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  // insert zero to ECX
+  BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+      .addReg(X86::ECX)
+      .addReg(X86::ECX);
+  // insert RDPKRU instruction
+  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+      .addReg(X86::EAX);
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                       const X86Subtarget *Subtarget) {
   DebugLoc dl = MI->getDebugLoc();
@@ -21716,7 +21884,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   if (LastCMOV == MI && NextMIIt != BB->end() &&
       NextMIIt->getOpcode() == MI->getOpcode() &&
       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() &&
+      NextMIIt->getOperand(1).isKill()) {
     CascadedCMOV = &*NextMIIt;
   }
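EmitWRPKRU/EmitRDPKRU above pin down the fixed register contract of the instructions (data in EAX, ECX and EDX zeroed). For reference, the user-level intrinsics these pseudos back (illustrative only; intrinsic names per GCC/Clang headers, compile with -mpku):

    #include <immintrin.h>

    unsigned read_pkru(void) { return _rdpkru_u32(); } // expands via the RDPKRU pseudo
    void write_pkru(unsigned v) { _wrpkru(v); }        // expands via the WRPKRU pseudo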
@@ -22495,6 +22664,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V64I1:
     return EmitLoweredSelect(MI, BB);
 
+  case X86::RDFLAGS32:
+  case X86::RDFLAGS64: {
+    DebugLoc DL = MI->getDebugLoc();
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+    unsigned PushF =
+        MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+    unsigned Pop =
+        MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+    BuildMI(*BB, MI, DL, TII->get(PushF));
+    BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+    MI->eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+
+  case X86::WRFLAGS32:
+  case X86::WRFLAGS64: {
+    DebugLoc DL = MI->getDebugLoc();
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+    unsigned Push =
+        MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+    unsigned PopF =
+        MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+    BuildMI(*BB, MI, DL, TII->get(PopF));
+
+    MI->eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+
   case X86::RELEASE_FADD32mr:
   case X86::RELEASE_FADD64mr:
     return EmitLoweredAtomicFP(MI, BB);
@@ -22611,7 +22810,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   // Thread synchronization.
   case X86::MONITOR:
     return EmitMonitor(MI, BB, Subtarget);
-
+  // PKU feature
+  case X86::WRPKRU:
+    return EmitWRPKRU(MI, BB, Subtarget);
+  case X86::RDPKRU:
+    return EmitRDPKRU(MI, BB, Subtarget);
   // xbegin
   case X86::XBEGIN:
     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
@@ -23098,7 +23301,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     return false;
   SmallVector<int, 16> OpMask;
   bool IsUnary;
-  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
   // We only can combine unary shuffles which we can decode the mask for.
   if (!HaveMask || !IsUnary)
     return false;
@@ -23195,7 +23398,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
   MVT VT = N.getSimpleValueType();
   SmallVector<int, 4> Mask;
   bool IsUnary;
-  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
+  bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary);
   (void)HaveMask;
   assert(HaveMask);
@@ -23480,6 +23683,31 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
     }
     return SDValue();
   }
+  case X86ISD::BLENDI: {
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+           "Unexpected input vector types");
+
+    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+    // operands and changing the mask to 1. This saves us a bunch of
+    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+    // x86InstrInfo knows how to commute this back after instruction selection
+    // if it would help register allocation.
+
+    // TODO: If optimizing for size or a processor that doesn't suffer from
+    // partial register update stalls, this should be transformed into a MOVSD
+    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+    if (VT == MVT::v2f64)
+      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+        }
+
+    return SDValue();
+  }
   default:
     return SDValue();
   }
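The RDFLAGS*/WRFLAGS* cases above expand to PUSHF+POP and PUSH+POPF pairs, which is why LowerINTRINSIC_W_CHAIN marks these functions as needing a stack adjustment. User-level builtins that exercise this path (illustrative only; 64-bit variants shown, as provided by <x86intrin.h> in GCC/Clang):

    #include <x86intrin.h>

    unsigned long long get_flags(void) { return __readeflags(); }
    void set_flags(unsigned long long f) { __writeeflags(f); }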
@@ -23573,9 +23801,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
 /// the operands which explicitly discard the lanes which are unused by this
 /// operation to try to flow through the rest of the combiner the fact that
 /// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
+  if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+      (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+    return SDValue();
 
   // We only handle target-independent shuffles.
   // FIXME: It would be easy and harmless to use the target shuffle mask
@@ -23617,12 +23849,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
       isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
     return SDValue();
 
-  // Only specific types are legal at this point, assert so we notice if and
-  // when these change.
-  assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
-          VT == MVT::v4f64) &&
-         "Unknown vector type encountered!");
-
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
 }
 
@@ -23642,8 +23868,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
-  if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
-    if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+  if (TLI.isTypeLegal(VT))
+    if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
       return AddSub;
 
   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
@@ -23745,6 +23971,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
 
   SDValue InVec = N->getOperand(0);
   SDValue EltNo = N->getOperand(1);
+  EVT EltVT = N->getValueType(0);
 
   if (!isa<ConstantSDNode>(EltNo))
     return SDValue();
@@ -23773,14 +24000,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
 
   SmallVector<int, 16> ShuffleMask;
   bool UnaryShuffle;
-  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                             ShuffleMask, UnaryShuffle))
     return SDValue();
 
   // Select the input vector, guarding against out of range extract vector.
   unsigned NumElems = CurrentVT.getVectorNumElements();
   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+
+  if (Idx == SM_SentinelZero)
+    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
+                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
+  if (Idx == SM_SentinelUndef)
+    return DAG.getUNDEF(EltVT);
+
+  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
 
   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                          : InVec.getOperand(1);
@@ -23805,7 +24040,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
     return SDValue();
 
-  EVT EltVT = N->getValueType(0);
   // If there's a bitcast before the shuffle, check if the load type and
   // alignment is valid.
   unsigned Align = LN0->getAlignment();
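combineShuffleToAddSub now gates on subtarget support up front instead of asserting after the match. The fused node corresponds to the SSE3 ADDSUB instructions; the intrinsic form of the target pattern, for reference (illustrative only, not code from the patch):

    #include <immintrin.h>

    // Blend of (a - b) and (a + b) on alternating lanes, i.e. the shuffle
    // pattern the combine above matches, expressed directly as ADDSUBPS.
    __m128 addsub(__m128 a, __m128 b) { return _mm_addsub_ps(a, b); }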
@@ -27124,6 +27358,32 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
 }
 
+/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
+/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
+/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
+/// extends from AH (which we otherwise need to do contortions to access).
+static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  auto OpcodeN = N->getOpcode();
+  auto OpcodeN0 = N0.getOpcode();
+  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
+        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  EVT InVT = N0.getValueType();
+  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
+    return SDValue();
+
+  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
+                                               : X86ISD::UDIVREM8_ZEXT_HREG;
+  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
+                          N0.getOperand(1));
+  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+  return R.getValue(1);
+}
+
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -27134,18 +27394,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   EVT InSVT = InVT.getScalarType();
   SDLoc DL(N);
 
-  // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
-  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
-  // This exposes the sext to the sdivrem lowering, so that it directly extends
-  // from AH (which we otherwise need to do contortions to access).
-  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
-      InVT == MVT::i8 && VT == MVT::i32) {
-    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
-    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
-                            N0.getOperand(0), N0.getOperand(1));
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-    return R.getValue(1);
-  }
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;
 
   if (!DCI.isBeforeLegalizeOps()) {
     if (InVT == MVT::i1) {
@@ -27304,19 +27554,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
     return R;
 
-  // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
-  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
-  // This exposes the zext to the udivrem lowering, so that it directly extends
-  // from AH (which we otherwise need to do contortions to access).
-  if (N0.getOpcode() == ISD::UDIVREM &&
-      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
-      (VT == MVT::i32 || VT == MVT::i64)) {
-    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
-    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
-                            N0.getOperand(0), N0.getOperand(1));
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-    return R.getValue(1);
-  }
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;
 
   return SDValue();
 }
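getDivRem8 folds the extension of the remainder into the 8-bit divide itself, since DIV r/m8 already leaves the remainder in AH. The source pattern it improves (illustrative only):

    // One 8-bit DIV now yields both results, with the remainder sign- or
    // zero-extended straight out of AH.
    struct QR { int quot; int rem; };
    QR divrem_s8(signed char x, signed char y) { return { x / y, x % y }; }
    QR divrem_u8(unsigned char x, unsigned char y) { return { x / y, x % y }; }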
@@ -27382,32 +27621,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue V0 = N->getOperand(0);
-  SDValue V1 = N->getOperand(1);
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
-  // operands and changing the mask to 1. This saves us a bunch of
-  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
-  // x86InstrInfo knows how to commute this back after instruction selection
-  // if it would help register allocation.
-
-  // TODO: If optimizing for size or a processor that doesn't suffer from
-  // partial register update stalls, this should be transformed into a MOVSD
-  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
-  if (VT == MVT::v2f64)
-    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
-        SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
-        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
-      }
-
-  return SDValue();
-}
-
 static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   // Gather and Scatter instructions use k-registers for masks. The type of
@@ -27851,6 +28064,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
@@ -27865,7 +28079,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
   }
@@ -27902,6 +28115,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }
 
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
 /// IsDesirableToPromoteOp - This method query the target whether it is
 /// beneficial for dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
@@ -28667,3 +28892,52 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
                                  Attribute::MinSize);
   return OptSize && !VT.isVector();
 }
+
+void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  if (!Subtarget->is64Bit())
+    return;
+
+  // Update IsSplitCSR in X86MachineFunctionInfo.
+  X86MachineFunctionInfo *AFI =
+      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}
+
+void X86TargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (X86::GR64RegClass.contains(*I))
+      RC = &X86::GR64RegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
+  }
+}