X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=390c4dae9b62f811c1234b6fa6c219ce6183c376;hp=26e69507617ca5bd658ed140b2ce6b2688f4b5f5;hb=f24a5b58cd7ecc4fada221308073b9f13672d6c0;hpb=c3219df36a571450c7d4cb4103d14ae23cb07907 diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 26e69507617..390c4dae9b6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -18,6 +18,7 @@ #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" @@ -25,7 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -264,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes @@ -296,6 +298,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -303,6 +306,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -415,12 +419,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); setOperationAction(ISD::SETCCE , MVT::i8 , Custom); setOperationAction(ISD::SETCCE , MVT::i16 , Custom); setOperationAction(ISD::SETCCE , MVT::i32 , Custom); @@ -619,8 +625,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. + // Long double always uses X87, except f128 in MMX. 
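// f128 values are kept in the 128-bit XMM registers (FR128RegClass), but the
// arithmetic itself is softened to libcalls (TypeSoftenFloat); the sign-bit
// operations FABS/FNEG/FCOPYSIGN are custom-lowered here as bitwise ops on
// the vector register.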
if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -1148,7 +1162,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1321,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SETCCE, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); @@ -1340,6 +1355,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1347,6 +1363,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); @@ -1384,6 +1401,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); @@ -1459,6 +1481,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1505,17 +1528,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal); + 
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); @@ -1525,10 +1548,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); @@ -1539,10 +1562,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); } } // Subtarget->hasCDI() @@ -1559,7 +1582,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1585,6 +1608,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { @@ -1660,13 +1685,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, 
MVT::v32i16, Custom); setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } @@ -1710,8 +1745,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1775,6 +1812,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); @@ -1791,6 +1830,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -2161,6 +2202,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2268,13 +2312,28 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. 
if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2353,7 +2412,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2404,28 +2463,28 @@ enum StructReturnType { StackStructReturn }; static StructReturnType -callIsStructReturn(const SmallVectorImpl &Outs) { +callIsStructReturn(const SmallVectorImpl &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType -argsAreStructReturn(const SmallVectorImpl &Ins) { +argsAreStructReturn(const SmallVectorImpl &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } @@ -2514,6 +2573,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, else ValVT = VA.getValVT(); + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -2522,10 +2594,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. 
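// With X86_INTR nothing pushed a return address, so the slots are re-based:
// the last argument goes to -4/-8 and, when an error code is present, the
// interrupt-frame argument stays at offset 0 (matching the Offset computation
// above). In source form such handlers are roughly
//   __attribute__((interrupt)) void isr(struct interrupt_frame *f);
//   __attribute__((interrupt)) void isr(struct interrupt_frame *f,
//                                       uword_t error_code);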
+ if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, @@ -2605,6 +2686,14 @@ SDValue X86TargetLowering::LowerFormalArguments( assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -2637,6 +2726,8 @@ SDValue X86TargetLowering::LowerFormalArguments( RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -2862,12 +2953,15 @@ SDValue X86TargetLowering::LowerFormalArguments( if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins) == StackStructReturn) + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2987,11 +3081,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + if (Attr.getValueAsString() == "true") isTailCall = false; @@ -3824,6 +3921,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: + case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -4074,6 +4172,35 @@ static bool hasFPCMov(unsigned X86CC) { } } + +bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + if (!IntrData) + return false; + + switch (IntrData->Type) { + case LOADA: + case LOADU: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = (IntrData->Type == LOADA ? 
Info.memVT.getSizeInBits()/8 : 1); + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -4264,19 +4391,9 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// Returns true if V is a constant integer zero. -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast(V); - return C && C->isNullValue(); -} - /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (isZero(Elt)) - return true; - if (ConstantFPSDNode *CFP = dyn_cast(Elt)) - return CFP->getValueAPF().isPosZero(); - return false; + return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants @@ -4314,7 +4431,7 @@ static SDValue getConstVector(ArrayRef Values, MVT VT, } /// Returns a vector of specified type with all zero elements. -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4484,6 +4601,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); @@ -4669,8 +4787,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// uses one source. Note that this will set IsUnary for shuffles which use a /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. -/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. -static bool getTargetShuffleMask(SDNode *N, MVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4687,6 +4804,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::INSERTPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -4779,8 +4901,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); - if (Mask.empty()) - return false; break; } @@ -4798,11 +4918,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); - if (Mask.empty()) return false; - // Mask only contains negative index if an element is zero. 
- if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) - return false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -4851,8 +4967,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { unsigned NumEltsInMask = MaskNode->getNumOperands(); MaskNode = MaskNode->getOperand(0); - auto *CN = dyn_cast(MaskNode); - if (CN) { + if (auto *CN = dyn_cast(MaskNode)) { APInt MaskEltValue = CN->getAPIntValue(); for (unsigned i = 0; i < NumEltsInMask; ++i) RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); @@ -4875,11 +4990,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; - auto *C = dyn_cast(MaskCP->getConstVal()); - if (C) { + if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodeVPERMVMask(C, VT, Mask); - if (Mask.empty()) - return false; break; } return false; @@ -4928,11 +5040,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; - auto *C = dyn_cast(MaskCP->getConstVal()); - if (C) { + if (auto *C = dyn_cast(MaskCP->getConstVal())) { DecodeVPERMV3Mask(C, VT, Mask); - if (Mask.empty()) - return false; break; } return false; @@ -4940,6 +5049,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, default: llvm_unreachable("unknown target shuffle node"); } + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + + // Check if we're getting a shuffle mask with zero'd elements. + if (!AllowSentinelZero) + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. @@ -4978,19 +5097,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); - unsigned NumElems = ShufVT.getVectorNumElements(); + int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; - if (Elt < 0) + if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); + SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -6441,7 +6560,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -8033,6 +8152,7 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, /// For convenience, this code also bundles all of the subtarget feature set /// filtering. 
While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef Mask, const X86Subtarget *Subtarget, @@ -8096,6 +8216,13 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; + MVT BroadcastVT = VT; + + // Peek through any bitcast (only useful for loads). + SDValue BC = V; + while (BC.getOpcode() == ISD::BITCAST) + BC = BC.getOperand(0); + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { @@ -8105,13 +8232,32 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(BC) && !cast(BC)->isVolatile()) { + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast(BC); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = BroadcastVT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); + V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); + return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -10273,6 +10419,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. 
vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle or + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle or + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -10549,9 +10797,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v8i32, VPermMask)), - V1); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. 
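// (That helper covers the remaining cross-lane cases by permuting the 128-bit
// lanes and blending the result with an in-lane shuffle.)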
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10844,6 +11090,11 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have @@ -11310,9 +11561,9 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, int Lane1Cond = -1, Lane2Cond = -1; if (isa(EltCond)) - Lane1Cond = !isZero(EltCond); + Lane1Cond = !isNullConstant(EltCond); if (isa(SndLaneEltCond)) - Lane2Cond = !isZero(SndLaneEltCond); + Lane2Cond = !isNullConstant(SndLaneEltCond); unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) @@ -11353,7 +11604,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( - isa(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1); + isa(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } @@ -11418,9 +11670,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT.getSizeInBits() == 16) { - unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, @@ -11443,8 +11694,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || - (isa(Op.getOperand(1)) && - cast(Op.getOperand(1))->isNullValue())) && + isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -11561,8 +11811,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); - unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), @@ -11594,8 +11843,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. @@ -12229,6 +12477,12 @@ SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); + + // Cygwin uses emutls. + // FIXME: It may be EmulatedTLS-generic also for X86-Android. 
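// Emulated TLS turns the access into a call to the __emutls_get_address
// runtime helper, passing a per-variable __emutls_v.* control block, which is
// what the Cygwin toolchain provides instead of native TLS.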
+ if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -12283,8 +12537,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -12457,13 +12715,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); @@ -12836,7 +13102,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -13174,6 +13446,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. 
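// Only the low bit of each element matters for the i1 result, so shift it
// into the sign-bit position and let the AVX-512 sign-bit-to-mask conversions
// (VPMOVB2M/VPMOVW2M/VPMOVD2M/VPMOVQ2M) or TESTM pick it up,
// e.g. for v16i8: (x << 7) -> VPMOVB2M -> 16-bit k-mask.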
+ unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -13191,39 +13519,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - - if (VT.getVectorElementType() == MVT::i1) { - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? 
MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget->hasAVX512()) { @@ -13400,6 +13697,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -13412,6 +13711,11 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -13443,7 +13747,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -13462,6 +13766,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -13476,13 +13781,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). @@ -13495,12 +13803,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? 
MVT::f128 : MVT::v4f32); SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -13509,8 +13818,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -13526,13 +13836,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -13759,14 +14071,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13895,13 +14207,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - assert(Op0.getValueType() != MVT::i1 && - "Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13947,6 +14257,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. 
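// Reaching this point means the compare was lowered through FNSTSW, which is
// only usable when SAHF is available; targets without SAHF should have used
// FCOMI and never produced this sequence.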
+ assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -14024,11 +14337,6 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast(V); - return C && C->isAllOnesValue(); -} - /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -14044,8 +14352,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); @@ -14604,8 +14911,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) @@ -14616,17 +14922,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast(Op1)->getZExtValue() == 1 || - cast(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -14639,8 +14942,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -14674,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); - return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), - DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
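// Note: the constant tests above use the SelectionDAG predicates
// isNullConstant / isOneConstant / isAllOnesConstant, which fold the usual
//   if (auto *C = dyn_cast<ConstantSDNode>(V)) if (C->isNullValue()) ...
// pattern into a single call.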
@@ -14828,22 +15133,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14863,11 +15167,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14875,11 +15178,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
@@ -14967,11 +15268,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -15361,11 +15663,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -15381,8 +15681,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && - isa(Cond.getOperand(1)) && - cast(Cond.getOperand(1))->isNullValue() && + isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || @@ -15409,11 +15708,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -15457,16 +15754,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } @@ -15660,54 +15955,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
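// CALLSEQ_START/CALLSEQ_END bracket the allocation so later passes do not
// interleave other stack adjustments with it; the matching CALLSEQ_END is
// emitted once, after the three lowering paths below rejoin.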
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -15725,10 +16006,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
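// Illustrative sketch (not part of the patch): the address arithmetic of the
// generic (!Lower) DYNAMIC_STACKALLOC path above -- subtract the allocation
// size from SP, then round down with an AND of -(uint64_t)Align when the
// request exceeds the stack alignment. Plain integer model; the 16-byte
// StackAlign below is a hypothetical value, not taken from the target.
#include <cassert>
#include <cstdint>

static uint64_t allocaNewSP(uint64_t SP, uint64_t Size, uint64_t Align) {
  uint64_t P = SP - Size;          // ISD::SUB: move SP down by the request
  if (Align > 16 /* hypothetical StackAlign */)
    P &= ~(Align - 1);             // ISD::AND with -(uint64_t)Align
  return P;
}

int main() {
  uint64_t SP = 0x7fff0000;
  uint64_t P = allocaNewSP(SP, 100, 64);
  assert(P % 64 == 0 && P <= SP - 100);   // aligned and large enough
  return 0;
}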
X86::RAX : X86::EAX); @@ -15750,9 +16029,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -16029,57 +16313,90 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
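// Illustrative sketch (not part of the patch): the v64i1 case of getMaskNode
// above on a 32-bit target -- the i64 mask is split into two i32 halves with
// EXTRACT_ELEMENT, each half is viewed as 32 mask bits, and the two are
// concatenated. Plain integer model of that bit bookkeeping.
#include <cassert>
#include <cstdint>

// EXTRACT_ELEMENT 0/1 of an i64 on a little-endian 32-bit target.
static uint32_t loHalf(uint64_t m) { return static_cast<uint32_t>(m); }
static uint32_t hiHalf(uint64_t m) { return static_cast<uint32_t>(m >> 32); }

int main() {
  uint64_t Mask = 0xDEADBEEFCAFEBABEull;
  // CONCAT_VECTORS(v32i1 Lo, v32i1 Hi) places Lo in lanes [0,31] and Hi in
  // lanes [32,63], i.e. bit i of the original mask still controls lane i.
  for (unsigned i = 0; i != 64; ++i) {
    bool bit = (Mask >> i) & 1;
    bool lane = i < 32 ? ((loHalf(Mask) >> i) & 1)
                       : ((hiHalf(Mask) >> (i - 32)) & 1);
    assert(bit == lane);
  }
  return 0;
}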
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} + /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask; - unsigned OpcodeSelect = ISD::VSELECT; - SDLoc dl(Op); + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - if (MaskVT.bitsGT(Mask.getSimpleValueType())) { - MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); - VMask = DAG.getBitcast(MaskVT, - DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); - } else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - case X86ISD::VFPCLASS: - return DAG.getNode(ISD::OR, dl, VT, Op, VMask); - case X86ISD::VTRUNC: - case X86ISD::VTRUNCS: - case X86ISD::VTRUNCUS: - // We can't use ISD::VSELECT here because it is not always "Legal" - // for the destination type. For example vpmovqb require only AVX512 - // and vselect that can operate on byte element type require BWI - OpcodeSelect = X86ISD::SELECT; - break; - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. 
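// Illustrative sketch (not part of the patch): the element-wise behaviour
// getVectorMaskingNode above produces for the common (non-compare) case --
// a select that keeps PreservedSrc in lanes whose mask bit is clear, or zero
// when PreservedSrc is undef. Scalar model over plain arrays.
#include <array>
#include <cassert>
#include <cstdint>

template <size_t N>
std::array<int32_t, N> maskedOp(const std::array<int32_t, N> &Op,
                                const std::array<int32_t, N> &PreservedSrc,
                                uint32_t Mask) {
  std::array<int32_t, N> R{};
  for (size_t i = 0; i != N; ++i)
    R[i] = (Mask >> i) & 1 ? Op[i] : PreservedSrc[i];   // ISD::VSELECT
  return R;
}

int main() {
  std::array<int32_t, 4> Op = {10, 20, 30, 40};
  std::array<int32_t, 4> Src = {1, 2, 3, 4};        // pass-through values
  auto R = maskedOp(Op, Src, 0b0101);
  assert(R[0] == 10 && R[1] == 2 && R[2] == 30 && R[3] == 4);
  // Zero-masking: an undef PreservedSrc is replaced by a zero vector.
  auto Z = maskedOp(Op, std::array<int32_t, 4>{}, 0b0011);
  assert(Z[0] == 10 && Z[1] == 20 && Z[2] == 0 && Z[3] == 0);
  return 0;
}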
@@ -16093,7 +16410,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) + if (isAllOnesConstant(Mask)) return Op; MVT VT = Op.getSimpleValueType(); @@ -16103,7 +16420,8 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (Op.getOpcode() == X86ISD::FSETCC) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); - if (Op.getOpcode() == X86ISD::VFPCLASS) + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.getOpcode() == ISD::UNDEF) @@ -16122,15 +16440,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -16147,22 +16466,28 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
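// Illustrative sketch (not part of the patch): the pointer arithmetic spelled
// out in the recoverFramePointer comment above. All addresses and offsets are
// made up; only the two subtractions (32-bit path) and the single addition
// (x64 path) are being modelled.
#include <cassert>
#include <cstdint>

// 32-bit path: RegNodeBase = EntryEBP - RegNodeSize
//              ParentFP    = RegNodeBase - ParentFrameOffset  (offset < 0)
static uint32_t parentFP32(uint32_t EntryEBP, uint32_t RegNodeSize,
                           int32_t ParentFrameOffset) {
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}

// x64 path: ParentFP = EntryRSP + ParentFrameOffset (RSP-after-prologue
// relative, so a single ADD suffices).
static uint64_t parentFP64(uint64_t EntryRSP, int64_t ParentFrameOffset) {
  return EntryRSP + ParentFrameOffset;
}

int main() {
  // Hypothetical frame: 24-byte registration node, .set_setframe offset -64.
  assert(parentFP32(0x1000, 24, -64) == 0x1000 - 24 + 64);
  assert(parentFP64(0x2000, 128) == 0x2080);
  return 0;
}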
+ const X86Subtarget &Subtarget = + static_cast(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize - // ParentFP = RegNodeBase - RegNodeFrameOffset + // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); - return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16377,7 +16702,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: - case VPERM_3OP_MASK: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { @@ -16389,7 +16734,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = SDValue(); // set PassThru element - if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; @@ -16544,6 +16889,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -16558,7 +16921,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, @@ -16567,7 +16930,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget } case BROADCASTM: { SDValue Mask = Op.getOperand(1); - MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } @@ -16575,15 +16939,57 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
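// Illustrative sketch (not part of the patch): the bit-level effect behind the
// KUNPCK lowering here -- two k-register halves are concatenated, with one
// source supplying the low bits and the other the high bits, which is why the
// node below is built with (Src2, Src1) rather than (Src1, Src2). Modelled on
// a kunpckbw-style 8+8 -> 16-bit concatenation; the exact operand binding of
// the intrinsic is not asserted here.
#include <cassert>
#include <cstdint>

static uint16_t kunpckbw(uint8_t HiSrc, uint8_t LoSrc) {
  return static_cast<uint16_t>(HiSrc) << 8 | LoSrc;
}

int main() {
  assert(kunpckbw(0xAB, 0xCD) == 0xABCD);
  // Swapping the operands changes which half each mask occupies -- the reason
  // the arguments are swapped when the node is created.
  assert(kunpckbw(0xCD, 0xAB) == 0xCDAB);
  return 0;
}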
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -16819,7 +17225,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -16986,73 +17392,13 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); - - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); - - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); - - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. 
- int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); - - // Reload the base pointer (ESI) with the adjusted incoming EBP. - SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); - - // Reload the spilled EBP value, now that the stack and base pointers are - // set up. - X86MachineFunctionInfo *X86FI = MF.getInfo(); - X86FI->setHasSEHFramePtrSave(true); - int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); - X86FI->setSEHFramePtrSaveIndex(FI); - SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), - MachinePointerInfo(), false, false, false, - VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); - } - - return Chain; -} - -static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { - MachineFunction &MF = DAG.getMachineFunction(); - SDValue Chain = Op.getOperand(0); - SDValue RegNode = Op.getOperand(2); - WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); - if (!EHInfo) - report_fatal_error("EH registrations only live in functions using WinEH"); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(RegNode); @@ -17078,7 +17424,7 @@ static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, MVT VT = DataToTruncate.getSimpleValueType(); MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - if (isAllOnes(Mask)) // return just a truncate store + if (isAllOnesConstant(Mask)) // return just a truncate store return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MachinePointerInfo(), SVT, false, false, SVT.getScalarSizeInBits()/8); @@ -17107,10 +17453,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) - return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); - else if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasCopyImplyingStackAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. 
+ return SDValue(); + } return SDValue(); } @@ -17215,14 +17571,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = DataToCompress.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a store + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -17241,14 +17596,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case TRUNCATE_TO_MEM_VI32: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -17261,6 +17615,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -17758,9 +18131,6 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); - if (VT.isVector() && Subtarget->hasAVX512()) - return LowerVectorCTLZ_AVX512(Op, DAG); - Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -18266,7 +18636,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -18290,10 +18662,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
@@ -18301,18 +18671,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -18972,8 +19338,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -18988,8 +19353,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -19249,24 +19613,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - MVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. 
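// Illustrative sketch (not part of the patch): the identity used above,
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> Amt,
// which is how the per-byte arithmetic shift is emulated. The reference
// computation assumes the usual arithmetic behaviour of >> on signed int.
#include <cassert>
#include <cstdint>

static uint8_t ashrByte(uint8_t R, unsigned Amt) {
  uint8_t Shifted = R >> Amt;                          // logical shift
  uint8_t Mask = static_cast<uint8_t>(128u >> Amt);
  return static_cast<uint8_t>((Shifted ^ Mask) - Mask);
}

int main() {
  for (int v = -128; v <= 127; ++v)
    for (unsigned s = 0; s < 8; ++s)
      assert(static_cast<int8_t>(ashrByte(static_cast<uint8_t>(v), s)) ==
             static_cast<int8_t>(v >> s));
  return 0;
}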
Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -19652,31 +20029,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. + if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast(Op.getNode()); - MVT VT = N->getValue().getSimpleValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getSimpleValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. 
+ assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + MaskedLoadSDNode *N = cast(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, 
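// Illustrative sketch (not part of the patch): why the {0, 2, -1, -1} shuffle
// above recovers the original v2i32 payload from a type-legalized v2i64.
// Plain memory model; the check assumes a little-endian host, as on x86.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Two i32 values that the type legalizer promoted into a v2i64.
  uint64_t Promoted[2] = {0x00000000AAAAAAAAull, 0x00000000BBBBBBBBull};

  // DAG.getBitcast(MVT::v4i32, Src): reinterpret the 128 bits as 4 x i32.
  uint32_t AsV4[4];
  std::memcpy(AsV4, Promoted, sizeof(AsV4));

  // Shuffle mask {0, 2, -1, -1}: keep lanes 0 and 2, leave the rest undef.
  // On a little-endian target those lanes are the low halves of the two i64
  // elements, i.e. exactly the original v2i32 data.
  uint32_t Recovered[2] = {AsV4[0], AsV4[2]};
  assert(Recovered[0] == 0xAAAAAAAAu && Recovered[1] == 0xBBBBBBBBu);
  return 0;
}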
true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } return Op; } @@ -19687,17 +20236,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); - SDLoc dl(Op); - SDValue Index = N->getIndex(); if (!Subtarget->hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
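// Illustrative sketch (not part of the patch): why the widening done by
// LowerMLOAD/LowerMSTORE above is safe -- the mask is extended with zeros
// (FillWithZeroes), so the extra lanes of the 512-bit operation never touch
// memory and the wide op behaves exactly like the original narrow one.
#include <array>
#include <cassert>
#include <cstdint>

template <size_t N>
void maskedStore(std::array<int, N> &Mem, const std::array<int, N> &Val,
                 uint32_t Mask) {
  for (size_t i = 0; i != N; ++i)
    if ((Mask >> i) & 1)
      Mem[i] = Val[i];           // only enabled lanes are written
}

int main() {
  std::array<int, 16> Mem{};     // pretend 16 lanes of memory
  std::array<int, 16> Wide{};    // ExtendToType'd data
  for (int i = 0; i != 8; ++i) Wide[i] = 100 + i;
  uint32_t Mask8 = 0b10110101;   // original v8i1 mask
  uint32_t Mask16 = Mask8;       // zero-extended to 16 mask bits
  maskedStore(Mem, Wide, Mask16);
  for (int i = 0; i != 8; ++i)
    assert(Mem[i] == (((Mask8 >> i) & 1) ? 100 + i : 0));
  for (int i = 8; i != 16; ++i)
    assert(Mem[i] == 0);         // widened lanes are never touched
  return 0;
}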
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -19840,6 +20431,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -19986,6 +20579,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); @@ -20130,6 +20728,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; @@ -20191,6 +20790,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -20199,6 +20799,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case 
X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VROTLI: return "X86ISD::VROTLI"; + case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; @@ -20326,6 +20928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; } return nullptr; } @@ -20480,7 +21083,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -20668,6 +21271,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert input VAL into EAX + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(0).getReg()); + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert zero to EDX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) + .addReg(X86::EDX) + .addReg(X86::EDX); + // insert WRPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert RDPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); @@ -21240,7 +21884,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, if (LastCMOV == MI && NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() && + NextMIIt->getOperand(1).isKill()) { CascadedCMOV = &*NextMIIt; } @@ -21997,6 +22642,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: + case X86::CMOV_FR128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -22018,6 +22664,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RDFLAGS32: + case X86::RDFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned PushF = + MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; + unsigned Pop = + MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; + BuildMI(*BB, MI, DL, TII->get(PushF)); + BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::WRFLAGS32: + case X86::WRFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned Push = + MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + unsigned PopF = + MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(PopF)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); @@ -22134,7 +22810,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Thread synchronization. 
case X86::MONITOR: return EmitMonitor(MI, BB, Subtarget); - + // PKU feature + case X86::WRPKRU: + return EmitWRPKRU(MI, BB, Subtarget); + case X86::RDPKRU: + return EmitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); @@ -22296,39 +22976,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the -/// same as extracting the high 128-bit part of 256-bit vector and then -/// inserting the result into the low part of a new 256-bit vector -static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - -/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the -/// same as extracting the low 128-bit part of 256-bit vector and then -/// inserting the result into the high part of a new 256-bit vector -static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle or - for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { @@ -22336,7 +22985,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -22402,24 +23051,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, InsV); } - //===--------------------------------------------------------------------===// - // Combine some shuffles into subvector extracts and inserts: - // - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (isShuffleHigh128VectorInsertLow(SVOp)) { - SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); - return DCI.CombineTo(N, InsV); - } - - // vector_shuffle or - if (isShuffleLow128VectorInsertHigh(SVOp)) { - SDValue V = Extract128BitVector(V1, 0, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); - return DCI.CombineTo(N, InsV); - } - return SDValue(); } @@ -22670,7 +23301,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return false; SmallVector OpMask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. 
if (!HaveMask || !IsUnary) return false; @@ -22767,7 +23398,7 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -23052,6 +23683,31 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::BLENDI: { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && + "Unexpected input vector types"); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); + } default: return SDValue(); } @@ -23145,9 +23801,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask @@ -23156,14 +23816,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { return SDValue(); auto *SVN = cast(N); - ArrayRef Mask = SVN->getMask(); + SmallVector Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the SUB node, and the second to - // be the ADD node. - // FIXME: We should support the commuted patterns. - if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. + if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. @@ -23184,12 +23849,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); - // Only specific types are legal at this point, assert so we notice if and - // when these change. 
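// Illustrative sketch (not part of the patch): the BLENDI canonicalization
// above for v2f64 -- blending (V0, V1) with immediate 2 selects the same
// elements as blending (V1, V0) with immediate 1, so the combine can always
// prefer the mask-1 form. blendpd-style semantics: immediate bit i picks
// element i from the second operand, otherwise from the first.
#include <array>
#include <cassert>

static std::array<double, 2> blendPD(const std::array<double, 2> &A,
                                     const std::array<double, 2> &B,
                                     unsigned Imm) {
  return {Imm & 1 ? B[0] : A[0], Imm & 2 ? B[1] : A[1]};
}

int main() {
  std::array<double, 2> V0 = {1.0, 2.0}, V1 = {3.0, 4.0};
  assert(blendPD(V0, V1, 2) == blendPD(V1, V0, 1));   // {V0[0], V1[1]} both ways
  return 0;
}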
- assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || - VT == MVT::v4f64) && - "Unknown vector type encountered!"); - return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } @@ -23209,12 +23868,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. - if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) - if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + if (TLI.isTypeLegal(VT)) + if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -23312,6 +23971,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); + EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); @@ -23340,14 +24000,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + + if (Idx == SM_SentinelZero) + return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) + : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); + if (Idx == SM_SentinelUndef) + return DAG.getUNDEF(EltVT); + + assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); @@ -23372,7 +24040,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - EVT EltVT = N->getValueType(0); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); @@ -23409,9 +24076,9 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, // store-load conversions. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getValueType() == MVT::v2i32 && - isa(N0.getOperand(1))) { + isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); - if (N0.getConstantOperandVal(1) == 0 && N00.getValueType() == MVT::i32) + if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } @@ -23655,7 +24322,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
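// Illustrative sketch (not part of the patch): the pattern matched by
// combineShuffleToAddSub above -- shuffling FSUB(A,B) and FADD(A,B) with mask
// {0, 5, 2, 7} is exactly ADDSUB (even lanes subtract, odd lanes add); the
// commuted FADD/FSUB order is handled by commuting the mask first.
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> Sub, Add, Shuf, AddSub;
  for (int i = 0; i != 4; ++i) {
    Sub[i] = A[i] - B[i];
    Add[i] = A[i] + B[i];
  }
  int Mask[4] = {0, 5, 2, 7};              // indices into concat(Sub, Add)
  for (int i = 0; i != 4; ++i)
    Shuf[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
  for (int i = 0; i != 4; ++i)
    AddSub[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  assert(Shuf == AddSub);
  return 0;
}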
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -24212,12 +24880,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; - ConstantSDNode *CS; - if ((CS = dyn_cast(SetCC.getOperand(0))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; - if ((CS = dyn_cast(SetCC.getOperand(1))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx == -1) break; @@ -24296,8 +24961,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { - ConstantSDNode *CondOp1C = dyn_cast(Cond->getOperand(1)); - if (!CondOp1C || !CondOp1C->isNullValue()) + if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); @@ -24576,9 +25240,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -24587,7 +25253,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -24601,10 +25266,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -24668,6 +25354,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). 
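// Illustrative sketch (not part of the patch): the multiply-by-constant
// decompositions used in PerformMulCombine above. Unsigned wrap-around makes
// each identity hold for every operand value, so the rewrite is always safe.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {0ull, 1ull, 7ull, 123456789ull};
  for (uint64_t x : Vals) {
    // (mul x, 2^N + 1) => (add (shl x, N), x), e.g. x * 9 with N = 3
    assert(x * 9 == (x << 3) + x);
    // (mul x, 2^N - 1) => (sub (shl x, N), x), e.g. x * 7 with N = 3
    assert(x * 7 == (x << 3) - x);
    // MulAmt = 3 * 2^2: an LEA-style multiply by 3 followed by a shift by 2
    assert(x * 12 == (x * 3) << 2);
  }
  return 0;
}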
+ // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast(N01))->getAPIntValue(); + APInt SarConst = (cast(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -24693,7 +25432,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -24707,6 +25446,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (SDValue V = PerformSHLCombine(N, DAG)) return V; + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) @@ -25334,7 +26077,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { return SDValue(); // Make sure we are performing an xor against one. - if (!isa(N1) || !cast(N1)->isOne()) + if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. @@ -25502,12 +26245,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget, - SDLoc(N)); -} - /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 
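A small constexpr check of the ashr-of-shl fold implemented in PerformSRACombine above. The function names are made up for illustration, and it assumes the usual two's-complement arithmetic right shift:

#include <cstdint>

// For i32: (x << 24) >> 27  ==  (sign-extend of the low byte of x) >> 3,
// i.e. SarConst(27) - ShlConst(24) leaves a residual right shift of 3.
constexpr int32_t viaShifts(uint32_t x) { return int32_t(x << 24) >> 27; }
constexpr int32_t viaSext(uint32_t x) { return int32_t(int8_t(x & 0xff)) >> 3; }
static_assert(viaShifts(0x87u) == viaSext(0x87u), "negative low byte");
static_assert(viaShifts(0x7fu) == viaSext(0x7fu), "positive low byte");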
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -25610,8 +26347,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -25692,8 +26429,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -25725,8 +26462,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -26150,33 +26888,231 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. 
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR whose
+/// elements are each extracted from a vector and then truncated, and it is
+/// difficult to perform this optimization on that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
+  // to truncate 2 x v4i32 to v8i16.
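A scalar model (not the DAG code; `packss_lane` and `truncate_lane` are invented names) of why the shl-16/sra-16 preparation above lets the signed-saturating PACKSS act as a plain i32-to-i16 truncation:

#include <cstdint>

// PACKSS clamps each signed i32 lane into i16 range. After (v << 16) >> 16
// every lane already equals the sign extension of its low 16 bits, so the
// clamp never fires and the pack simply keeps the low 16 bits.
constexpr int16_t packss_lane(int32_t v) {
  return v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : v;
}
constexpr int16_t truncate_lane(int32_t v) {
  return packss_lane(int32_t(uint32_t(v) << 16) >> 16);
}
static_assert(truncate_lane(0x12348765) == int16_t(-30875), "keeps low 16 bits");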
+ if (Subtarget->hasSSE41() || OutSVT == MVT::i8) + return combineVectorTruncationWithPACKUS(N, DAG, SubVec); + else if (InSVT == MVT::i32) + return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + else + return SDValue(); +} + +static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // Try to detect AVG pattern first. + SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, + Subtarget, SDLoc(N)); + if (Avg.getNode()) + return Avg; + + return combineVectorTruncation(N, DAG, Subtarget); +} + /// Do target-specific dag combines on floating point negations. static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); SDValue Arg = N->getOperand(0); + SDLoc DL(N); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // If we're negating a FMUL node on a target with FMA, then we can avoid the + // use of a constant by performing (-0 - A*B) instead. + // FIXME: Check rounding control flags as well once it becomes available. + if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && + Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + SDValue Zero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + } // If we're negating a FMA node, then we can adjust the // instruction to include the extra negation. if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: - return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FMSUB: - return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMADD: - return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMSUB: - return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMADD: + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMSUB: + return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMADD: + return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMSUB: + return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); } } return SDValue(); } +static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.is512BitVector() && !Subtarget->hasDQI()) { + // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. + // These logic operations may be executed in the integer domain. 
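A rough scalar sketch of the FNEG(FMUL) rewrite above (`neg_mul` is illustrative, not from the patch): negating a product can be folded into one fused negate-multiply instead of a multiply plus a sign-mask XOR against a constant, at the cost of the sign of a zero product, which is why the combine requires the no-signed-zeros flag.

#include <cmath>

// -(a * b) computed as a single fused operation; for a == +0.0, b > 0 the
// result is +0.0 rather than -0.0, so this is only valid under nsz.
double neg_mul(double a, double b) {
  return std::fma(-a, b, 0.0); // can lower to a vfnmadd/vfnmsub-family op
}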
+ SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -26192,19 +27128,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, if (C->getValueAPF().isPosZero()) return N->getOperand(0); - EVT VT = N->getValueType(0); - if (VT.is512BitVector() && !Subtarget->hasDQI()) { - SDLoc dl(N); - MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); - - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); - unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? ISD::OR : ISD::XOR; - SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); - return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); - } - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. @@ -26228,8 +27152,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. + if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. 
However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. -static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26240,11 +27221,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26255,7 +27237,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -26376,6 +27358,32 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); } +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). +static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + auto OpcodeN = N->getOpcode(); + auto OpcodeN0 = N0.getOpcode(); + if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || + (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); + if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + return SDValue(); + + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? 
X86ISD::SDIVREM8_SEXT_HREG + : X86ISD::UDIVREM8_ZEXT_HREG; + SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), + N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -26386,18 +27394,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, EVT InSVT = InVT.getScalarType(); SDLoc DL(N); - // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> - // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) - // This exposes the sext to the sdivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - InVT == MVT::i8 && VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { @@ -26486,9 +27484,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -26533,8 +27529,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) + if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, @@ -26559,19 +27554,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - // (i8,i32 zext (udivrem (i8 x, i8 y)) -> - // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) - // This exposes the zext to the udivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). 
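A hypothetical C-level illustration (not from the patch) of the pattern getDivRem8 targets. On x86, the 8-bit DIV/IDIV leaves the quotient in AL and the remainder in AH, so folding the extension into the divrem lets the backend read the remainder with a single movzx/movsx from AH:

// The remainder (divrem result number 1) is immediately widened to i32;
// that zext is what getDivRem8 folds into X86ISD::UDIVREM8_ZEXT_HREG.
int widened_rem(unsigned char x, unsigned char y) {
  unsigned char r = x % y; // remainder of the i8 udivrem -> lives in AH
  return r;                // zext i8 -> i32, now taken directly from AH
}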
- if (N0.getOpcode() == ISD::UDIVREM && - N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - (VT == MVT::i32 || VT == MVT::i64)) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; return SDValue(); } @@ -26587,21 +27571,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast(LHS.getOperand(0))) - if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast(RHS.getOperand(0))) - if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { @@ -26639,29 +27621,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { - SDValue V0 = N->getOperand(0); - SDValue V1 = N->getOperand(1); +static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); - EVT VT = N->getValueType(0); - - // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector - // operands and changing the mask to 1. This saves us a bunch of - // pattern-matching possibilities related to scalar math ops in SSE/AVX. - // x86InstrInfo knows how to commute this back after instruction selection - // if it would help register allocation. - - // TODO: If optimizing for size or a processor that doesn't suffer from - // partial register update stalls, this should be transformed into a MOVSD - // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. - - if (VT == MVT::v2f64) - if (auto *Mask = dyn_cast(N->getOperand(2))) - if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { - SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); - return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); - } - + // Gather and Scatter instructions use k-registers for masks. The type of + // the masks is v*i1. So the mask will be truncated anyway. + // The SIGN_EXTEND_INREG my be dropped. 
+ SDValue Mask = N->getOperand(2); + if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps[2] = Mask.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + } return SDValue(); } @@ -27021,8 +27991,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); - if (auto *ExtractIdx = dyn_cast(ExtractedV.getOperand(1))) - if (ExtractIdx->getZExtValue() == 0) { + if (isNullConstant(ExtractedV.getOperand(1))) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { @@ -27077,8 +28046,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case X86ISD::FAND: return PerformFANDCombine(N, DAG); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -27092,6 +28064,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGNR: + case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: @@ -27106,7 +28079,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); - case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); + case ISD::MGATHER: + case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } return SDValue(); @@ -27141,6 +28115,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { } } +/// This function checks if any of the users of EFLAGS copies the EFLAGS. We +/// know that the code that lowers COPY of EFLAGS has to use the stack, and if +/// we don't adjust the stack we clobber the first frame index. +/// See X86InstrInfo::copyPhysReg. +bool X86TargetLowering::hasCopyImplyingStackAdjustment( + MachineFunction *MF) const { + const MachineRegisterInfo &MRI = MF->getRegInfo(); + + return any_of(MRI.reg_instructions(X86::EFLAGS), + [](const MachineInstr &RI) { return RI.isCopy(); }); +} + /// IsDesirableToPromoteOp - This method query the target whether it is /// beneficial for dag combiner to promote the specified node. If true, it /// should return the desired promotion type by reference. @@ -27736,6 +28722,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. 
case MVT::v16i8: case MVT::v8i16: @@ -27820,17 +28807,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); - MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 - : Size == 16 ? MVT::i16 - : Size == 32 ? MVT::i32 - : Size == 64 ? MVT::i64 - : MVT::Other; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; - Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass - : SimpleTy == MVT::i16 ? &X86::GR16RegClass - : SimpleTy == MVT::i32 ? &X86::GR32RegClass + Res.second = Size == 8 ? &X86::GR8RegClass + : Size == 16 ? &X86::GR16RegClass + : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { @@ -27848,6 +28831,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) @@ -27909,26 +28893,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { return OptSize && !VT.isVector(); } -void X86TargetLowering::markInRegArguments(SelectionDAG &DAG, - TargetLowering::ArgListTy& Args) const { - // The MCU psABI requires some arguments to be passed in-register. - // For regular calls, the inreg arguments are marked by the front-end. - // However, for compiler generated library calls, we have to patch this - // up here. - if (!Subtarget->isTargetMCU() || !Args.size()) +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) return; - unsigned FreeRegs = 3; - for (auto &Arg : Args) { - // For library functions, we do not expect any fancy types. - unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty); - unsigned SizeInRegs = (Size + 31) / 32; - if (SizeInRegs > 2 || SizeInRegs > FreeRegs) - continue; + // Update IsSplitCSR in X86MachineFunctionInfo. + X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo(); + AFI->setIsSplitCSR(true); +} - Arg.isInReg = true; - FreeRegs -= SizeInRegs; - if (!FreeRegs) - break; +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. 
If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
+  }
+}