X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=dd9966f9e1791a34f7ab4d6eb358ff2529b7ad0a;hp=f5b38b396965671b64b26433ca8ab67083a78d19;hb=f65e46be097186d748836d42c38a6dc7f30e6c3b;hpb=c30d240fb899bd0ad8295642447f6d48421113d1

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f5b38b39696..dd9966f9e17 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18,6 +18,7 @@
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -25,7 +26,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -199,23 +200,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
   }
 
-  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
-  // are Legal, f80 is custom lowered.
-  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
-  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
-
   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
   // this operation.
   setOperationAction(ISD::FP_TO_SINT     , MVT::i1   , Promote);
   setOperationAction(ISD::FP_TO_SINT     , MVT::i8   , Promote);
 
-  if (X86ScalarSSEf32) {
-    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
-    // f32 and f64 cases are Legal, f80 case is not
-    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  if (!Subtarget->useSoftFloat()) {
+    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
+    // are Legal, f80 is custom lowered.
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+    }
   } else {
-    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
-    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
   }
 
   // Handle FP_TO_UINT by promoting the destination to a larger signed
@@ -258,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // Without SSE, i64->f64 goes through memory.
       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
     }
-  }
+  } else if (!Subtarget->is64Bit())
+    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 
   // Scalar integer divide and remainder are lowered to use operations that
   // produce two results, to match the available instructions. This exposes
@@ -290,6 +298,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f128,  Expand);
   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
@@ -297,6 +306,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f128,  Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
@@ -409,15 +419,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f128 , Custom);
   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f128 , Custom);
+  setOperationAction(ISD::SETCCE          , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCCE          , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCCE          , MVT::i32  , Custom);
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCCE        , MVT::i64  , Custom);
   }
   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
@@ -609,8 +625,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FMA, MVT::f64, Expand);
     setOperationAction(ISD::FMA, MVT::f32, Expand);
 
-  // Long double always uses X87.
+  // Long double always uses X87, except f128 in MMX.
   if (!Subtarget->useSoftFloat()) {
+    if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+      addRegisterClass(MVT::f128, &X86::FR128RegClass);
+      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+      setOperationAction(ISD::FABS , MVT::f128, Custom);
+      setOperationAction(ISD::FNEG , MVT::f128, Custom);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+    }
+
     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -1138,7 +1162,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
 
-    if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
+    if (Subtarget->hasAnyFMA()) {
       setOperationAction(ISD::FMA, MVT::v8f32, Legal);
       setOperationAction(ISD::FMA, MVT::v4f64, Legal);
       setOperationAction(ISD::FMA, MVT::v4f32, Legal);
@@ -1311,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BR_CC, MVT::i1, Expand);
     setOperationAction(ISD::SETCC, MVT::i1, Custom);
+    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
     setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
     setOperationAction(ISD::XOR, MVT::i1, Legal);
     setOperationAction(ISD::OR, MVT::i1, Legal);
@@ -1330,6 +1355,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
     setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
     setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+    setOperationAction(ISD::FABS, MVT::v16f32, Custom);
 
     setOperationAction(ISD::FADD, MVT::v8f64, Legal);
     setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
@@ -1337,6 +1363,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
     setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
     setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+    setOperationAction(ISD::FABS, MVT::v8f64, Custom);
     setOperationAction(ISD::FMA, MVT::v8f64, Legal);
     setOperationAction(ISD::FMA, MVT::v16f32, Legal);
 
@@ -1374,6 +1401,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
       setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+    } else {
+      setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
+      setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
+      setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
+      setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
     }
     setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
@@ -1440,7 +1472,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
 
     setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
     setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
@@ -1449,6 +1481,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
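// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch. A minimal sketch of what the
//  setOperationAction calls in this constructor mean; MyTargetLowering is a
//  hypothetical target, everything else is the real TargetLowering API.]
// The legalizer keeps one action per (opcode, type) pair:
//   Legal   - leave the node alone for instruction selection;
//   Promote - widen the type first (e.g. FP_TO_SINT i8 -> i32);
//   Expand  - let generic legalization rewrite it (shifts, libcalls, ...);
//   Custom  - call this target's LowerOperation hook.
void MyTargetLowering::initSketch() {
  // No native i8 fp-to-int: run the conversion at i32 and truncate.
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  // i64 conversions need hand-written lowering on 32-bit x86.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
}
// ---------------------------------------------------------------------------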
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
@@ -1495,17 +1528,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget->hasCDI()) {
       setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
       setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand);
 
       setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
       setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
       setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
       setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand);
 
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
@@ -1515,10 +1548,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
         setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
         setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
         setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
 
         setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
         setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
@@ -1529,10 +1562,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
         setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
         setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
        setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom);
-        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand);
+        setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand);
       }
     } // Subtarget->hasCDI()
 
@@ -1549,7 +1582,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
         setOperationAction(ISD::OR, VT, Legal);
         setOperationAction(ISD::XOR, VT, Legal);
       }
-      if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+      if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
         setOperationAction(ISD::MGATHER, VT, Custom);
         setOperationAction(ISD::MSCATTER, VT, Custom);
       }
@@ -1575,6 +1608,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Legal);
       setOperationAction(ISD::MSTORE, VT, Legal);
+      setOperationAction(ISD::MGATHER, VT, Legal);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
     }
   }
   for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
@@ -1601,8 +1636,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v32i16, Legal);
     setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
     setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
@@ -1650,13 +1685,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget->hasCDI()) {
       setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
       setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom);
-      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand);
     }
 
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+
+      setOperationAction(ISD::AND, VT, Promote);
+      AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+      setOperationAction(ISD::OR, VT, Promote);
+      AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+      setOperationAction(ISD::XOR, VT, Promote);
+      AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
     }
   }
 
@@ -1700,8 +1745,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-  if (!Subtarget->is64Bit())
+  if (!Subtarget->is64Bit()) {
     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  }
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
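// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch. A hedged sketch of the effect of
//  the Promote + AddPromotedToType(..., MVT::v8i64) pairs above; the function
//  name is illustrative, the DAG calls are the real SelectionDAG API.]
// Bitwise ops are lane-agnostic, so a v64i8/v32i16 AND can legally be done
// as a v8i64 AND between bitcasts, which is what the legalizer emits:
static SDValue promoteLogicOpSketch(SelectionDAG &DAG, SDLoc dl,
                                    SDValue A, SDValue B) {
  SDValue WideA = DAG.getBitcast(MVT::v8i64, A);
  SDValue WideB = DAG.getBitcast(MVT::v8i64, B);
  SDValue WideAnd = DAG.getNode(ISD::AND, dl, MVT::v8i64, WideA, WideB);
  return DAG.getBitcast(A.getSimpleValueType(), WideAnd);
}
// ---------------------------------------------------------------------------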
@@ -1763,12 +1810,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FNEG);
   setTargetDAGCombine(ISD::FMA);
+  setTargetDAGCombine(ISD::FMINNUM);
+  setTargetDAGCombine(ISD::FMAXNUM);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::MSTORE);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -1779,6 +1830,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::MSCATTER);
+  setTargetDAGCombine(ISD::MGATHER);
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
@@ -2149,6 +2202,9 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   MachineFunction &MF = DAG.getMachineFunction();
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
+  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+    report_fatal_error("X86 interrupts may not return any value");
+
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
@@ -2256,13 +2312,28 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                          DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
   }
 
+  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (X86::GR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
   RetOps[0] = Chain;  // Update chain.
 
   // Add the flag if we have it.
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
+  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+  if (CallConv == CallingConv::X86_INTR)
+    opcode = X86ISD::IRET;
+  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
 }
 
 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2341,7 +2412,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
     EVT CopyVT = VA.getLocVT();
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
-    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
@@ -2392,28 +2463,28 @@ enum StructReturnType {
   StackStructReturn
 };
 static StructReturnType
-callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
   if (Outs.empty())
     return NotStructReturn;
 
   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
   if (!Flags.isSRet())
     return NotStructReturn;
-  if (Flags.isInReg())
+  if (Flags.isInReg() || IsMCU)
     return RegStructReturn;
   return StackStructReturn;
 }
 
 /// Determines whether a function uses struct return semantics.
 static StructReturnType
-argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
   if (Ins.empty())
     return NotStructReturn;
 
   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
   if (!Flags.isSRet())
     return NotStructReturn;
-  if (Flags.isInReg())
+  if (Flags.isInReg() || IsMCU)
     return RegStructReturn;
   return StackStructReturn;
 }
 
@@ -2502,6 +2573,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
   else
     ValVT = VA.getValVT();
 
+  // Calculate the SP offset of an interrupt parameter; it re-uses the slot
+  // normally taken by a return address.
+  int Offset = 0;
+  if (CallConv == CallingConv::X86_INTR) {
+    const X86Subtarget &Subtarget =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+    // X86 interrupts may take one or two arguments.
+    // On the stack there will be no return address as in a regular call.
+    // The offset of the last argument needs to be set to -4/-8 bytes.
+    // Accordingly, the offset of the first of two arguments is 0 bytes.
+    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+  }
+
   // FIXME: For now, all byval parameter objects are marked mutable. This can be
   // changed with more analysis.
   // In case of tail call optimization mark all arguments mutable. Since they
@@ -2510,10 +2594,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
     unsigned Bytes = Flags.getByValSize();
     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+    // Adjust SP offset of interrupt parameter.
+    if (CallConv == CallingConv::X86_INTR) {
+      MFI->setObjectOffset(FI, Offset);
+    }
     return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
   } else {
     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                     VA.getLocMemOffset(), isImmutable);
+    // Adjust SP offset of interrupt parameter.
+    if (CallConv == CallingConv::X86_INTR) {
+      MFI->setObjectOffset(FI, Offset);
+    }
+
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
     SDValue Val = DAG.getLoad(
         ValVT, dl, Chain, FIN,
@@ -2593,6 +2686,14 @@ SDValue X86TargetLowering::LowerFormalArguments(
   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
          "Var args not supported with calling convention fastcc, ghc or hipe");
 
+  if (CallConv == CallingConv::X86_INTR) {
+    bool isLegal = Ins.size() == 1 ||
+                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
+    if (!isLegal)
+      report_fatal_error("X86 interrupts may take one or two arguments");
+  }
+
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
@@ -2625,6 +2726,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
         RC = &X86::FR32RegClass;
       else if (RegVT == MVT::f64)
         RC = &X86::FR64RegClass;
+      else if (RegVT == MVT::f128)
+        RC = &X86::FR128RegClass;
      else if (RegVT.is512BitVector())
        RC = &X86::VR512RegClass;
      else if (RegVT.is256BitVector())
@@ -2715,8 +2818,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
                        MFI->CreateFixedObject(1, StackSize, true));
   }
 
-  MachineModuleInfo &MMI = MF.getMMI();
-
   // Figure out if XMM registers are in use.
   assert(!(Subtarget->useSoftFloat() &&
            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
@@ -2852,12 +2953,15 @@ SDValue X86TargetLowering::LowerFormalArguments(
   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+    // X86 interrupts must pop the error code if present.
+    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
   } else {
     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
     // If this is an sret function, the return should pop the hidden pointer.
     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
         !Subtarget->getTargetTriple().isOSMSVCRT() &&
-        argsAreStructReturn(Ins) == StackStructReturn)
+        argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn)
       FuncInfo->setBytesToPopOnReturn(4);
   }
 
@@ -2872,17 +2976,20 @@ SDValue X86TargetLowering::LowerFormalArguments(
 
   FuncInfo->setArgumentStackSize(StackSize);
 
-  if (MMI.hasWinEHFuncInfo(Fn)) {
-    if (Is64Bit) {
-      int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
-      SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
-      MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
-      SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
-      Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
-                           MachinePointerInfo::getFixedStack(
-                               DAG.getMachineFunction(), UnwindHelpFI),
-                           /*isVolatile=*/true,
-                           /*isNonTemporal=*/false, /*Alignment=*/0);
+  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+    if (Personality == EHPersonality::CoreCLR) {
+      assert(Is64Bit);
+      // TODO: Add a mechanism to frame lowering that will allow us to indicate
+      // that we'd prefer this slot be allocated towards the bottom of the frame
+      // (i.e. near the stack pointer after allocating the frame).  Every
+      // funclet needs a copy of this slot in its (mostly empty) frame, and the
+      // offset from the bottom of this and each funclet's frame must be the
+      // same, so the size of funclets' (mostly empty) frames is dictated by
+      // how far this slot is from the bottom (since they allocate just enough
+      // space to accommodate holding this slot at the correct offset).
+      int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+      EHInfo->PSPSymFrameIdx = PSPSymFI;
     }
   }
 
@@ -2974,11 +3081,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit        = Subtarget->is64Bit();
   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
-  StructReturnType SR = callIsStructReturn(Outs);
+  StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU());
   bool IsSibcall      = false;
   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
 
+  if (CallConv == CallingConv::X86_INTR)
+    report_fatal_error("X86 interrupts may not be called directly");
+
   if (Attr.getValueAsString() == "true")
     isTailCall = false;
 
@@ -3811,6 +3921,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
   case X86ISD::SHUFP:
+  case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
   case X86ISD::MOVLHPS:
   case X86ISD::MOVLHPD:
@@ -3950,6 +4061,22 @@ static bool isX86CCUnsigned(unsigned X86CC) {
   }
 }
 
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:  return X86::COND_E;
+  case ISD::SETGT:  return X86::COND_G;
+  case ISD::SETGE:  return X86::COND_GE;
+  case ISD::SETLT:  return X86::COND_L;
+  case ISD::SETLE:  return X86::COND_LE;
+  case ISD::SETNE:  return X86::COND_NE;
+  case ISD::SETULT: return X86::COND_B;
+  case ISD::SETUGT: return X86::COND_A;
+  case ISD::SETULE: return X86::COND_BE;
+  case ISD::SETUGE: return X86::COND_AE;
+  }
+}
+
 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
 /// condition code, returning the condition code and the LHS/RHS of the
 /// comparison to make.
@@ -3973,19 +4100,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
       }
     }
 
-    switch (SetCCOpcode) {
-    default: llvm_unreachable("Invalid integer condition!");
-    case ISD::SETEQ:  return X86::COND_E;
-    case ISD::SETGT:  return X86::COND_G;
-    case ISD::SETGE:  return X86::COND_GE;
-    case ISD::SETLT:  return X86::COND_L;
-    case ISD::SETLE:  return X86::COND_LE;
-    case ISD::SETNE:  return X86::COND_NE;
-    case ISD::SETULT: return X86::COND_B;
-    case ISD::SETUGT: return X86::COND_A;
-    case ISD::SETULE: return X86::COND_BE;
-    case ISD::SETUGE: return X86::COND_AE;
-    }
+    return TranslateIntegerX86CC(SetCCOpcode);
   }
 
   // First determine if it is required or is profitable to flip the operands.
@@ -4057,6 +4172,35 @@ static bool hasFPCMov(unsigned X86CC) {
   }
 }
 
+
+bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  if (!IntrData)
+    return false;
+
+  switch (IntrData->Type) {
+  case LOADA:
+  case LOADU: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
 /// Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
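// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch. A usage sketch of the X86_INTR
//  calling convention handled above, assuming Clang's
//  __attribute__((interrupt)); the handler and struct names are made up.]
// A handler takes a pointer to the hardware-pushed frame and, for exception
// vectors that push one, an error code; LowerFormalArguments verifies this
// shape, LowerCall rejects direct calls, and LowerReturn emits X86ISD::IRET
// (popping the error code when Ins.size() == 2).
struct interrupt_frame;   // layout is defined by the CPU, left opaque here
__attribute__((interrupt))
void page_fault_handler_sketch(struct interrupt_frame *frame,
                               unsigned long error_code) {
  (void)frame;
  (void)error_code;       // two-argument form: callee pops the error code
}
// ---------------------------------------------------------------------------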
@@ -4247,19 +4391,9 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   return getInsertVINSERTImmediate(N, 256);
 }
 
-/// Returns true if V is a constant integer zero.
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (isZero(Elt))
-    return true;
-  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
-    return CFP->getValueAPF().isPosZero();
-  return false;
+  return isNullConstant(Elt) || isNullFPConstant(Elt);
 }
 
 // Build a vector of constants
@@ -4297,7 +4431,7 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
 }
 
 /// Returns a vector of specified type with all zero elements.
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG, SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
@@ -4467,6 +4601,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 
     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+    Result = DAG.getBitcast(CastVT, Result);
     Vec256 = DAG.getBitcast(CastVT, Vec256);
     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
     return DAG.getBitcast(ResultVT, Vec256);
@@ -4481,6 +4616,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 }
 
+/// Insert i1-subvector to i1-vector.
+static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) {
+
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+    return Op;
+
+  MVT OpVT = Op.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+  unsigned NumElems = OpVT.getVectorNumElements();
+  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+
+  assert(IdxVal + SubVecNumElems <= NumElems &&
+         IdxVal % SubVecVT.getSizeInBits() == 0 &&
+         "Unexpected index value in INSERT_SUBVECTOR");
+
+  // There are 3 possible cases:
+  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
+  // 2. Subvector should be inserted in the upper part
+  //    (IdxVal + SubVecNumElems == NumElems)
+  // 3. Subvector should be inserted in the middle (for example v2i1
+  //    to v16i1, index 2)
+
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+  SDValue Undef = DAG.getUNDEF(OpVT);
+  SDValue WideSubVec =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx);
+  if (Vec.isUndef())
+    return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+                       DAG.getConstant(IdxVal, dl, MVT::i8));
+
+  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+    unsigned ShiftLeft = NumElems - SubVecNumElems;
+    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+    WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+                             DAG.getConstant(ShiftLeft, dl, MVT::i8));
+    return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec,
+                                    DAG.getConstant(ShiftRight, dl, MVT::i8))
+                      : WideSubVec;
+  }
+
+  if (IdxVal == 0) {
+    // Zero lower bits of the Vec
+    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+    // Merge them together
+    return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+  }
+
+  // Simple case when we put subvector in the upper part
+  if (IdxVal + SubVecNumElems == NumElems) {
+    // Shift the subvector into place, then zero the upper bits of the Vec
+    WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec,
+                             DAG.getConstant(IdxVal, dl, MVT::i8));
+    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+    return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec);
+  }
+  // Subvector should be inserted in the middle - use shuffle
+  SmallVector<int, 64> Mask;
+  for (unsigned i = 0; i < NumElems; ++i)
+    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
+                   i : i + NumElems);
+  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
+}
+
 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
@@ -4577,8 +4787,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
 /// uses one source. Note that this will set IsUnary for shuffles which use a
 /// single input multiple times, and in those cases it will
 /// adjust the mask to only have indices within that single input.
-/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
-static bool getTargetShuffleMask(SDNode *N, MVT VT,
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
   SDValue ImmN;
@@ -4595,6 +4804,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
+  case X86ISD::INSERTPS:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
   case X86ISD::UNPCKH:
     DecodeUNPCKHMask(VT, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
@@ -4687,8 +4901,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
 
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodePSHUFBMask(C, Mask);
-      if (Mask.empty())
-        return false;
       break;
     }
 
@@ -4706,11 +4918,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   case X86ISD::VPERM2X128:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
-    if (Mask.empty()) return false;
-    // Mask only contains negative index if an element is zero.
-    if (std::any_of(Mask.begin(), Mask.end(),
-                    [](int M){ return M == SM_SentinelZero; }))
-      return false;
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVSLDUP:
     DecodeMOVSLDUPMask(VT, Mask);
@@ -4759,8 +4967,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
       unsigned NumEltsInMask = MaskNode->getNumOperands();
       MaskNode = MaskNode->getOperand(0);
-      auto *CN = dyn_cast<ConstantSDNode>(MaskNode);
-      if (CN) {
+      if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
        APInt MaskEltValue = CN->getAPIntValue();
        for (unsigned i = 0; i < NumEltsInMask; ++i)
          RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
@@ -4783,11 +4990,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
       return false;
 
-    auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
-    if (C) {
+    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodeVPERMVMask(C, VT, Mask);
-      if (Mask.empty())
-        return false;
       break;
     }
     return false;
@@ -4836,11 +5040,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
       return false;
 
-    auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
-    if (C) {
+    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
      DecodeVPERMV3Mask(C, VT, Mask);
-      if (Mask.empty())
-        return false;
      break;
    }
    return false;
@@ -4848,6 +5049,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   default: llvm_unreachable("unknown target shuffle node");
   }
 
+  // Empty mask indicates the decode failed.
+  if (Mask.empty())
+    return false;
+
+  // Check if we're getting a shuffle mask with zeroed elements.
+  if (!AllowSentinelZero)
+    if (std::any_of(Mask.begin(), Mask.end(),
+                    [](int M){ return M == SM_SentinelZero; }))
+      return false;
+
   // If we have a fake unary shuffle, the shuffle mask is spread across two
   // inputs that are actually the same node. Re-map the mask to always point
   // into the first input.
@@ -4886,19 +5097,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 
   // Recurse into target specific vector shuffles to find scalars.
   if (isTargetShuffle(Opcode)) {
     MVT ShufVT = V.getSimpleValueType();
-    unsigned NumElems = ShufVT.getVectorNumElements();
+    int NumElems = (int)ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
     bool IsUnary;
 
-    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+    if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary))
       return SDValue();
 
     int Elt = ShuffleMask[Index];
-    if (Elt < 0)
+    if (Elt == SM_SentinelUndef)
       return DAG.getUNDEF(ShufVT.getVectorElementType());
 
-    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
-                                         : N->getOperand(1);
+    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
+    SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1);
     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
   }
 
@@ -6152,7 +6363,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   unsigned NumZero  = 0;
   unsigned NumNonZero = 0;
-  unsigned NonZeros = 0;
+  uint64_t NonZeros = 0;
   bool IsAllConstants = true;
   SmallSet<SDValue, 8> Values;
   for (unsigned i = 0; i < NumElems; ++i) {
@@ -6166,7 +6377,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (X86::isZeroNode(Elt))
       NumZero++;
     else {
-      NonZeros |= (1 << i);
+      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
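// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch.] The switch from 'unsigned' to
// 'uint64_t' for NonZeros matters once build-vectors can have up to 64
// elements (v64i8): in C++, shifting a 32-bit 1 by bit positions 32..63 is
// undefined behaviour, so the bit must be formed at 64 bits, e.g.:
//   uint64_t Bits = 0;
//   Bits |= (1u << 40);            // UB: shift exceeds the width of 1u
//   Bits |= (uint64_t(1) << 40);   // well-defined, as in the line below
// ---------------------------------------------------------------------------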
+      NonZeros |= ((uint64_t)1 << i);
       NumNonZero++;
     }
   }
@@ -6322,7 +6534,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       // One half is zero or undef.
       unsigned Idx = countTrailingZeros(NonZeros);
       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
-                                Op.getOperand(Idx));
+                               Op.getOperand(Idx));
       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
     }
     return SDValue();
@@ -6330,13 +6542,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   if (EVTBits == 8 && NumElems == 16)
-    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
-                                          Subtarget, *this))
+    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+                                          DAG, Subtarget, *this))
      return V;
 
   if (EVTBits == 16 && NumElems == 8)
-    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
-                                          Subtarget, *this))
+    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+                                          DAG, Subtarget, *this))
      return V;
 
   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
@@ -6348,7 +6560,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> V(NumElems);
   if (NumElems == 4 && NumZero > 0) {
     for (unsigned i = 0; i < 4; ++i) {
-      bool isZero = !(NonZeros & (1 << i));
+      bool isZero = !(NonZeros & (1ULL << i));
       if (isZero)
         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
       else
@@ -6484,7 +6696,26 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   assert(isPowerOf2_32(NumOfOperands) &&
          "Unexpected number of operands in CONCAT_VECTORS");
 
+  SDValue Undef = DAG.getUNDEF(ResVT);
   if (NumOfOperands > 2) {
+    // Specialize the cases when all, or all but one, of the operands are undef.
+    unsigned NumOfDefinedOps = 0;
+    unsigned OpIdx = 0;
+    for (unsigned i = 0; i < NumOfOperands; i++)
+      if (!Op.getOperand(i).isUndef()) {
+        NumOfDefinedOps++;
+        OpIdx = i;
+      }
+    if (NumOfDefinedOps == 0)
+      return Undef;
+    if (NumOfDefinedOps == 1) {
+      unsigned SubVecNumElts =
+          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
+      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
+                         Op.getOperand(OpIdx), IdxVal);
+    }
+
     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                   ResVT.getVectorNumElements()/2);
     SmallVector<SDValue, 2> Ops;
@@ -6498,31 +6729,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   }
 
+  // 2 operands
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  assert(V1.getValueType() == V2.getValueType() &&
+         V1.getValueType().getVectorNumElements() == NumElems/2 &&
+         "Unexpected operands in CONCAT_VECTORS");
+
+  if (ResVT.getSizeInBits() >= 16)
+    return Op; // The operation is legal with KUNPCK
+
   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
-
+  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
   if (IsZeroV1 && IsZeroV2)
-    return getZeroVector(ResVT, Subtarget, DAG, dl);
+    return ZeroVec;
 
   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
-  SDValue Undef = DAG.getUNDEF(ResVT);
-  unsigned NumElems = ResVT.getVectorNumElements();
-  SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
+  if (V2.isUndef())
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  if (IsZeroV2)
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
+
+  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
+  if (V1.isUndef())
+    V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
 
-  V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
-  V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
   if (IsZeroV1)
-    return V2;
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
 
   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
-  // Zero the upper bits of V1
-  V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
-  V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
-  if (IsZeroV2)
-    return V1;
-  return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
 }
 
 static SDValue LowerCONCAT_VECTORS(SDValue Op,
@@ -7336,8 +7574,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
       SDValue &V = (M < Size ? V1 : V2);
       M = M % Size;
 
-      // All mask elements must be in the lower half.
-      if (M >= HalfSize)
+      // The extracted elements must start at a valid index and all mask
+      // elements must be in the lower half.
+      if (i > M || M >= HalfSize)
        return SDValue();
 
      if (Idx < 0 || (Src == V && Idx == (M - i))) {
@@ -7913,6 +8152,7 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
 /// For convenience, this code also bundles all of the subtarget feature set
 /// filtering. While a little annoying to re-dispatch on type here, there isn't
 /// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget *Subtarget,
                                              SelectionDAG &DAG) {
@@ -7976,6 +8216,13 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
       return TruncBroadcast;
 
+  MVT BroadcastVT = VT;
+
+  // Peek through any bitcast (only useful for loads).
+  SDValue BC = V;
+  while (BC.getOpcode() == ISD::BITCAST)
+    BC = BC.getOperand(0);
+
   // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
@@ -7985,13 +8232,32 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
     // Only AVX2 has register broadcasts.
     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
       return SDValue();
+  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+    if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64)
+      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+
+    // If we are broadcasting a load that is only used by the shuffle
+    // then we can reduce the vector load to the broadcasted scalar load.
+    LoadSDNode *Ld = cast<LoadSDNode>(BC);
+    SDValue BaseAddr = Ld->getOperand(1);
+    EVT AddrVT = BaseAddr.getValueType();
+    EVT SVT = BroadcastVT.getScalarType();
+    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+    SDValue NewAddr = DAG.getNode(
+        ISD::ADD, DL, AddrVT, BaseAddr,
+        DAG.getConstant(Offset, DL, AddrVT));
+    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
     // We can't broadcast from a vector register without AVX2, and we can only
     // broadcast from the zero-element of a vector register.
     return SDValue();
   }
 
-  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+  V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V);
+  return DAG.getBitcast(VT, V);
 }
 
 // Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -10153,6 +10419,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+                                               SDValue V2, ArrayRef<int> Mask,
+                                               const X86Subtarget *Subtarget,
+                                               SelectionDAG &DAG) {
+  assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+  if (!UndefLower && !UndefUpper)
+    return SDValue();
+
+  // Upper half is undef and lower half is whole upper subvector.
+  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (UndefUpper &&
+      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(HalfNumElts, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // Lower half is undef and upper half is whole lower subvector.
+  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (UndefLower &&
+      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(HalfNumElts, DL));
+  }
+
+  // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+  if (UndefLower && Subtarget->hasAVX2() &&
+      (VT == MVT::v4f64 || VT == MVT::v4i64))
+    return SDValue();
+
+  // If the shuffle only uses the lower halves of the input operands,
+  // then extract them and perform the 'half' shuffle at half width.
+  // e.g. vector_shuffle <0, 8, 1, 9, u, u, u, u> or <0, 4, u, u>
+  int HalfIdx1 = -1, HalfIdx2 = -1;
+  SmallVector<int, 8> HalfMask;
+  unsigned Offset = UndefLower ? HalfNumElts : 0;
+  for (unsigned i = 0; i != HalfNumElts; ++i) {
+    int M = Mask[i + Offset];
+    if (M < 0) {
+      HalfMask.push_back(M);
+      continue;
+    }
+
+    // Determine which of the 4 half vectors this element is from.
+    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+    int HalfIdx = M / HalfNumElts;
+
+    // Only shuffle using the lower halves of the inputs.
+    // TODO: Investigate usefulness of shuffling with upper halves.
+    if (HalfIdx != 0 && HalfIdx != 2)
+      return SDValue();
+
+    // Determine the element index into its half vector source.
+    int HalfElt = M % HalfNumElts;
+
+    // We can shuffle with up to 2 half vectors, set the new 'half'
+    // shuffle mask accordingly.
+    if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+      HalfMask.push_back(HalfElt);
+      HalfIdx1 = HalfIdx;
+      continue;
+    }
+    if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+      HalfMask.push_back(HalfElt + HalfNumElts);
+      HalfIdx2 = HalfIdx;
+      continue;
+    }
+
+    // Too many half vectors referenced.
+    return SDValue();
+  }
+  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+  auto GetHalfVector = [&](int HalfIdx) {
+    if (HalfIdx < 0)
+      return DAG.getUNDEF(HalfVT);
+    SDValue V = (HalfIdx < 2 ? V1 : V2);
+    HalfIdx = (HalfIdx % 2) * HalfNumElts;
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+                       DAG.getIntPtrConstant(HalfIdx, DL));
+  };
+
+  SDValue Half1 = GetHalfVector(HalfIdx1);
+  SDValue Half2 = GetHalfVector(HalfIdx2);
+  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+                     DAG.getIntPtrConstant(Offset, DL));
+}
+
 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
 /// given mask.
 ///
@@ -10429,9 +10797,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     if (Subtarget->hasAVX2())
       return DAG.getNode(
           X86ISD::VPERMV, DL, MVT::v8f32,
-          DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
-                                                 MVT::v8i32, VPermMask)),
-          V1);
+          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
 
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10724,6 +11090,11 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 DL, VT, V1, V2, Mask, Subtarget, DAG))
       return Insertion;
 
+  // Handle special cases where the lower or upper half is UNDEF.
+  if (SDValue V =
+          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   // can check for those subtargets here and avoid much of the subtarget
   // querying in the per-vector-type lowering routines. With AVX1 we have
@@ -11173,7 +11544,7 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
                                     unsigned &MaskValue) {
   MaskValue = 0;
   unsigned NumElems = BuildVector->getNumOperands();
-  
+
   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
   // We don't handle the >2 lanes case right now.
   unsigned NumLanes = (NumElems - 1) / 8 + 1;
@@ -11190,9 +11561,9 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
 
     int Lane1Cond = -1, Lane2Cond = -1;
     if (isa<ConstantSDNode>(EltCond))
-      Lane1Cond = !isZero(EltCond);
+      Lane1Cond = !isNullConstant(EltCond);
     if (isa<ConstantSDNode>(SndLaneEltCond))
-      Lane2Cond = !isZero(SndLaneEltCond);
+      Lane2Cond = !isNullConstant(SndLaneEltCond);
 
     unsigned LaneMask = 0;
     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
@@ -11233,7 +11604,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
     SDValue CondElt = CondBV->getOperand(i);
     Mask.push_back(
-        isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
+        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+                                     : -1);
   }
   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
 }
@@ -11298,9 +11670,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   }
 
   if (VT.getSizeInBits() == 16) {
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
-    if (Idx == 0)
+    if (isNullConstant(Op.getOperand(1)))
       return DAG.getNode(
           ISD::TRUNCATE, dl, MVT::i16,
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
@@ -11323,8 +11694,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
-         (isa<ConstantSDNode>(Op.getOperand(1)) &&
-          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
@@ -11441,8 +11811,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   // TODO: handle v16i8.
   if (VT.getSizeInBits() == 16) {
     SDValue Vec = Op.getOperand(0);
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
+    if (isNullConstant(Op.getOperand(1)))
       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                      DAG.getBitcast(MVT::v4i32, Vec),
@@ -11474,8 +11843,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
   // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
   // to match extract_elt for f64.
-  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  if (Idx == 0)
+  if (isNullConstant(Op.getOperand(1)))
     return Op;
 
   // UNPCKHPD the element to the lowest double word, then movsd.
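// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch. A source-level sketch of the kind
//  of shuffle the new lowerVectorShuffleWithUndefHalf path (further up) can
//  improve; the function name is illustrative, the intrinsics are standard
//  AVX. When only one 128-bit half of a 256-bit result is defined, the
//  shuffle can run at 128 bits and be reinserted:]
#include <immintrin.h>
static __m256 interleave_low_halves_sketch(__m256 a, __m256 b) {
  __m128 lo_a = _mm256_castps256_ps128(a);    // take the lower halves
  __m128 lo_b = _mm256_castps256_ps128(b);
  __m128 half = _mm_unpacklo_ps(lo_a, lo_b);  // 128-bit 'half' shuffle
  return _mm256_castps128_ps256(half);        // upper half left undefined
}
// ---------------------------------------------------------------------------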
@@ -11753,37 +12121,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
 
-  if (OpVT.getVectorElementType() == MVT::i1) {
-    if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
-      return Op;
-    SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
-    SDValue Undef = DAG.getUNDEF(OpVT);
-    unsigned NumElems = OpVT.getVectorNumElements();
-    SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
-
-    if (IdxVal == OpVT.getVectorNumElements() / 2) {
-      // Zero upper bits of the Vec
-      Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
-      Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
-
-      SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
-                                 SubVec, ZeroIdx);
-      Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
-      return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
-    }
-    if (IdxVal == 0) {
-      SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
-                                 SubVec, ZeroIdx);
-      // Zero upper bits of the Vec2
-      Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
-      Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
-      // Zero lower bits of the Vec
-      Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
-      Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
-      // Merge them together
-      return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
-    }
-  }
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return Insert1BitVector(Op, DAG);
+
   return SDValue();
 }
 
@@ -12137,6 +12477,12 @@ SDValue
 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  // Cygwin uses emutls.
+  // FIXME: It may be EmulatedTLS-generic also for X86-Android.
+  if (Subtarget->isTargetWindowsCygwin())
+    return LowerToTLSEmulatedModel(GA, DAG);
+
   const GlobalValue *GV = GA->getGlobal();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
@@ -12191,8 +12537,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // location.
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
     SDValue Args[] = { Chain, Offset };
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+    Chain =
+        DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+                           DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
 
     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -12365,13 +12715,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
     return Op;
   }
 
+  SDValue ValueToStore = Op.getOperand(0);
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
   unsigned Size = SrcVT.getSizeInBits()/8;
   MachineFunction &MF = DAG.getMachineFunction();
   auto PtrVT = getPointerTy(MF.getDataLayout());
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
   SDValue Chain = DAG.getStore(
-      DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot,
+      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
       false, false, 0);
   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
@@ -12744,7 +13102,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   }
 
   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+  SDValue ValueToStore = Op.getOperand(0);
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
                                StackSlot, MachinePointerInfo(),
                                false, false, 0);
   // For i64 source, we need to add the appropriate power of 2 if the input
@@ -13082,6 +13446,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
   return SDValue();
 }
 
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+  // Shift LSB to MSB and use VPMOVB2M - SKX.
+  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+  if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
+       Subtarget->hasBWI()) ||    // legal, will go to VPMOVB2M, VPMOVW2M
+      ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+       InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
+       Subtarget->hasVLX())) {    // legal, will go to VPMOVB2M, VPMOVW2M
+    // Shift packed bytes not supported natively, bitcast to dword
+    MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+    SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+                                    DAG.getBitcast(ExtVT, In),
+                                    DAG.getConstant(ShiftInx, DL, ExtVT));
+    ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+  }
+  if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
+       Subtarget->hasDQI()) ||    // legal, will go to VPMOVD2M, VPMOVQ2M
+      ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+       InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
+       Subtarget->hasVLX())) {    // legal, will go to VPMOVD2M, VPMOVQ2M
+
+    SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+                                    DAG.getConstant(ShiftInx, DL, InVT));
+    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+  }
+
+  // Shift LSB to MSB, extend if necessary and use TESTM.
+  unsigned NumElts = InVT.getVectorNumElements();
+  if (InVT.getSizeInBits() < 512 &&
+      (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
+       !Subtarget->hasVLX())) {
+    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
+
+    // TESTD/Q should be used (if BW supported we use CVT2MASK above),
+    // so vector should be extended to packed dword/qword.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -13099,39 +13519,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - - if (VT.getVectorElementType() == MVT::i1) { - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget->hasAVX512()) { @@ -13308,6 +13697,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -13320,6 +13711,11 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -13351,7 +13747,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? 
Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -13370,6 +13766,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -13384,13 +13781,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). @@ -13403,12 +13803,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -13417,8 +13818,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -13434,13 +13836,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? 
Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -13667,14 +14071,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13803,13 +14207,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - assert(Op0.getValueType() != MVT::i1 && - "Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13855,6 +14257,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -13932,11 +14337,6 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast(V); - return C && C->isAllOnesValue(); -} - /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -13952,8 +14352,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); @@ -14512,11 +14911,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). 
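// [Editor's note] Illustrative shape of the transform named above, with
// operand types assumed for the example: for an i32 value X and shift
// amount N, the DAG
//
//   setne (and (srl X, N), 1), 0
//
// is matched by the LowerToBT call just below into a bit test plus a
// flag read:
//
//   t0: i32 = X86ISD::BT X, N               ; CF = bit N of X
//   t1: i8  = X86ISD::SETCC COND_B, t0      ; materialize CF
//
// so the shift and the mask never reach instruction selection.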
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; @@ -14525,17 +14922,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast(Op1)->getZExtValue() == 1 || - cast(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -14548,8 +14942,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -14570,6 +14963,26 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return SetCC; +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -14720,22 +15133,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? 
Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14755,11 +15167,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14767,11 +15178,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -14837,8 +15246,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14860,11 +15268,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -15254,11 +15663,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -15274,8 +15681,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). 
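// [Editor's note] Hedged example of what this matcher sees (IR invented
// for illustration):
//
//   %p = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %o = extractvalue { i32, i1 } %p, 1
//   %z = icmp eq i1 %o, false
//   br i1 %z, ...
//
// arrives here as brcond (setcc (sado %a, %b):1, 0, seteq). The code
// below replaces it with the X86ISD arithmetic node itself and a branch
// on the overflow flag (inverted, because of the compare against zero),
// so no i1 value is ever materialized in a register.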
     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
-        isa<ConstantSDNode>(Cond.getOperand(1)) &&
-        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+        isNullConstant(Cond.getOperand(1)) &&
         Cond.getOperand(0).getResNo() == 1 &&
         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
@@ -15302,11 +15708,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 
   // Look past (and (setcc_carry (cmp ...)), 1).
   if (Cond.getOpcode() == ISD::AND &&
-      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
-    if (C && C->getAPIntValue() == 1)
-      Cond = Cond.getOperand(0);
-  }
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
 
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
@@ -15350,16 +15754,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
     switch (CondOpcode) {
     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
     case ISD::SADDO:
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-        if (C->isOne()) {
+      if (isOneConstant(RHS)) {
           X86Opcode = X86ISD::INC;
           X86Cond = X86::COND_O;
           break;
         }
       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
     case ISD::SSUBO:
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
-        if (C->isOne()) {
+      if (isOneConstant(RHS)) {
           X86Opcode = X86ISD::DEC;
           X86Cond = X86::COND_O;
           break;
         }
@@ -15521,8 +15923,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
       // We know the result of AND is compared against zero. Try to match
       // it to BT.
       if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
-        SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
-        if (NewSetCC.getNode()) {
+        if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
           CC = NewSetCC.getOperand(0);
           Cond = NewSetCC.getOperand(1);
           addTest = false;
@@ -15554,54 +15955,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                SplitStack;
   SDLoc dl(Op);
 
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Node->getValueType(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+  bool Is64Bit = Subtarget->is64Bit();
+  MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result;
   if (!Lower) {
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDNode* Node = Op.getNode();
-
     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
-           " not tell us which reg is the stack pointer!");
+                    " not tell us which reg is the stack pointer!");
     EVT VT = Node->getValueType(0);
-    SDValue Tmp1 = SDValue(Node, 0);
-    SDValue Tmp2 = SDValue(Node, 1);
     SDValue Tmp3 = Node->getOperand(2);
-    SDValue Chain = Tmp1.getOperand(0);
 
-    // Chain the dynamic stack allocation so that it doesn't modify the stack
-    // pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -15619,10 +16006,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); @@ -15644,9 +16029,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -15923,60 +16313,98 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (and \p Op, \p Mask) for compare instructions or -/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting or extending for \p Mask when lowering masking intrinsics -static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask; - unsigned OpcodeSelect = ISD::VSELECT; - SDLoc dl(Op); +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return DAG.getTargetConstant(1, dl, MaskVT); + if (X86::isZeroNode(Mask)) + return DAG.getTargetConstant(0, dl, MaskVT); + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); - if (MaskVT.bitsGT(Mask.getSimpleValueType())) { - MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); - VMask = DAG.getBitcast(MaskVT, - DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. 
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); } - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - case X86ISD::VFPCLASS: - return DAG.getNode(ISD::OR, dl, VT, Op, VMask); - case X86ISD::VTRUNC: - case X86ISD::VTRUNCS: - case X86ISD::VTRUNCUS: - // We can't use ISD::VSELECT here because it is not always "Legal" - // for the destination type. For example vpmovqb require only AVX512 - // and vselect that can operate on byte element type require BWI - OpcodeSelect = X86ISD::SELECT; - break; - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } } -/// \brief Creates an SDNode for a predicated scalar operation. +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); +} + +/// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. 
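// [Editor's note] A sketch of how the masking helpers above compose, with
// types assumed for illustration: lowering a masked v8i32 intrinsic with
// an i8 mask and an explicit pass-through would go roughly
//
//   SDValue VMask = getMaskNode(Mask, MVT::v8i1, Subtarget, DAG, dl);
//   // i8 -> v8i1: constant masks fold to a target constant; otherwise
//   // the scalar is extended/truncated and bitcast to the i1 vector
//   SDValue Res = DAG.getNode(ISD::VSELECT, dl, MVT::v8i32,
//                             VMask, Op, PreservedSrc);
//
// so lane i of the result is Op[i] where the mask bit is set and
// PreservedSrc[i] elsewhere. Compare opcodes (PCMPEQM, CMPM, ...) instead
// AND their i1 result with VMask, and the VTRUNC family uses
// X86ISD::SELECT because VSELECT on the narrow destination type is not
// always legal without BWI.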
@@ -15987,7 +16415,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) + if (isAllOnesConstant(Mask)) return Op; MVT VT = Op.getSimpleValueType(); @@ -15997,7 +16425,8 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (Op.getOpcode() == X86ISD::FSETCC) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); - if (Op.getOpcode() == X86ISD::VFPCLASS) + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.getOpcode() == ISD::UNDEF) @@ -16016,15 +16445,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -16041,22 +16471,28 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
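// [Editor's note] Worked example of the recovery math above, with offsets
// invented for illustration: on x86 with the 32-bit MSVC C++ personality,
// RegNodeSize is 16 (see getSEHRegistrationNodeSize), so with a recorded
// ParentFrameOffset of -40:
//
//   RegNodeBase = EntryEBP - 16
//   ParentFP    = RegNodeBase - (-40) = EntryEBP + 24
//
// matching the rule that the offset is negative on x86 and subtracting it
// walks back up to the parent frame pointer. On x64, handled first below,
// the answer is simply EntryEBP + ParentFrameOffset, because there
// EntryEBP actually holds the parent RSP after its prologue.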
+  const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  if (Subtarget.is64Bit())
+    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
   // RegNodeBase = EntryEBP - RegNodeSize
-  // ParentFP = RegNodeBase - RegNodeFrameOffset
+  // ParentFP = RegNodeBase - ParentFrameOffset
   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
-  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset);
+  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
 }
 
 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
@@ -16271,7 +16707,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                   Mask, PassThru, Subtarget, DAG);
     }
     case VPERM_3OP_MASKZ:
-    case VPERM_3OP_MASK:
+    case VPERM_3OP_MASK: {
+      // Src2 is the PassThru.
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = SDValue();
+
+      // Set the PassThru element.
+      if (IntrData->Type == VPERM_3OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        PassThru = DAG.getBitcast(VT, Src2);
+
+      // Swap Src1 and Src2 in the node creation.
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
+                                              Src2, Src1, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
     case FMA_OP_MASK3:
     case FMA_OP_MASKZ:
     case FMA_OP_MASK: {
@@ -16283,7 +16739,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
       SDValue PassThru = SDValue();
 
       // Set the PassThru element.
-      if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ)
+      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
       else if (IntrData->Type == FMA_OP_MASK3)
         PassThru = Src3;
@@ -16438,6 +16894,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                 DAG.getConstant(X86CC, dl, MVT::i8), Cond);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+  case COMI_RM: { // Comparison intrinsics with Sae
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    SDValue CC = Op.getOperand(3);
+    SDValue Sae = Op.getOperand(4);
+    auto ComiType = TranslateX86ConstCondToX86CC(CC);
+    // Choose between ordered and unordered (comi/ucomi).
+    unsigned comiOp = std::get<0>(ComiType) ?
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -16452,26 +16926,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -16689,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); @@ -16716,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -16724,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -16754,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); //SDVTList VTs = DAG.getVTList(MVT::Other); - SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -16874,63 +17370,21 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); - - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); - - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); - - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); - - // Reload the base pointer (ESI) with the adjusted incoming EBP. 
-    SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
-    Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
-
-    // Reload the spilled EBP value, now that the stack and base pointers are
-    // set up.
-    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-    X86FI->setHasSEHFramePtrSave(true);
-    int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
-    X86FI->setSEHFramePtrSaveIndex(FI);
-    SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
-                                MachinePointerInfo(), false, false, false,
-                                VT.getScalarSizeInBits() / 8);
-    Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
-  }
-
+  SDValue RegNode = Op.getOperand(2);
+  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+  if (!EHInfo)
+    report_fatal_error("EH registrations only live in functions using WinEH");
+
+  // Cast the operand to an alloca, and remember the frame index.
+  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
+  if (!FINode)
+    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
+
+  // Return the chain operand without making any DAG nodes.
   return Chain;
 }
 
@@ -16948,7 +17402,7 @@ static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op,
   MVT VT = DataToTruncate.getSimpleValueType();
   MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements());
 
-  if (isAllOnes(Mask)) // return just a truncate store
+  if (isAllOnesConstant(Mask)) // return just a truncate store
     return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr,
                              MachinePointerInfo(), SVT, false, false,
                              SVT.getScalarSizeInBits()/8);
@@ -16977,8 +17431,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
   if (!IntrData) {
-    if (IntNo == llvm::Intrinsic::x86_seh_restoreframe)
-      return LowerSEHRESTOREFRAME(Op, Subtarget, DAG);
+    if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+      return MarkEHRegistrationNode(Op, DAG);
+    if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+      // We need a frame pointer because this will get lowered to a PUSH/POP
+      // sequence.
+      MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+      MFI->setHasCopyImplyingStackAdjustment(true);
+      // Don't do anything here, we will expand these intrinsics out later
+      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+      return SDValue();
+    }
     return SDValue();
   }
@@ -17025,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     SDValue Src = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
-                          Scale, Chain);
+                          Scale, Chain, *Subtarget);
   }
   case PREFETCH: {
     SDValue Hint = Op.getOperand(6);
@@ -17037,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     SDValue Index = Op.getOperand(3);
     SDValue Base = Op.getOperand(4);
     SDValue Scale = Op.getOperand(5);
-    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
+                           *Subtarget);
   }
   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: { @@ -17083,14 +17550,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = DataToCompress.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a store + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -17109,14 +17575,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case TRUNCATE_TO_MEM_VI32: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -17129,6 +17594,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -17626,9 +18110,6 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); - if (VT.isVector() && Subtarget->hasAVX512()) - return LowerVectorCTLZ_AVX512(Op, DAG); - Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -18134,7 +18615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -18158,10 +18641,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
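// [Editor's note] Hedged sketch of the byte-shift idiom in these hunks
// (shift amount assumed for the example): x86 has no per-i8 vector
// shifts, so a v16i8 "shl by 3" is performed on v8i16 and the bits that
// crossed a byte boundary are masked away:
//
//   t0: v8i16 = bitcast t-in
//   t1: v8i16 = X86ISD::VSHLI t0, 3
//   t2: v16i8 = bitcast t1
//   t3: v16i8 = and t2, splat(0xF8)         ; uint8_t(-1U << 3)
//
// The SRL case below mirrors this with splat(0x1F), and SRA is rebuilt
// from SRL via xor/sub with splat(128 >> 3). The diff itself only
// switches the splat from an explicit BUILD_VECTOR to DAG.getConstant.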
@@ -18169,18 +18650,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -18840,8 +19317,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -18856,8 +19332,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -19117,24 +19592,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - MVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -19186,7 +19674,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // chunks, thus directly computes the pop count for v2i64 and v4i64. 
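// [Editor's note] Quick illustration of the PSADBW idiom, with lane
// values invented: psadbw against an all-zero vector adds up each group
// of eight byte values into the low word of the corresponding i64 lane.
// If a lane holds the per-byte popcounts {1,0,2,1,3,0,1,2}, the result
// lane is 10, the popcount of the original 64-bit chunk. The change in
// this hunk only corrects the node's result type: the sum is produced as
// v2i64/v4i64/v8i64 (SadVecVT) rather than as the byte vector type.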
if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -19202,9 +19691,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. @@ -19518,31 +20008,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
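// [Editor's note] Context for the early-out below, hedged: the VSCATTER
// instructions consume and update the mask register, clearing each bit
// as the corresponding lane is stored, so the DAG node has to list the
// mask among its results to model that clobber. A node that already
// produces two values (mask, chain) was rebuilt in this form on an
// earlier visit and can be returned as-is.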
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast(Op.getNode()); - MVT VT = N->getValue().getSimpleValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getSimpleValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If both the index and the data are currently 256-bit, but
+    // the vector already has 8 elements, we can just sign-extend the index
+    if (IndexVT == MVT::v8i32)
+      // Just extend index
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+    else {
+      // The minimal number of elements in a scatter is 8
+      NumElts = 8;
+      // Index
+      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+      // Use the original index here, do not modify the index twice
+      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+      if (IndexVT.getScalarType() == MVT::i32)
+        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+      // Mask
+      // At this point we have a promoted mask operand
+      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+      // Use the original mask here, do not modify the mask twice
+      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+      // The value that should be stored
+      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+      Src = ExtendToType(Src, NewVT, DAG);
+    }
+  }
+  // If the mask is "wide" at this point, truncate it to an i1 vector
+  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+  // The mask is killed by scatter, add it to the values
+  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+  return SDValue(NewScatter.getNode(), 0);
+}
+
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
-    SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                      N->getOperand(3), Index };
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
-    SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
-    DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
-    return SDValue(NewScatter.getNode(), 0);
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    SDValue Src0 = N->getSrc0();
+    Src0 = ExtendToType(Src0, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+                                        N->getBasePtr(), Mask, Src0,
+                                        N->getMemoryVT(), N->getMemOperand(),
+                                        N->getExtensionType());
+
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                  NewLoad.getValue(0),
+                                  DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+  return Op;
+}
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+  SDValue DataToStore = N->getValue();
+  MVT VT = DataToStore.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() ==
MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bits
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+                              Mask, N->getMemoryVT(), N->getMemOperand(),
+                              N->isTruncatingStore());
   }
   return Op;
 }
@@ -19553,17 +20215,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
          "MGATHER/MSCATTER are supported on AVX-512 arch only");
 
   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+  SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Src0 = N->getValue();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  unsigned NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
-  SDLoc dl(Op);
-  SDValue Index = N->getIndex();
 
   if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
-    Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                      N->getOperand(3), Index };
-    DAG.UpdateNodeOperands(N, Ops);
+    // AVX512F supports only 512-bit vectors. Either the data or the index
+    // needs to be 512 bits wide. If both index and data are currently
+    // 256-bit, but the gather already has 8 elements, we can just
+    // sign-extend the index
+    if (NumElts == 8) {
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), Index };
+      DAG.UpdateNodeOperands(N, Ops);
+      return Op;
+    }
+
+    // Minimal number of elements in Gather
+    NumElts = 8;
+    // Index
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+    Index = ExtendToType(Index, NewIndexVT, DAG);
+    if (IndexVT.getScalarType() == MVT::i32)
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+    // Mask
+    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+    // At this point we have a promoted mask operand
+    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+    // The pass-thru value
+    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+    Src0 = ExtendToType(Src0, NewVT, DAG);
+
+    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+                                            N->getMemoryVT(), dl, Ops,
+                                            N->getMemOperand());
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                  NewGather.getValue(0),
+                                  DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
   }
   return Op;
 }
@@ -19655,6 +20359,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
   case ISD::BRCOND:             return
LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -19705,6 +20410,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -19723,6 +20430,36 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
case X86ISD::FMINC: case X86ISD::FMIN: @@ -19821,6 +20558,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); @@ -19965,6 +20707,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; @@ -20026,6 +20769,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -20034,6 +20778,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VROTLI: return "X86ISD::VROTLI"; + case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; @@ -20084,6 +20830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -20160,6 +20907,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; } return nullptr; } @@ -20314,7 +21062,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -20502,6 +21250,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert input VAL into EAX + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(0).getReg()); + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert zero to EDX + BuildMI(*BB, MI, dl, 
TII->get(X86::XOR32rr), X86::EDX) + .addReg(X86::EDX) + .addReg(X86::EDX); + // insert WRPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert RDPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); @@ -21074,7 +21863,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, if (LastCMOV == MI && NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() && + NextMIIt->getOperand(1).isKill()) { CascadedCMOV = &*NextMIIt; } @@ -21380,28 +22170,26 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetMachO()); - - Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, - DL); - - MI->eraseFromParent(); // The pseudo instruction is gone now. - return BB; + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( + *BB->getParent(), *BB, MI, DL, false); + MachineBasicBlock *ResumeBB = ResumeMI->getParent(); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return ResumeBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const Constant *PerFn = MF->getFunction()->getPersonalityFn(); - bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); DebugLoc DL = MI->getDebugLoc(); - assert(!IsSEH && "SEH does not use catchret!"); + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. 
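 // Sketch of the rewrite performed below: a
 //   CATCHRET %target
 // is redirected to a freshly created RestoreMBB, which re-establishes the
 // stack pointer before control continues to %target.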
if (!Subtarget->is32Bit())
@@ -21412,7 +22200,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
   MachineBasicBlock *RestoreMBB =
       MF->CreateMachineBasicBlock(BB->getBasicBlock());
   assert(BB->succ_size() == 1);
-  MF->insert(TargetMBB->getIterator(), RestoreMBB);
+  MF->insert(std::next(BB->getIterator()), RestoreMBB);
   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(RestoreMBB);
   MI->getOperand(0).setMBB(RestoreMBB);
@@ -21439,6 +22227,35 @@ X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
   return BB;
 }
 
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const {
+  // So, here we replace TLSADDR with the sequence:
+  // adjust_stackdown -> TLSADDR -> adjust_stackup.
+  // We need this because TLSADDR is lowered into calls
+  // inside MC, therefore without the two markers shrink-wrapping
+  // may push the prologue/epilogue past them.
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction &MF = *BB->getParent();
+
+  // Emit CALLSEQ_START right before the instruction.
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  MachineInstrBuilder CallseqStart =
+    BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0);
+  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
+
+  // Emit CALLSEQ_END right after the instruction.
+  // We don't call erase from parent because we want to keep the
+  // original instruction around.
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  MachineInstrBuilder CallseqEnd =
+    BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
+  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
+
+  return BB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
                                       MachineBasicBlock *BB) const {
@@ -21457,6 +22274,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   // proper register mask.
   const uint32_t *RegMask =
+      Subtarget->is64Bit() ?
+      Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
       Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
   if (Subtarget->is64Bit()) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
@@ -21817,6 +22636,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::TCRETURNri64:
   case X86::TCRETURNmi64:
     return BB;
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return EmitLoweredTLSAddr(MI, BB);
   case X86::WIN_ALLOCA:
     return EmitLoweredWinAlloca(MI, BB);
   case X86::CATCHRET:
@@ -21831,6 +22655,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return EmitLoweredTLSCall(MI, BB);
   case X86::CMOV_FR32:
   case X86::CMOV_FR64:
+  case X86::CMOV_FR128:
   case X86::CMOV_GR8:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
@@ -21852,6 +22677,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V64I1:
     return EmitLoweredSelect(MI, BB);
 
+  case X86::RDFLAGS32:
+  case X86::RDFLAGS64: {
+    DebugLoc DL = MI->getDebugLoc();
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+    unsigned PushF =
+        MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+    unsigned Pop =
+        MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+    BuildMI(*BB, MI, DL, TII->get(PushF));
+    BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+    MI->eraseFromParent(); // The pseudo is gone now.
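+    // Net effect, e.g. for RDFLAGS64 writing %vreg:
+    //   pushfq
+    //   popq %vreg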
+ return BB; + } + + case X86::WRFLAGS32: + case X86::WRFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned Push = + MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + unsigned PopF = + MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(PopF)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); @@ -21968,7 +22823,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB, Subtarget); - + // PKU feature + case X86::WRPKRU: + return EmitWRPKRU(MI, BB, Subtarget); + case X86::RDPKRU: + return EmitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); @@ -22130,39 +22989,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the -/// same as extracting the high 128-bit part of 256-bit vector and then -/// inserting the result into the low part of a new 256-bit vector -static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - -/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the -/// same as extracting the low 128-bit part of 256-bit vector and then -/// inserting the result into the high part of a new 256-bit vector -static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle or - for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. 
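+/// For example, a shuffle of a 256-bit concat_vectors(X, undef) whose mask
+/// only uses the low elements can often be narrowed to a single 128-bit
+/// subvector insert, as done below.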
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { @@ -22170,7 +22998,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -22236,24 +23064,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, InsV); } - //===--------------------------------------------------------------------===// - // Combine some shuffles into subvector extracts and inserts: - // - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (isShuffleHigh128VectorInsertLow(SVOp)) { - SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); - return DCI.CombineTo(N, InsV); - } - - // vector_shuffle or - if (isShuffleLow128VectorInsertHigh(SVOp)) { - SDValue V = Extract128BitVector(V1, 0, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); - return DCI.CombineTo(N, InsV); - } - return SDValue(); } @@ -22504,7 +23314,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return false; SmallVector OpMask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. if (!HaveMask || !IsUnary) return false; @@ -22601,7 +23411,7 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -22851,6 +23661,66 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; + case X86ISD::UNPCKL: { + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in + // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE + // moves upper half elements into the lower half part. For example: + // + // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, + // undef:v16i8 + // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 + // + // will be combined to: + // + // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 + + // This is only for 128-bit vectors. From SSE4.1 onward this combine may not + // happen due to advanced instructions. 
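+    // Concretely, for v16i8 the vector_shuffle operand is expected to carry
+    // the mask <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>; the std::iota loop
+    // below builds exactly that expected mask.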
+ if (!VT.is128BitVector()) + return SDValue(); + + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.getOpcode() == ISD::UNDEF && + Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + ArrayRef Mask = cast(Op1.getNode())->getMask(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector ExpectedMask(NumElts, -1); + std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, + NumElts / 2); + + auto ShufOp = Op1.getOperand(0); + if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); + } + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && + "Unexpected input vector types"); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); + } default: return SDValue(); } @@ -22944,9 +23814,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, /// the operands which explicitly discard the lanes which are unused by this /// operation to try to flow through the rest of the combiner the fact that /// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask @@ -22955,14 +23829,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { return SDValue(); auto *SVN = cast(N); - ArrayRef Mask = SVN->getMask(); + SmallVector Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the SUB node, and the second to - // be the ADD node. - // FIXME: We should support the commuted patterns. - if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. + if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. 
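 // For example, for v4f32 a blend with mask <0, 5, 2, 7> of (fsub A, B) and
 // (fadd A, B) takes the subtraction in the even lanes and the addition in
 // the odd lanes, which is exactly X86ISD::ADDSUB A, B.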
@@ -22983,12 +23862,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); - // Only specific types are legal at this point, assert so we notice if and - // when these change. - assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || - VT == MVT::v4f64) && - "Unknown vector type encountered!"); - return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); } @@ -23008,12 +23881,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. - if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) - if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + if (TLI.isTypeLegal(VT)) + if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -23111,6 +23984,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); + EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); @@ -23139,14 +24013,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + + if (Idx == SM_SentinelZero) + return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) + : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); + if (Idx == SM_SentinelUndef) + return DAG.getUNDEF(EltVT); + + assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); @@ -23171,7 +24053,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - EVT EltVT = N->getValueType(0); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); @@ -23208,9 +24089,9 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, // store-load conversions. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getValueType() == MVT::v2i32 && - isa(N0.getOperand(1))) { + isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); - if (N0.getConstantOperandVal(1) == 0 && N00.getValueType() == MVT::i32) + if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } @@ -23454,7 +24335,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
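// Roughly: (select (setcc x, y, olt), x, y) --> (fmin x, y), modulo the NaN
// and signed-zero caveats noted above.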
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); @@ -24011,12 +24893,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; - ConstantSDNode *CS; - if ((CS = dyn_cast(SetCC.getOperand(0))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; - if ((CS = dyn_cast(SetCC.getOperand(1))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx == -1) break; @@ -24095,8 +24974,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { - ConstantSDNode *CondOp1C = dyn_cast(Cond->getOperand(1)); - if (!CondOp1C || !CondOp1C->isNullValue()) + if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); @@ -24375,9 +25253,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -24386,7 +25266,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -24400,10 +25279,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -24467,6 +25367,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). 
+  // However the MOVs have 2 advantages over a SHIFT:
+  // 1. MOVs can write to a register that differs from the source
+  // 2. MOVs accept memory operands
+
+  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+      N0.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+  EVT CVT = N1.getValueType();
+
+  if (SarConst.isNegative())
+    return SDValue();
+
+  for (MVT SVT : MVT::integer_valuetypes()) {
+    unsigned ShiftSize = SVT.getSizeInBits();
+    // Skip types without a corresponding sext/zext and ShlConst values that
+    // are not one of [56,48,32,24,16]
+    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+      continue;
+    SDLoc DL(N);
+    SDValue NN =
+        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+    SarConst = SarConst - (Size - ShiftSize);
+    if (SarConst == 0)
+      return NN;
+    else if (SarConst.isNegative())
+      return DAG.getNode(ISD::SHL, DL, VT, NN,
+                         DAG.getConstant(-SarConst, DL, CVT));
+    else
+      return DAG.getNode(ISD::SRA, DL, VT, NN,
+                         DAG.getConstant(SarConst, DL, CVT));
+  }
+  return SDValue();
+}
+
 /// \brief Returns a vector of 0s if the node in input is a vector logical
 /// shift by a constant amount which is known to be bigger than or equal
 /// to the vector element size in bits.
@@ -24492,7 +25445,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
     // the element size. The constant shift amount will be
     // encoded as an 8-bit immediate.
     if (ShiftAmt.trunc(8).uge(MaxAmount))
-      return getZeroVector(VT, Subtarget, DAG, DL);
+      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
   }
 
   return SDValue();
@@ -24506,6 +25459,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   if (SDValue V = PerformSHLCombine(N, DAG))
     return V;
 
+  if (N->getOpcode() == ISD::SRA)
+    if (SDValue V = PerformSRACombine(N, DAG))
+      return V;
+
   // Try to fold this logical shift into a zero vector.
   if (N->getOpcode() != ISD::SRA)
     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
@@ -25133,7 +26090,7 @@ static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
     return SDValue();
 
   // Make sure we are performing an xor against one.
-  if (!isa<ConstantSDNode>(N1) || !cast<ConstantSDNode>(N1)->isOne())
+  if (!isOneConstant(N1))
    return SDValue();
 
   // SetCC on x86 zero extends so only act on this if it's a logical shift.
@@ -25181,6 +26138,126 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// This function detects the AVG pattern on vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces it with the more efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern and it should be
+  // wider than the original input type (i8/i16).
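+  // E.g. for i8: avg(255, 255) must compute (255 + 255 + 1) >> 1 = 255, so
+  // the add has to happen in a type wider than i8 to avoid wrapping.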
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda checking whether the given SDValue is a constant vector with
+  // every element in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is right-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // element is in the range [1, 256] ([1, 65536] for i16).
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit the X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorInRange(Operands[i], 1, 1))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected; emit the X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -25283,8 +26360,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -25365,8 +26442,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -25398,8 +26475,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -25445,6 +26523,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -25813,35 +26901,248 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. -static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); - // F[X]OR(0.0, x) -> x - if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(1); + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 
0xFF : 0xFFFF, DL, InSVT);
+  SDValue MaskVec = DAG.getNode(
+      ISD::BUILD_VECTOR, DL, InVT,
+      SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+  for (auto &Reg : Regs)
+    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+
+  MVT UnpackedVT, PackedVT;
+  if (OutSVT == MVT::i8) {
+    UnpackedVT = MVT::v8i16;
+    PackedVT = MVT::v16i8;
+  } else {
+    UnpackedVT = MVT::v4i32;
+    PackedVT = MVT::v8i16;
+  }
+
+  // In each iteration, halve the element size and pack register pairs.
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
 
-  // F[X]OR(x, 0.0) -> x
-  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
-    if (C->getValueAPF().isPosZero())
-      return N->getOperand(0);
 
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with
+/// each element extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on that form.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
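+  // E.g. v8i32 -> v8i16 masks each lane to 0xFFFF and needs one PACKUS of
+  // the two v4i32 halves, while v8i32 -> v8i8 needs a second PACKUS round
+  // and a final extract_subvector, since v8i8 is not a legal type.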
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use
+  // PACKSS to truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect the AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating an FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once they become available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
+
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation.
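+  // E.g. fneg(fmadd(a, b, c)) = -(a*b + c) = fnmsub(a, b, c), and likewise
+  // fneg(fmsub(a, b, c)) = -(a*b - c) = fnmadd(a, b, c).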
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
   if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ
+    // extension. These logic operations may be executed in the integer
+    // domain.
     SDLoc dl(N);
     MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
     MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
 
     SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
     SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
-    unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? ISD::OR : ISD::XOR;
+    unsigned IntOpcode = 0;
+    switch (N->getOpcode()) {
+      default: llvm_unreachable("Unexpected FP logic op");
+      case X86ISD::FOR: IntOpcode = ISD::OR; break;
+      case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+      case X86ISD::FAND: IntOpcode = ISD::AND; break;
+      case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
     return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
   }
   return SDValue();
 }
 
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
+  // F[X]OR(0.0, x) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+
+  // F[X]OR(x, 0.0) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
 
 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
@@ -25864,8 +27165,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
                      N->getOperand(0), N->getOperand(1));
 }
 
+static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
+                                            const X86Subtarget *Subtarget) {
+  if (Subtarget->useSoftFloat())
+    return SDValue();
+
+  // TODO: Check for global or instruction-level "nnan". In that case, we
+  // should be able to lower to FMAX/FMIN alone.
+  // TODO: If an operand is already known to be a NaN or not a NaN, this
+  // should be an optional swap and FMAX/FMIN.
+
+  EVT VT = N->getValueType(0);
+  if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+        (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+        (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+    return SDValue();
+
+  // This takes at least 3 instructions, so favor a library call when
+  // operating on a scalar and minimizing code size.
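+  // The 3-instruction sequence built below, e.g. for fmaxnum(x, y), is
+  // roughly:
+  //   max = FMAX(y, x)       // FMAX returns x (2nd operand) if either is NaN
+  //   nan = setcc uno x, x   // true iff x is NaN
+  //   res = select nan, y, max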
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. -static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -25876,11 +27234,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -25891,7 +27250,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -26012,6 +27371,32 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); } +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). 
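+/// E.g. (i32 (sext (i8 (srem X, Y)))) becomes a single i8 sdivrem whose
+/// sign-extended remainder is read directly out of AH.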
+/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
+/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
+/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
+/// extends from AH (which we otherwise need to do contortions to access).
+static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  auto OpcodeN = N->getOpcode();
+  auto OpcodeN0 = N0.getOpcode();
+  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
+        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  EVT InVT = N0.getValueType();
+  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
+    return SDValue();
+
+  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
+                                               : X86ISD::UDIVREM8_ZEXT_HREG;
+  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
+                          N0.getOperand(1));
+  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+  return R.getValue(1);
+}
+
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -26022,18 +27407,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   EVT InSVT = InVT.getScalarType();
   SDLoc DL(N);

-  // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
-  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
-  // This exposes the sext to the sdivrem lowering, so that it directly extends
-  // from AH (which we otherwise need to do contortions to access).
-  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
-      InVT == MVT::i8 && VT == MVT::i32) {
-    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
-    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
-                            N0.getOperand(0), N0.getOperand(1));
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-    return R.getValue(1);
-  }
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;

   if (!DCI.isBeforeLegalizeOps()) {
     if (InVT == MVT::i1) {
@@ -26122,9 +27497,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();

   EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
-      (!Subtarget->hasFMA() && !Subtarget->hasFMA4() &&
-       !Subtarget->hasAVX512()))
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
     return SDValue();

   SDValue A = N->getOperand(0);
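The FNEG combine at the top of this section swaps FMA opcodes under negation. The identities behind that switch are exact even with rounding, since IEEE rounding is sign-symmetric (scalar sketch, illustrative):

  #include <cmath>

  double fmadd (double A, double B, double C) { return std::fma( A, B,  C); }
  double fnmsub(double A, double B, double C) { return std::fma(-A, B, -C); }
  // For all A, B, C: -fmadd(A, B, C) == fnmsub(A, B, C), and symmetrically
  // FMSUB <-> FNMADD, which is exactly the opcode swap performed when the
  // FMA node has a single use.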
@@ -26169,8 +27542,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
       N0.getOperand(0).hasOneUse()) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-      if (!C || C->getZExtValue() != 1)
+      if (!isOneConstant(N0.getOperand(1)))
         return SDValue();
       return DAG.getNode(ISD::AND, dl, VT,
                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
@@ -26195,19 +27567,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
     return R;

-  // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
-  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
-  // This exposes the zext to the udivrem lowering, so that it directly extends
-  // from AH (which we otherwise need to do contortions to access).
-  if (N0.getOpcode() == ISD::UDIVREM &&
-      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
-      (VT == MVT::i32 || VT == MVT::i64)) {
-    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
-    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
-                            N0.getOperand(0), N0.getOperand(1));
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-    return R.getValue(1);
-  }
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;

   return SDValue();
 }
@@ -26223,21 +27584,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   SDLoc DL(N);

   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
-      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
-                                   LHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+                                 LHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
-      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
-                                   RHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+                                 RHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }

   if (VT.getScalarType() == MVT::i1 &&
       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
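The eq/ne rewrite above relies on (0 - X) == Y being equivalent to (X + Y) == 0 in wrap-around arithmetic. A quick standalone check (illustrative):

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  bool lhsForm(uint32_t X, uint32_t Y) { return (0u - X) == Y; } // setcc (sub 0, X), Y
  bool rhsForm(uint32_t X, uint32_t Y) { return (X + Y) == 0u; } // setcc (add X, Y), 0

  int main() {
    for (uint32_t X : {0u, 1u, 0x80000000u, 0xffffffffu})
      for (uint32_t Y : {0u, 1u, 0x80000000u, 0xffffffffu})
        assert(lhsForm(X, Y) == rhsForm(X, Y)); // holds for all X, Y
  }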
@@ -26275,29 +27634,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

-static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue V0 = N->getOperand(0);
-  SDValue V1 = N->getOperand(1);
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
-  // operands and changing the mask to 1. This saves us a bunch of
-  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
-  // x86InstrInfo knows how to commute this back after instruction selection
-  // if it would help register allocation.
-
-  // TODO: If optimizing for size or a processor that doesn't suffer from
-  // partial register update stalls, this should be transformed into a MOVSD
-  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
-  if (VT == MVT::v2f64)
-    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
-        SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
-        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
-      }
-
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1, so the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
   return SDValue();
 }

@@ -26474,7 +27821,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   }

   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
-  // a 32-bit target where SSE doesn't support i64->FP operations. 
+  // a 32-bit target where SSE doesn't support i64->FP operations.
   if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
     EVT LdVT = Ld->getValueType(0);
@@ -26657,8 +28004,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
     SDValue ExtractedV = V.getOperand(0);
     SDValue OrigV = ExtractedV.getOperand(0);
-    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
-      if (ExtractIdx->getZExtValue() == 0) {
+    if (isNullConstant(ExtractedV.getOperand(1))) {
       MVT OrigVT = OrigV.getSimpleValueType();
       // Extract a subvector if necessary...
       if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
@@ -26707,12 +28053,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::FNEG:           return PerformFNEGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:
   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
-  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
-  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:        return performFMinNumFMaxNumCombine(N, DAG,
+                                                                Subtarget);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG, Subtarget);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG, Subtarget);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ANY_EXTEND:
@@ -26726,6 +28077,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
@@ -26740,7 +28092,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
   }

   return SDValue();
@@ -26775,6 +28128,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }

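The hook added just below walks every instruction that references EFLAGS and reports whether any of them is a COPY. The same any_of shape in standalone form (illustrative; Instr and the container are stand-ins, not LLVM types):

  #include <algorithm>
  #include <vector>

  struct Instr { bool IsCopy; }; // stand-in for MachineInstr

  // Mirrors the any_of over MRI.reg_instructions(X86::EFLAGS): true as soon
  // as one user of the flags register is a plain COPY.
  bool hasEflagsCopy(const std::vector<Instr> &FlagUsers) {
    return std::any_of(FlagUsers.begin(), FlagUsers.end(),
                       [](const Instr &I) { return I.IsCopy; });
  }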
+/// This function checks whether any user of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
 /// IsDesirableToPromoteOp - This method queries the target whether it is
 /// beneficial for dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
@@ -27370,6 +28735,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       case MVT::f64:
       case MVT::i64:
         return std::make_pair(0U, &X86::FR64RegClass);
+      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
       // Vector types.
       case MVT::v16i8:
       case MVT::v8i16:
@@ -27454,17 +28820,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
       Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
     unsigned Size = VT.getSizeInBits();
-    MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
-                                  : Size == 16 ? MVT::i16
-                                  : Size == 32 ? MVT::i32
-                                  : Size == 64 ? MVT::i64
-                                  : MVT::Other;
-    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
+    if (Size == 1) Size = 8;
+    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
     if (DestReg > 0) {
       Res.first = DestReg;
-      Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
-                 : SimpleTy == MVT::i16 ? &X86::GR16RegClass
-                 : SimpleTy == MVT::i32 ? &X86::GR32RegClass
+      Res.second = Size == 8 ? &X86::GR8RegClass
+                 : Size == 16 ? &X86::GR16RegClass
+                 : Size == 32 ? &X86::GR32RegClass
                  : &X86::GR64RegClass;
       assert(Res.second->contains(Res.first) && "Register in register class");
     } else {
@@ -27482,6 +28844,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // target independent register mapper will just pick the first match it can
   // find, ignoring the required type.

+  // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
   if (VT == MVT::f32 || VT == MVT::i32)
     Res.second = &X86::FR32RegClass;
   else if (VT == MVT::f64 || VT == MVT::i64)
     Res.second = &X86::FR64RegClass;
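The final hunk below replaces markInRegArguments with the split-CSR hooks used for CXX_FAST_TLS. As a rough source-level analogy (illustrative only, not what the backend emits), each callee-saved register is copied into a virtual register once on entry and copied back at every exit, instead of being pushed and popped in a prologue and epilogue:

  // 'Saved' plays the role of the virtual register NewVR; each return site
  // plays the role of an exit block's copy-back before the terminator.
  long splitCSRAnalogy(long CalleeSavedValue, bool EarlyExit) {
    long Saved = CalleeSavedValue; // entry: NewVR = COPY $csr
    if (EarlyExit)
      return Saved;                // exit 1: $csr = COPY NewVR
    return Saved;                  // exit 2: $csr = COPY NewVR
  }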
@@ -27543,26 +28906,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   return OptSize && !VT.isVector();
 }

-void X86TargetLowering::markInRegArguments(SelectionDAG &DAG,
-                                           TargetLowering::ArgListTy& Args) const {
-  // The MCU psABI requires some arguments to be passed in-register.
-  // For regular calls, the inreg arguments are marked by the front-end.
-  // However, for compiler generated library calls, we have to patch this
-  // up here.
-  if (!Subtarget->isTargetMCU() || !Args.size())
+void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  if (!Subtarget->is64Bit())
     return;

-  unsigned FreeRegs = 3;
-  for (auto &Arg : Args) {
-    // For library functions, we do not expect any fancy types.
-    unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty);
-    unsigned SizeInRegs = (Size + 31) / 32;
-    if (SizeInRegs > 2 || SizeInRegs > FreeRegs)
-      continue;
+  // Update IsSplitCSR in X86MachineFunctionInfo.
+  X86MachineFunctionInfo *AFI =
+      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}

-    Arg.isInReg = true;
-    FreeRegs -= SizeInRegs;
-    if (!FreeRegs)
-      break;
+void X86TargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (X86::GR64RegClass.contains(*I))
+      RC = &X86::GR64RegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
   }
 }
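One loose end from PerformGatherScatterCombine above: dropping SIGN_EXTEND_INREG from the mask is safe because only bit 0 of each lane survives the later truncation to the v*i1 k-register type, and sign-extending from bit 0 leaves bit 0 unchanged. A standalone check (illustrative):

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int32_t sextInRegFromI1(int32_t Lane) { return -(Lane & 1); } // replicate bit 0
  bool truncToI1(int32_t Lane) { return (Lane & 1) != 0; }      // keep only bit 0

  int main() {
    for (int32_t Lane : {0, 1, -1, 2, 0x7fffffff})
      assert(truncToI1(sextInRegFromI1(Lane)) == truncToI1(Lane));
  }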