X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=33b01e007db5a78ccf12b2a8af38359d2de4e38a;hb=758b9df87aa32d670dc93739ff51d83e5021b865;hp=5b99adcb1bd4d6f260ac4f0de753f2688c616db0;hpb=644f1ff1842070ac6a3321e52174df1b1df84998;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5b99adcb1bd..19de1baf12d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -67,22 +67,12 @@ static cl::opt ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -static cl::opt ReciprocalEstimateRefinementSteps( - "x86-recip-refinement-steps", cl::init(1), - cl::desc("Specify the number of Newton-Raphson iterations applied to the " - "result of the hardware reciprocal estimate instruction."), - cl::NotHidden); - -// Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2); - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -124,13 +114,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - - // The _ftol2 runtime function has an unusual calling conv, which - // is modeled by a special pseudo-instruction. 
- setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -238,8 +221,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) @@ -248,14 +237,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - } - if (isTargetFTOL()) { - // Use the _ftol2 runtime function, which has a pseudo-instruction - // to handle its weird calling convention. 
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } @@ -318,7 +304,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. + setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); @@ -421,6 +417,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + setOperationAction(ISD::CATCHRET , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. 
As a result, no @@ -479,7 +476,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - if (Subtarget->is64Bit()) { + if (Subtarget->isTarget64BitLP64()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); } else { @@ -498,8 +495,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { - // TargetInfo::X86_64ABIBuiltinVaList + if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { @@ -511,7 +507,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -831,6 +827,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::SMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); @@ -842,13 +843,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each vector - // element. If popcnt isn't supported, always provide the custom version. - if (!Subtarget->hasPOPCNT()) { - setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); - setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); - } + setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { @@ -924,6 +922,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom @@ -951,6 +951,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + // FIXME: Do we need to handle scalar-to-vector here? 
setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1004,6 +1013,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); @@ -1021,6 +1034,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } @@ -1109,7 +1123,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { + setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1139,20 +1158,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v32i8, Legal); + setOperationAction(ISD::SMAX, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v8i32, Legal); + setOperationAction(ISD::UMAX, MVT::v32i8, Legal); + setOperationAction(ISD::UMAX, MVT::v16i16, Legal); + setOperationAction(ISD::UMAX, MVT::v8i32, Legal); + 
setOperationAction(ISD::SMIN, MVT::v32i8, Legal); + setOperationAction(ISD::SMIN, MVT::v16i16, Legal); + setOperationAction(ISD::SMIN, MVT::v8i32, Legal); + setOperationAction(ISD::UMIN, MVT::v32i8, Legal); + setOperationAction(ISD::UMIN, MVT::v16i16, Legal); + setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each - // vector element. If popcnt isn't supported, always provide the custom - // version. - if (!Subtarget->hasPOPCNT()) - setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - - // Custom CTPOP always performs better on natively supported v8i32 - setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); - // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -1182,6 +1204,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, 
MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1192,6 +1227,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. @@ -1257,11 +1293,27 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); + setOperationAction(ISD::SUB, MVT::i1, Custom); + setOperationAction(ISD::ADD, MVT::i1, Custom); + setOperationAction(ISD::MUL, MVT::i1, Custom); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); @@ -1284,13 +1336,10 @@ 
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + // FIXME: [US]INT_TO_FP are not legal for f80. setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); } @@ -1311,12 +1360,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - 
setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); @@ -1368,6 +1460,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8i1, Custom); + setOperationAction(ISD::SMAX, MVT::v16i32, Legal); + setOperationAction(ISD::SMAX, MVT::v8i64, Legal); + setOperationAction(ISD::UMAX, MVT::v16i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i64, Legal); + setOperationAction(ISD::SMIN, MVT::v16i32, Legal); + setOperationAction(ISD::SMIN, 
MVT::v8i64, Legal); + setOperationAction(ISD::UMIN, MVT::v16i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); @@ -1395,6 +1496,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal); + } + if (Subtarget->hasVLX() && Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); } if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); @@ -1465,6 +1578,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::MULHS, MVT::v32i16, Legal); + setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); @@ -1472,9 +1587,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, 
Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); + setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v64i8, Legal); + setOperationAction(ISD::SMAX, MVT::v32i16, Legal); + setOperationAction(ISD::UMAX, MVT::v64i8, Legal); + setOperationAction(ISD::UMAX, MVT::v32i16, Legal); + setOperationAction(ISD::SMIN, MVT::v64i8, Legal); + setOperationAction(ISD::SMIN, MVT::v32i16, Legal); + setOperationAction(ISD::UMIN, MVT::v64i8, Legal); + setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -1515,6 +1654,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v4i32, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i64, Custom); + + setOperationAction(ISD::SMAX, MVT::v2i64, Legal); + setOperationAction(ISD::SMAX, MVT::v4i64, Legal); + setOperationAction(ISD::UMAX, MVT::v2i64, Legal); + setOperationAction(ISD::UMAX, MVT::v4i64, Legal); + setOperationAction(ISD::SMIN, MVT::v2i64, Legal); + setOperationAction(ISD::SMIN, MVT::v4i64, Legal); + 
setOperationAction(ISD::UMIN, MVT::v2i64, Legal); + setOperationAction(ISD::UMIN, MVT::v4i64, Legal); } // We want to custom lower some of our intrinsics. @@ -1595,22 +1743,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); computeRegisterProperties(Subtarget->getRegisterInfo()); - // On Darwin, -Os means optimize for size without hurting performance, - // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. // Predictable cmov don't hurt on atom because it's in-order. @@ -1636,7 +1782,8 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } -EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; @@ -1693,9 +1840,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign); + getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) @@ -1708,10 +1855,11 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. -unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { +unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { if (Subtarget->is64Bit()) { // Max of 8 and alignment of type. - unsigned TyAlign = TD->getABITypeAlignment(Ty); + unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; @@ -1744,10 +1892,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1765,6 +1914,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. 
if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1783,8 +1935,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. return true; } @@ -1814,7 +1980,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. - return MCSymbolRefExpr::Create(MBB->getSymbol(), + return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } @@ -1824,7 +1990,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, if (!Subtarget->is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. - return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); + return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + getPointerTy(DAG.getDataLayout())); return Table; } @@ -1838,7 +2005,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. 
- return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); + return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair @@ -1953,7 +2120,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) - ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); + ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); @@ -1990,13 +2157,13 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); + ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) - ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); + ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } @@ -2016,7 +2183,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy()); + SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, + getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? @@ -2025,7 +2193,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. 
- RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); + RetOps.push_back( + DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } RetOps[0] = Chain; // Update chain. @@ -2219,7 +2388,9 @@ static bool IsCCallConvention(CallingConv::ID CC) { } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; CallSite CS(CI); @@ -2270,14 +2441,15 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); - return DAG.getFrameIndex(FI, getPointerTy()); + return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2423,7 +2595,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) - ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. 
@@ -2453,7 +2625,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (Ins[i].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - MVT PtrTy = getPointerTy(); + MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } @@ -2481,7 +2653,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineModuleInfo &MMI = MF.getMMI(); const Function *WinEHParent = nullptr; - if (IsWin64 && MMI.hasWinEHFuncInfo(Fn)) + if (MMI.hasWinEHFuncInfo(Fn)) WinEHParent = MMI.getWinEHParent(Fn); bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; bool IsWinEHParent = WinEHParent && WinEHParent == Fn; @@ -2543,16 +2715,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); + getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(Offset, dl)); + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -2574,7 +2747,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } else if (IsWinEHOutlined) { + } else if (IsWin64 && IsWinEHOutlined) { // Get to the caller-allocated home save location. 
Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2587,13 +2760,14 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Store the second integer parameter (rdx) into rsp+16 relative to the // stack pointer at the entry of the function. - SDValue RSFIN = - DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); Chain = DAG.getStore( Val.getValue(1), dl, Val, RSFIN, - MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex()), /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } @@ -2662,14 +2836,22 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); if (IsWinEHParent) { - int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); - SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); - MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; - SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); - Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, - MachinePointerInfo::getFixedStack(UnwindHelpFI), - /*isVolatile=*/true, - /*isNonTemporal=*/false, /*Alignment=*/0); + if (Is64Bit) { + int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); + MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; + SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); + Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), UnwindHelpFI), + /*isVolatile=*/true, + /*isNonTemporal=*/false, /*Alignment=*/0); + } else { + // Functions using Win32 EH are 
considered to have opaque SP adjustments + // to force local variables to be addressed from the frame or base + // pointers. + MFI->setHasOpaqueSPAdjustment(true); + } } return Chain; @@ -2683,13 +2865,15 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } /// Emit a load of return address if tail call @@ -2700,7 +2884,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. - EVT VT = getPointerTy(); + EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. @@ -2723,11 +2907,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewReturnAddrFI), + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. 
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -2748,10 +2945,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); - if (MF.getTarget().Options.DisableTailCalls) + if (Attr.getValueAsString() == "true") isTailCall = false; + if (Subtarget->isPICStyleGOT() && + !MF.getTarget().Options.GuaranteedTailCallOpt) { + // If we are using a GOT, disable tail calls to external symbols with + // default visibility. Tail calling such a symbol requires using a GOT + // relocation, which forces early binding of the symbol. This breaks code + // that require lazy function symbol resolution. Using musttail or + // GuaranteedTailCallOpt will override this. + GlobalAddressSDNode *G = dyn_cast(Callee); + if (!G || (!G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility())) + isTailCall = false; + } + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure @@ -2870,22 +3081,23 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. 
- Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); + Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); - Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); Arg = SpillSlot; break; } @@ -2910,7 +3122,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - getPointerTy()); + getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } @@ -2923,8 +3135,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { - RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), - DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); + RegsToPass.push_back(std::make_pair( + unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of @@ -2936,8 +3149,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Note: The actual moving to ECX is done further down. 
GlobalAddressSDNode *G = dyn_cast(Callee); - if (G && !G->getGlobal()->hasHiddenVisibility() && - !G->getGlobal()->hasProtectedVisibility()) + if (G && !G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); @@ -3004,26 +3217,26 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - FIN = DAG.getFrameIndex(FI, getPointerTy()); + FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, - RegInfo->getStackRegister(), - getPointerTy()); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy(DAG.getDataLayout())); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); } } @@ -3032,8 +3245,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store the return address to the appropriate stack slot. 
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, - getPointerTy(), RegInfo->getSlotSize(), - FPDiff, dl); + getPointerTy(DAG.getDataLayout()), + RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -3074,7 +3287,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - (GV->isDeclaration() || GV->isWeakForLinker()) && + !GV->isStrongDefinitionForLinker() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, @@ -3091,17 +3304,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ExtraLoad = true; } - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), - G->getOffset(), OpFlags); + Callee = DAG.getTargetGlobalAddress( + GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); // Add a wrapper if needed. if (WrapperKind != ISD::DELETED_NODE) - Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, + getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection if needed. 
if (ExtraLoad) - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), - false, false, false, 0); + Callee = DAG.getLoad( + getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; @@ -3120,8 +3335,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, OpFlags = X86II::MO_DARWIN_STUB; } - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), - OpFlags); + Callee = DAG.getTargetExternalSymbol( + S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI @@ -3152,9 +3367,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); + + // If this is an invoke in a 32-bit function using an MSVC personality, assume + // the function clobbers all registers. If an exception is thrown, the runtime + // will not restore CSRs. + // FIXME: Model this more precisely so that we can register allocate across + // the normal edge and spill and fill across the exceptional edge. + if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { + const Function *CallerFn = MF.getFunction(); + EHPersonality Pers = + CallerFn->hasPersonalityFn() + ? 
classifyEHPersonality(CallerFn->getPersonalityFn()) + : EHPersonality::Unknown; + if (isMSVCEHPersonality(Pers)) + Mask = RegInfo->getNoPreservedMask(); + } + Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) @@ -3237,8 +3467,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // EDI // local1 .. -/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned -/// for a 16 byte align requirement. +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { @@ -3259,9 +3489,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, @@ -3314,9 +3543,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible -/// for tail call optimization. Targets which want to do tail call -/// optimization should implement this function. +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. 
bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, @@ -3567,6 +3795,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: return true; } } @@ -3618,7 +3848,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { FuncInfo->setRAIndex(ReturnAddrIndex); } - return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); + return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, @@ -3651,8 +3881,8 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, return false; } -/// isCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { switch (CallingConv) { @@ -3689,8 +3919,8 @@ static bool isX86CCUnsigned(unsigned X86CC) { llvm_unreachable("covered switch fell through?!"); } -/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code, returning the condition code and the LHS/RHS of the +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { @@ -3777,8 +4007,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } -/// hasFPCMov - is there a floating point cmov for the specific X86 condition -/// code. 
Current x86 isa includes the following FP cmov instructions: +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { @@ -3796,7 +4026,7 @@ static bool hasFPCMov(unsigned X86CC) { } } -/// isFPImmLegal - Returns true if the target can instruction select the +/// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -3849,19 +4079,27 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasLZCNT(); } -/// isUndefOrInRange - Return true if Val is undef or if its value falls within -/// the specified range (L, H]. +/// Return true if every element in Mask, beginning +/// from position Pos and ending in Pos+Size is undef. +static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i) + if (0 <= Mask[i]) + return false; + return true; +} + +/// Return true if Val is undef or if its value falls within the +/// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrEqual - Val is either less than zero (undef) or equal to the -/// specified value. +/// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } -/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. 
static bool isSequentialOrUndefInRange(ArrayRef Mask, @@ -3872,9 +4110,8 @@ static bool isSequentialOrUndefInRange(ArrayRef Mask, return true; } -/// isVEXTRACTIndex - Return true if the specified -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for instruction that extract 128 or 256 bit vectors +/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector +/// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa(N->getOperand(1).getNode())) @@ -3891,7 +4128,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return Result; } -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR +/// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { @@ -3955,42 +4192,37 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { return Index / NumElemsPerChunk; } -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. 
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } -/// getInsertVINSERT128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } -/// getInsertVINSERT256Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// isZero - Returns true if Elt is a constant integer zero +/// Returns true if Elt is a constant integer zero static bool isZero(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isNullValue(); } -/// isZeroNode - Returns true if Elt is a constant zero or a floating point -/// constant +0.0. +/// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { if (isZero(Elt)) return true; @@ -3999,8 +4231,7 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// getZeroVector - Returns a vector of specified type with all zero elements. -/// +/// Returns a vector of specified type with all zero elements. 
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4045,7 +4276,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, } else llvm_unreachable("Unexpected vector type"); - return DAG.getNode(ISD::BITCAST, dl, VT, Vec); + return DAG.getBitcast(VT, Vec); } static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, @@ -4172,9 +4403,9 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); - Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256); + Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); - return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256); + return DAG.getBitcast(ResultVT, Vec256); } return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); @@ -4204,7 +4435,7 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -/// getOnesVector - Returns a vector of specified type with all bits set. +/// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. @@ -4227,22 +4458,10 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, } else llvm_unreachable("Unexpected vector type"); - return DAG.getNode(ISD::BITCAST, dl, VT, Vec); -} - -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. 
-static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); + return DAG.getBitcast(VT, Vec); } -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +/// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4254,7 +4473,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. +/// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4266,10 +4485,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified -/// vector of zero or undef vector. This produces a shuffle where the low -/// element of V2 is swizzled into the zero/undef vector, landing at element -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, @@ -4285,11 +4504,12 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. Sets -/// IsUnary to true if only uses one source. Note that this will set IsUnary for -/// shuffles which use a single input multiple times, and in those cases it will +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. Sets IsUnary to true if only +/// uses one source. Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. +/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); @@ -4419,6 +4639,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; + // Mask only contains negative index if an element is zero. 
+ if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -4437,6 +4661,122 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getValueType().isInteger() && + MaskNode.getValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa(Op)) { + APInt MaskElement = cast(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + auto *CN = dyn_cast(MaskNode); + if (CN) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + auto *C = 
dyn_cast(MaskCP->getConstVal()); + if (C) { + DecodeVPERMVMask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + EVT MaskVT = MaskNode.getValueType(); + assert(MaskVT.isInteger() && + MaskVT.getVectorNumElements() == VT.getVectorNumElements()); + + SmallVector RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + auto *C = dyn_cast(MaskCP->getConstVal()); + if (C) { + DecodeVPERMV3Mask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } default: llvm_unreachable("unknown target shuffle node"); } @@ -4451,7 +4791,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return true; } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. 
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { @@ -4515,8 +4855,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. -/// +/// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4583,11 +4922,10 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } } - return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); + return DAG.getBitcast(MVT::v16i8, V); } -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. -/// +/// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4618,7 +4956,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +/// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -4721,7 +5059,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); - return DAG.getNode(ISD::BITCAST, DL, VT, Result); + return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. @@ -4731,12 +5069,11 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? 
X86ISD::VSHLDQ : X86ISD::VSRLDQ; - SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); - MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType()); + SrcOp = DAG.getBitcast(ShVT, SrcOp); + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); + return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue @@ -4790,7 +5127,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~(RequiredAlign-1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -4921,7 +5258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, SDValue(ResNode.getNode(), 1)); } - return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); + return DAG.getBitcast(VT, ResNode); } return SDValue(); } @@ -5023,8 +5360,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
@@ -5051,11 +5387,13 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); + SDValue CP = + DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5194,7 +5532,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); @@ -5231,10 +5569,10 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getNode(ISD::BITCAST, dl, VT, Imm); - SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + return DAG.getBitcast(VT, Imm); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } @@ -5249,7 +5587,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; - if (!isa(In)) + if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= 
cast(In)->getZExtValue() << idx; @@ -5276,12 +5614,12 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); - else + else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm); + DstVec = DAG.getBitcast(VT, Imm); else { - SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } @@ -5415,7 +5753,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop -/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI @@ -5790,9 +6128,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). 
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - return DAG.getNode( - ISD::BITCAST, dl, VT, - getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); + return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( + Item, Idx * 2, true, Subtarget, DAG)); } } @@ -5838,7 +6175,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } - return DAG.getNode(ISD::BITCAST, dl, VT, Item); + return DAG.getBitcast(VT, Item); } } @@ -6043,7 +6380,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -6288,6 +6625,92 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. 
+static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask,
+                                                     SDValue V1, SDValue V2) {
+  // One bit per mask element, initially clear; a set bit marks an element
+  // that may be lowered as zero.
+  SmallBitVector Zeroable(Mask.size(), false);
+
+  // Look through bitcasts on both inputs before inspecting them.
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1->getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2->getOperand(0);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy cases: a negative mask entry, or an entry that selects
+    // from an input that is an all-zeros build_vector. Entries in [0, Size)
+    // select from V1; entries >= Size select from V2.
+    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // If this is an index into a build_vector node (which has the same number
+    // of elements), dig out the input value and use it.
+    // Any other input, or a build_vector whose operand count differs from the
+    // mask size, leaves the element marked as not zeroable.
+    SDValue V = M < Size ? V1 : V2;
+    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+      continue;
+
+    SDValue Input = V.getOperand(M % Size);
+    // The UNDEF opcode check really should be dead code here, but not quite
+    // worth asserting on (it isn't invalid, just unexpected).
+    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+      Zeroable[i] = true;
+  }
+
+  return Zeroable;
+}
+
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef Mask,
+                                           SelectionDAG &DAG) {
+  // Build the two per-element mask constants (all-zeros and all-ones) as
+  // integers of the element width, bitcasting them to the element type when
+  // the vector is floating point.
+  MVT EltVT = VT.getScalarType();
+  int NumEltBits = EltVT.getSizeInBits();
+  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+  SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+                                    IntEltVT);
+  if (EltVT.isFloatingPoint()) {
+    Zero = DAG.getBitcast(EltVT, Zero);
+    AllOnes = DAG.getBitcast(EltVT, AllOnes);
+  }
+  // Start from an all-zero mask; elements that must pass through are switched
+  // to AllOnes in the loop below.
+  SmallVector VMaskOps(Mask.size(), Zero);
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  // V is the single input allowed through the mask; it stays null until the
+  // first non-zeroable element is seen.
+  SDValue V;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+    if (Mask[i] % Size != i)
+      return SDValue(); // Not a blend.
+    if (!V)
+      V = Mask[i] < Size ? V1 : V2;
+    else if (V != (Mask[i] < Size ? V1 : V2))
+      return SDValue(); // Can only let one input through the mask.
+
+    VMaskOps[i] = AllOnes;
+  }
+  if (!V)
+    return SDValue(); // No non-zeroable elements!
+
+  // AND the surviving input with the constant mask, using X86ISD::FAND for
+  // floating-point vector types and ISD::AND otherwise.
+  SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+  V = DAG.getNode(VT.isFloatingPoint()
+                  ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+                  DL, VT, V, VMask);
+  return V;
+}
+
 /// \brief Try to emit a blend instruction for a shuffle using bit math.
 ///
 /// This is used as a fallback approach when first class blend instructions are
@@ -6313,10 +6736,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   // We have to cast V2 around.
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - V2 = DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::ANDNP, DL, MaskVT, - DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask), - DAG.getNode(ISD::BITCAST, DL, MaskVT, V2))); + V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, + DAG.getBitcast(MaskVT, V1Mask), + DAG.getBitcast(MaskVT, V2))); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } @@ -6367,11 +6789,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, BlendMask |= 1u << (i * Scale + j); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; - V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); } // FALLTHROUGH case MVT::v8i16: { @@ -6384,11 +6806,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, for (int j = 0; j < Scale; ++j) BlendMask |= 1u << (i * Scale + j); - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8))); + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { @@ -6411,6 +6833,10 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. 
VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -6437,13 +6863,12 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); - V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); - return DAG.getNode( - ISD::BITCAST, DL, VT, - DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask), - V1, V2)); + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, + BlendVT, VSELECTMask), + V1, V2)); } default: @@ -6624,13 +7049,12 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, if (Subtarget->hasSSSE3()) { // Cast the inputs to i8 vector of correct length to match PALIGNR. MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); - Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo); - Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi); + Lo = DAG.getBitcast(AlignVT, Lo); + Hi = DAG.getBitcast(AlignVT, Hi); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, - DAG.getConstant(Rotation * Scale, DL, - MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, + DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } assert(VT.getSizeInBits() == 128 && @@ -6643,104 +7067,18 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, int HiByteShift = Rotation * Scale; // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. 
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); - Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); + Lo = DAG.getBitcast(MVT::v2i64, Lo); + Hi = DAG.getBitcast(MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); + return DAG.getBitcast(VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } -/// \brief Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; - continue; - } - - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. - SDValue V = M < Size ? 
V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) - continue; - - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; - } - - return Zeroable; -} - -/// \brief Try to emit a bitmask instruction for a shuffle. -/// -/// This handles cases where we can model a blend exactly as a bitmask due to -/// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Mask, - SelectionDAG &DAG) { - MVT EltVT = VT.getScalarType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero); - AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes); - } - SmallVector VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SDValue V; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Zeroable[i]) - continue; - if (Mask[i] % Size != i) - return SDValue(); // Not a blend. - if (!V) - V = Mask[i] < Size ? V1 : V2; - else if (V != (Mask[i] < Size ? V1 : V2)) - return SDValue(); // Can only let one input through the mask. - - VMaskOps[i] = AllOnes; - } - if (!V) - return SDValue(); // No non-zeroable elements! - - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; -} - -/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). 
///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
@@ -6805,11 +7143,11 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
            "Illegal integer vector type");
-    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getBitcast(ShiftVT, V);
 
     V = DAG.getNode(OpCode, DL, ShiftVT, V,
                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
-    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+    return DAG.getBitcast(VT, V);
   };
 
   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -6830,6 +7168,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
   return SDValue();
 }
 
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+///
+/// Only matches when the upper half of \p Mask is entirely undef. EXTRQ is
+/// tried first, then INSERTQ. Returns an X86ISD::EXTRQI or X86ISD::INSERTQI
+/// node on success and an empty SDValue when neither pattern matches.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef Mask,
+                                           SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+  int Size = Mask.size();
+  int HalfSize = Size / 2;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  // Upper half must be undefined.
+  if (!isUndefInRange(Mask, HalfSize, HalfSize))
+    return SDValue();
+
+  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+  // Remainder of lower half result is zero and upper half is all undef.
+  auto LowerAsEXTRQ = [&]() {
+    // Determine the extraction length from the part of the
+    // lower half that isn't zeroable.
+    // The scan stops at Len == 0 so that Zeroable[Len - 1] is never read with
+    // a negative index, even in builds where the assert below is compiled out.
+    int Len = HalfSize;
+    for (; Len > 0; --Len)
+      if (!Zeroable[Len - 1])
+        break;
+    assert(Len > 0 && "Zeroable shuffle mask");
+
+    // Attempt to match first Len sequential elements from the lower half.
+    SDValue Src;
+    int Idx = -1;
+    for (int i = 0; i != Len; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      SDValue &V = (M < Size ? V1 : V2);
+      M = M % Size;
+
+      // The extraction must start at a valid (non-negative) index and all
+      // mask elements must be in the lower half, i.e. in [0, HalfSize).
+      // A negative start (i > M) would also collide with the "Idx < 0 means
+      // unset" convention used below.
+      if (i > M || M >= HalfSize)
+        return SDValue();
+
+      if (Idx < 0 || (Src == V && Idx == (M - i))) {
+        Src = V;
+        Idx = M - i;
+        continue;
+      }
+      return SDValue();
+    }
+
+    // No defined element was seen in the first Len positions.
+    if (Idx < 0)
+      return SDValue();
+
+    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+    // Length and index are passed in bits, masked to 6 bits each.
+    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+                       DAG.getConstant(BitLen, DL, MVT::i8),
+                       DAG.getConstant(BitIdx, DL, MVT::i8));
+  };
+
+  if (SDValue ExtrQ = LowerAsEXTRQ())
+    return ExtrQ;
+
+  // INSERTQ: Extract lowest Len elements from lower half of second source and
+  // insert over first source, starting at Idx.
+  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+  auto LowerAsInsertQ = [&]() {
+    for (int Idx = 0; Idx != HalfSize; ++Idx) {
+      SDValue Base;
+
+      // Attempt to match first source from mask before insertion point.
+      if (isUndefInRange(Mask, 0, Idx)) {
+        /* EMPTY */
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+        Base = V1;
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+        Base = V2;
+      } else {
+        continue;
+      }
+
+      // Extend the extraction length looking to match both the insertion of
+      // the second source and the remaining elements of the first.
+      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+        SDValue Insert;
+        int Len = Hi - Idx;
+
+        // Match insertion.
+        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+          Insert = V1;
+        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+          Insert = V2;
+        } else {
+          continue;
+        }
+
+        // Match the remaining elements of the lower half.
+        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+          /* EMPTY */
+        } else if ((!Base || (Base == V1)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+          Base = V1;
+        } else if ((!Base || (Base == V2)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+                                              Size + Hi)) {
+          Base = V2;
+        } else {
+          continue;
+        }
+
+        // We may not have a base (first source) - this can safely be undefined.
+        if (!Base)
+          Base = DAG.getUNDEF(VT);
+
+        // Length and index are passed in bits, masked to 6 bits each.
+        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+                           DAG.getConstant(BitLen, DL, MVT::i8),
+                           DAG.getConstant(BitIdx, DL, MVT::i8));
+      }
+    }
+
+    return SDValue();
+  };
+
+  if (SDValue InsertQ = LowerAsInsertQ())
+    return InsertQ;
+
+  return SDValue();
+}
+
 /// \brief Lower a vector shuffle as a zero or any extension.
 ///
 /// Given a specific number of elements, element bit width, and extension
@@ -6837,7 +7305,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
 /// features of the subtarget.
 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
-    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+    ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   assert(Scale > 1 && "Need a scale to extend.");
   int NumElements = VT.getVectorNumElements();
   int EltBits = VT.getScalarSizeInBits();
@@ -6850,31 +7318,50 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   if (Subtarget->hasSSE41()) {
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
   }
 
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {0, -1, 1, -1}; - return DAG.getNode( - ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {0, -1, 0, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), + DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFHWMask[4] = {1, -1, -1, -1}; - return DAG.getNode( - ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + DAG.getBitcast(MVT::v8i16, InputV), + getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + } + + // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes + // to 64-bits. 
+  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+    assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+    // Extract EltBits bits starting at bit 0 and view the result as v2i64.
+    // Bitcasts go through DAG.getBitcast, matching the rest of this function.
+    SDValue Lo = DAG.getBitcast(
+        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(0, DL, MVT::i8)));
+    // If the upper half of the mask is undef, the low result is all we need.
+    if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+      return DAG.getBitcast(VT, Lo);
+
+    // Extract the next EltBits bits (bit index EltBits) the same way, then
+    // combine both results with UNPCKL on v2i64.
+    SDValue Hi = DAG.getBitcast(
+        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(EltBits, DL, MVT::i8)));
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
   }
 
   // If this would require more than 2 unpack instructions to expand, use
@@ -6886,11 +7373,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     for (int i = 0; i < 16; ++i)
       PSHUFBMask[i] =
           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
-    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
-                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
-                                               MVT::v16i8, PSHUFBMask)));
+    InputV = DAG.getBitcast(MVT::v16i8, InputV);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                  MVT::v16i8, PSHUFBMask)));
   }
 
   // Otherwise emit a sequence of unpacks.
@@ -6898,13 +7385,13 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
     SDValue Ext = AnyExt ?
DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); - InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); + InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); - return DAG.getNode(ISD::BITCAST, DL, VT, InputV); + return DAG.getBitcast(VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any microarch. @@ -6967,7 +7454,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Subtarget, DAG); + DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7002,9 +7489,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( }; if (SDValue V = CanZExtLowHalf()) { - V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V); + V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); - return DAG.getNode(ISD::BITCAST, DL, VT, V); + return DAG.getBitcast(VT, V); } // No viable ext lowering found. @@ -7075,10 +7562,11 @@ static SDValue lowerVectorShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. - if (SDValue V2S = getScalarValueForVectorElement( - V2, Mask[V2Index] - Mask.size(), DAG)) { + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. - V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); + V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. 
@@ -7127,7 +7615,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) - V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into @@ -7139,13 +7627,13 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); + V2 = DAG.getBitcast(MVT::v2i64, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, - DAG.getConstant( - V2Index * EltVT.getSizeInBits()/8, DL, - DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); - V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, + DAG.getTargetLoweringInfo().getScalarShiftAmountTy( + DAG.getDataLayout(), VT))); + V2 = DAG.getBitcast(VT, V2); } } return V2; @@ -7211,6 +7699,23 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. + // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) { + EVT EltVT = VT.getVectorElementType(); + SDValue V0 = V.getOperand(0); + EVT V0VT = V0.getValueType(); + + if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) && + ((V0.getOpcode() == ISD::BUILD_VECTOR || + (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) { + V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx)); + BroadcastIdx = 0; + } + } + + // Also check the simpler case, where we can directly reuse the scalar. 
if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -7368,13 +7873,13 @@ static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. - V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2); + V1 = DAG.getBitcast(UnpackVT, V1); + V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, UnpackVT, V1, V2)); + return DAG.getBitcast( + VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one @@ -7530,12 +8035,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); + V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; - return DAG.getNode( - ISD::BITCAST, DL, MVT::v2i64, + return DAG.getBitcast( + MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } @@ -7556,12 +8061,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, }; if (SDValue V1Pack = GetPackNode(V1)) if (SDValue V2Pack = GetPackNode(V2)) - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, - Mask[0] == 0 ? 
V1Pack.getOperand(0) - : V1Pack.getOperand(1), - Mask[1] == 2 ? V2Pack.getOperand(0) - : V2Pack.getOperand(1))); + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); // Try to use shift instructions. if (SDValue Shift = @@ -7611,10 +8116,10 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); + V1 = DAG.getBitcast(MVT::v2f64, V1); + V2 = DAG.getBitcast(MVT::v2f64, V2); + return DAG.getBitcast(MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Test whether this can be lowered with a single SHUFPS instruction. @@ -7913,11 +8418,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. 
- return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, - DAG.getVectorShuffle( - MVT::v4f32, DL, - DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), - DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); + return DAG.getBitcast( + MVT::v4i32, + DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), + DAG.getBitcast(MVT::v4f32, V2), Mask)); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 @@ -8007,16 +8511,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + bool ThreeAInputs = AToAInputs.size() == 3; + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; - int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; - int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; - int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; - ArrayRef TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; - int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int &TripleDWord = ThreeAInputs ? ADWord : BDWord; + int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; + int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; + ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; + int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); @@ -8085,8 +8591,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); - int APinnedIdx = - AToAInputs.size() == 3 ? 
TripleNonInputIdx : OneInput; + int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } @@ -8095,11 +8600,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; - V = DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, - DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, - DAG))); + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) @@ -8340,11 +8844,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) - V = DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, - DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, - DAG))); + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. @@ -8405,11 +8908,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1), + DAG.getBitcast(MVT::v16i8, V1), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2), + DAG.getBitcast(MVT::v16i8, V2), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); // If we need shuffled inputs from both, blend the two. 
@@ -8420,7 +8923,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, V = V1InUse ? V1 : V2; // Cast the result back to the correct type. - return DAG.getNode(ISD::BITCAST, DL, VT, V); + return DAG.getBitcast(VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. @@ -8497,6 +9000,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return V; + // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, @@ -8649,6 +9157,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return V; + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); @@ -8721,10 +9234,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } - V1 = DAG.getNode( - ISD::BITCAST, DL, MVT::v16i8, - DAG.getVectorShuffle(MVT::v8i16, DL, - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + V1 = DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. 
@@ -8742,16 +9254,19 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entrties in the original shuffle!"); } - return DAG.getNode( - ISD::BITCAST, DL, MVT::v16i8, - DAG.getVectorShuffle(MVT::v8i16, DL, - DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + return DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {// Low half. 0, 16, 1, 17, 2, 18, 3, 19, @@ -8838,19 +9353,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; - SDValue ByteClearMask = - DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, - DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + SDValue ByteClearMask = DAG.getBitcast( + MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); - V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = IsSingleInput ? 
V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { - Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); + Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } @@ -8884,7 +9398,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. - VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); @@ -8901,10 +9415,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. 
- VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); - VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + VLoHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + VHiHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); @@ -9045,8 +9559,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); } - return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV), - DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV)); + return std::make_pair(DAG.getBitcast(SplitVT, LoV), + DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; @@ -9379,12 +9893,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( LaneMask[2 * i + 1] = 2*Lanes[i] + 1; } - V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); + V1 = DAG.getBitcast(LaneVT, V1); + V2 = DAG.getBitcast(LaneVT, V2); SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); // Cast it back to the type we actually want. - LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); + LaneShuffle = DAG.getBitcast(VT, LaneShuffle); // Now do a simple shuffle that isn't lane crossing. SmallVector NewMask; @@ -9413,6 +9927,37 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { return true; } +static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. + // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. 
+ assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); + int NumElts = VT.getVectorNumElements(); + bool ShufpdMask = true; + bool CommutableMask = true; + unsigned Immediate = 0; + for (int i = 0; i < NumElts; ++i) { + if (Mask[i] < 0) + continue; + int Val = (i & 6) + NumElts * (i & 1); + int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); + if (Mask[i] < Val || Mask[i] > Val + 1) + ShufpdMask = false; + if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) + CommutableMask = false; + Immediate |= (Mask[i] % 2) << i; + } + if (ShufpdMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + DAG.getConstant(Immediate, DL, MVT::i8)); + if (CommutableMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, + DAG.getConstant(Immediate, DL, MVT::i8)); + return SDValue(); +} + /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -9477,24 +10022,9 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check if the blend happens to exactly fit that of SHUFPD. 
- if ((Mask[0] == -1 || Mask[0] < 2) && - (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && - (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && - (Mask[3] == -1 || Mask[3] >= 6)) { - unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | - ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); - } - if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && - (Mask[1] == -1 || Mask[1] < 2) && - (Mask[2] == -1 || Mask[2] >= 6) && - (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { - unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | - ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); - } + if (SDValue Op = + lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return Op; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -9556,10 +10086,10 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; } - return DAG.getNode( - ISD::BITCAST, DL, MVT::v4i64, + return DAG.getBitcast( + MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } } @@ -9672,11 +10202,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); if (Subtarget->hasAVX2()) - return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, - DAG.getNode(ISD::BUILD_VECTOR, DL, + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)), - 
V1); + V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -9866,12 +10396,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); } - return DAG.getNode( - ISD::BITCAST, DL, MVT::v16i16, - DAG.getNode( - X86ISD::PSHUFB, DL, MVT::v32i8, - DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + return DAG.getBitcast(MVT::v16i16, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getBitcast(MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v32i8, PSHUFBMask))); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -10011,10 +10540,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); - V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + V1 = DAG.getBitcast(FpVT, V1); + V2 = DAG.getBitcast(FpVT, V2); + return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { @@ -10036,6 +10564,73 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SmallVector VPermMask; + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) + VPermMask.push_back(Mask[i] < 0 ? 
DAG.getUNDEF(MaskEltVT) : + DAG.getConstant(Mask[i], DL, MaskEltVT)); + SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT, + VPermMask); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + bool Unpckl = true; + bool Unpckh = true; + bool UnpcklSwapped = true; + bool UnpckhSwapped = true; + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + + for (int i = 0; i < NumElts ; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + int LoPosSwapped = (LoPos + NumElts) % (NumElts * 2); + int HiPosSwapped = (HiPos + NumElts) % (NumElts * 2); + + if (Mask[i] == -1) + continue; + if (Mask[i] != LoPos) + Unpckl = false; + if (Mask[i] != HiPos) + Unpckh = false; + if (Mask[i] != LoPosSwapped) + UnpcklSwapped = false; + if (Mask[i] != HiPosSwapped) + UnpckhSwapped = false; + if (!Unpckl && !Unpckh && !UnpcklSwapped && !UnpckhSwapped) + return SDValue(); + } + if (Unpckl) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (Unpckh) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + if (UnpcklSwapped) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + if (UnpckhSwapped) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + llvm_unreachable("Unexpected result of UNPCK mask analysis"); + return SDValue(); +} + /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. 
static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, @@ -10047,15 +10642,12 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + SDValue UnpckNode = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG); + if (UnpckNode) + return UnpckNode; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. @@ -10069,22 +10661,12 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + SDValue UnpckNode = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG); + if (UnpckNode) + return UnpckNode; - // FIXME: Implement direct support for this type! 
- return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. @@ -10098,15 +10680,12 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + SDValue UnpckNode = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG); + if (UnpckNode) + return UnpckNode; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. @@ -10120,22 +10699,12 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. 
- 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + SDValue UnpckNode = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG); + if (UnpckNode) + return UnpckNode; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10150,8 +10719,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. @@ -10283,10 +10851,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + V1 = DAG.getBitcast(NewVT, V1); + V2 = DAG.getBitcast(NewVT, V2); + return DAG.getBitcast( + VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); } } @@ -10367,7 +10935,7 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; - // Blend for v16i16 should be symetric for the both lanes. + // Blend for v16i16 should be symmetric for the both lanes. 
for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = @@ -10481,12 +11049,11 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. if (Idx == 0) - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BITCAST, dl, - MVT::v4i32, - Op.getOperand(0)), - Op.getOperand(1))); + return DAG.getNode( + ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, @@ -10510,10 +11077,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, - Op.getOperand(0)), - Op.getOperand(1)); - return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { @@ -10583,12 +11149,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MaskEltVT.getSizeInBits()); Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, - getZeroVector(MaskVT, Subtarget, DAG, dl), - Idx, DAG.getConstant(0, dl, getPointerTy())); + getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, + DAG.getConstant(0, dl, PtrVT)); SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), - Perm, 
DAG.getConstant(0, dl, getPointerTy())); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, + DAG.getConstant(0, dl, PtrVT)); } return SDValue(); } @@ -10613,11 +11180,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(VecVT.is128BitVector() && "Unexpected vector length"); - if (Subtarget->hasSSE41()) { - SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); - if (Res.getNode()) + if (Subtarget->hasSSE41()) + if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - } MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. @@ -10627,8 +11192,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BITCAST, dl, - MVT::v4i32, Vec), + DAG.getBitcast(MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. MVT EltVT = MVT::i32; @@ -10784,8 +11348,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. 
- const Function *F = DAG.getMachineFunction().getFunction(); - bool MinSize = F->hasFnAttribute(Attribute::MinSize); + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -10849,8 +11412,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); assert(OpVT.is128BitVector() && "Expected an SSE type!"); - return DAG.getNode(ISD::BITCAST, dl, OpVT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); + return DAG.getBitcast( + OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in @@ -10905,15 +11468,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false); - if (LD.getNode()) - return LD; + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. 
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } @@ -10983,17 +11556,16 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; - SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), - CP->getAlignment(), - CP->getOffset(), OpFlag); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); - Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. 
if (OpFlag) { - Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, - SDLoc(), getPointerTy()), - Result); + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; @@ -11016,17 +11588,16 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { else if (Subtarget->isPICStyleStubPIC()) OpFlag = X86II::MO_PIC_BASE_OFFSET; - SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), - OpFlag); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); - Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) - Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, - SDLoc(), getPointerTy()), - Result); + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } @@ -11054,25 +11625,26 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { OpFlag = X86II::MO_DARWIN_NONLAZY; } - SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); SDLoc DL(Op); - Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. 
if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { - Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, - SDLoc(), getPointerTy()), - Result); + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } // For symbols that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlag)) - Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -11086,20 +11658,19 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); - SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, - OpFlags); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) - Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else - Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. 
if (isGlobalRelativeToPICBase(OpFlags)) { - Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), - Result); + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; @@ -11113,40 +11684,41 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); CodeModel::Model M = DAG.getTarget().getCodeModel(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. - Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); Offset = 0; } else { - Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) - Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); else - Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { - Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), - Result); + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. 
if (isGlobalStubReference(OpFlags)) - Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) - Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, - DAG.getConstant(Offset, dl, getPointerTy())); + Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, + DAG.getConstant(Offset, dl, PtrVT)); return Result; } @@ -11297,7 +11869,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -11310,22 +11883,25 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget->is64Bit()) - return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); - return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); + return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: - return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), + return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: - return LowerToTLSExecModel( - 
GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), - DAG.getTarget().getRelocationModel() == Reloc::PIC_); + return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(), + DAG.getTarget().getRelocationModel() == + Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -11348,13 +11924,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); - SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) - Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, - SDLoc(), getPointerTy()), + Offset = DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right @@ -11371,8 +11946,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; - return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), - Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget->isTargetKnownWindowsMSVC() || @@ -11400,50 +11974,50 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { : Type::getInt32PtrTy(*DAG.getContext(), 257)); - SDValue TlsArray = - Subtarget->is64Bit() - ? DAG.getIntPtrConstant(0x58, dl) - : (Subtarget->isTargetWindowsGNU() - ? DAG.getIntPtrConstant(0x2C, dl) - : DAG.getExternalSymbol("_tls_array", getPointerTy())); + SDValue TlsArray = Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58, dl) + : (Subtarget->isTargetWindowsGNU() + ? 
DAG.getIntPtrConstant(0x2C, dl) + : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = - DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, - MachinePointerInfo(Ptr), false, false, false, 0); + DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, + false, false, 0); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable - SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); + SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget->is64Bit()) - IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32, false, false, false, 0); else - IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), - false, false, false, 0); + IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false, + false, false, 0); - SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl, - getPointerTy()); - IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + auto &DL = DAG.getDataLayout(); + SDValue Scale = + DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); + IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); - res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); + res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } - res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), - false, false, false, 0); + res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false, + false, 0); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); - SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add 
of the thread // pointer with the offset of the variable. - return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); + return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); @@ -11505,15 +12079,21 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(SrcVT))); + } if (SrcVT.getVectorElementType() == MVT::i1) { MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, - Op.getOperand(0))); + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); } return SDValue(); } @@ -11532,12 +12112,13 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -11559,10 +12140,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue 
Chain, MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); - MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); @@ -11582,21 +12162,22 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueType().getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + auto PtrVT = getPointerTy(MF.getDataLayout()); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); - Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, false, 0); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); } return Result; @@ -11624,7 +12205,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // Build some magic constants. 
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); - SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( @@ -11634,22 +12216,23 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); - SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); + SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), - CLod0); - - SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue Unpck1 = + getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); + + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; @@ -11657,12 +12240,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { - SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); + SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, S2F, 0x4E, DAG); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), - Sub); + DAG.getBitcast(MVT::v2f64, Shuffle), Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, @@ -11685,20 +12267,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), + DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. - SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - MVT::v2f64, Load)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - MVT::v2f64, Bias))); - Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), - DAG.getIntPtrConstant(0, dl)); + SDValue Or = DAG.getNode( + ISD::OR, dl, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); + Or = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -11777,19 +12358,16 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, if (Subtarget.hasSSE41()) { EVT VecI16VT = Is128 ? 
MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); - SDValue VecCstLowBitcast = - DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); - SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); + SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); - SDValue VecCstHighBitcast = - DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); - SDValue VecShiftBitcast = - DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); + SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. 
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, @@ -11815,11 +12393,11 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, makeArrayRef(&CstFAddArray[0], NumElts)); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); - SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); + SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; - SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); + SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } @@ -11856,6 +12434,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); @@ -11878,9 +12457,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { - SDValue WordOff = DAG.getConstant(4, dl, getPointerTy()); - SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, - getPointerTy(), StackSlot, WordOff); + SDValue WordOff = DAG.getConstant(4, dl, PtrVT); + SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo(), false, false, 0); @@ -11901,10 +12479,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) 
int SSFI = cast(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, 8, 8); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; @@ -11914,42 +12491,72 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. - SDValue SignSet = DAG.getSetCC(dl, - getSetCCResultType(*DAG.getContext(), MVT::i64), - Op.getOperand(0), - DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + SDValue SignSet = DAG.getSetCC( + dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), + Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( - ConstantInt::get(*DAG.getContext(), FF.zext(64)), - getPointerTy()); + ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, Zero, Four); - FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); + FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? 
- SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), - FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, 4); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an f16 source (which needs to be promoted to f32), +// just return an pair. +// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a pair. +// Otherwise we lower it to a sequence ending with a FIST, return a +// pair, and the caller is responsible for loading +// the final integer result from StackSlot. std::pair -X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { + if (TheVT == MVT::f16) + // We need to promote the f16 to f32 before using the lowering + // in this routine. + return std::make_pair(SDValue(), SDValue()); + + assert((TheVT == MVT::f32 || + TheVT == MVT::f64 || + TheVT == MVT::f80) && + "Unexpected FP operand type in FP_TO_INTHelper"); + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. 
A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } @@ -11967,71 +12574,140 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 either into FISTP64 followed by a load from a temporary - // stack slot, or into the FTOL runtime function. + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; - if (!IsSigned && isIntegerTypeFTOL(DstTy)) - Opc = X86ISD::WIN_FTOL; - else - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Adjust; // 0x0 or 
0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. + // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. + + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it 
is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); - StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); + + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); + + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. 
+ // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } - if (Opc != X86ISD::WIN_FTOL) { + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); - } else { - SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Chain, Value); - SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, - MVT::i32, ftol.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, - MVT::i32, eax.getValue(2)); - SDValue Ops[] = { eax, edx }; - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) - : DAG.getMergeValues(Ops, DL); - return std::make_pair(pair, SDValue()); } } @@ -12075,20 +12751,20 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); - OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + OpLo = DAG.getBitcast(HVT, OpLo); + OpHi = DAG.getBitcast(HVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16) + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) @@ -12109,11 +12785,9 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); - if (Res.getNode()) + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; - } return SDValue(); } @@ -12126,13 +12800,11 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, MVT SVT = In.getSimpleValueType(); if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) - return LowerZERO_EXTEND_AVX512(Op, DAG); + return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); - if (Subtarget->hasFp256()) { - SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); - if (Res.getNode()) + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; - } assert(!VT.is256BitVector() || !SVT.is128BitVector() || VT.getVectorNumElements() != 
SVT.getVectorNumElements()); @@ -12161,22 +12833,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI()) return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) + if ((InVT.is256BitVector() || InVT.is128BitVector()) && InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && Subtarget->hasVLX()) return Op; // legal, will go to VPMOVB2M, VPMOVW2M if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI()) return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) + if ((InVT.is256BitVector() || InVT.is128BitVector()) && InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && Subtarget->hasVLX()) return Op; // legal, will go to VPMOVB2M, VPMOVQ2M } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if (VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); unsigned NumElts = InVT.getVectorNumElements(); assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); @@ -12192,11 +12862,16 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); } + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (((!InVT.is512BitVector() && Subtarget->hasVLX()) || InVT.is512BitVector()) && + (InVT.getVectorElementType() != MVT::i16 || Subtarget->hasBWI())) + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. 
if (Subtarget->hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; - In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); + In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, @@ -12207,8 +12882,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); - OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } @@ -12216,7 +12891,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 
if (Subtarget->hasInt256()) { - In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); + In = DAG.getBitcast(MVT::v32i8, In); SmallVector pshufbMask; for (unsigned i = 0; i < 2; ++i) { @@ -12233,14 +12908,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); - In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); + In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), &ShufMask[0]); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::BITCAST, DL, VT, In); + return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, @@ -12249,8 +12924,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); - OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); + OpLo = DAG.getBitcast(MVT::v16i8, OpLo); + OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, @@ -12260,13 +12935,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); - OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); - return 
DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); + return DAG.getBitcast(MVT::v8i16, res); } // Handle truncation of V256 to V128 using shuffles. @@ -12282,8 +12957,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; - SDValue V = DAG.getVectorShuffle(NVT, DL, - DAG.getNode(ISD::BITCAST, DL, NVT, In), + SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), DAG.getUNDEF(NVT), &MaskVec[0]); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); @@ -12297,7 +12971,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) return Op; + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12314,7 +12989,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, std::pair Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; - assert(FIST.getNode() && "Unexpected failure"); + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12354,24 +13031,29 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { if (User->getOpcode() == ISD::FNEG) return Op; - SDValue Op0 = Op.getOperand(0); - bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); - SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - // Assume scalar op for initialization; update for vector if needed. - // Note that there are no scalar bitwise logical SSE/AVX instructions, so we - // generate a 16-byte vector constant and logic op even for the scalar case. 
- // Using a 16-byte mask allows folding the load of the mask with - // the logic op, so it can save (~4 bytes) on code size. - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. + + MVT LogicVT; + MVT EltVT; + unsigned NumElts; + if (VT.isVector()) { + LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else { + // There are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + EltVT = VT; + NumElts = (VT == MVT::f64) ? 2 : 4; } unsigned EltBits = EltVT.getSizeInBits(); @@ -12382,29 +13064,28 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); - if (VT.isVector()) { - // For a vector, cast operands to a vector type, perform the logic op, - // and cast the result back to the original value type. 
- MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); - SDValue Operand = IsFNABS ? - DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : - DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); - unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); - } - - // If not vector, then scalar. - unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + unsigned LogicOp = + IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - return DAG.getNode(BitOp, dl, VT, Operand, Mask); + + if (VT.isVector()) + return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + + // For the scalar case extend to a 128-bit vector, perform the logic op, + // and extract the scalar result back out. 
+ Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); + SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12442,11 +13123,19 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { CV[0] = ConstantFP::get(*Context, APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); - SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); + + // Perform all logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. This allows load folding of the + // constants into the logic instructions. + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). // If it's a constant, we can clear it here. @@ -12454,7 +13143,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. 
if (APF.isPosZero()) - return SignBit; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12463,16 +13153,20 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); - CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); - SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + CPIdx = DAG.getConstantPool(C, PtrVT, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. - if (!isa(Op0)) - Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); - + if (!isa(Op0)) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + } // OR the magnitude value with the sign bit. - return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); + Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12563,7 +13257,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) - VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); + VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vectors are evaluated, OR them first before PTEST. 
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { @@ -12737,7 +13431,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); DAG.ReplaceAllUsesWith(Op, New); - DAG.RemoveDeadNode(Op.getNode()); Op = New; } break; @@ -12851,8 +13544,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -12898,29 +13590,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { - // FIXME: We should use instruction latency models to calculate the cost of - // each potential sequence, but this is very hard to do reliably because - // at least Intel's Core* chips have variable timing based on the number of - // significant digits in the divisor and/or sqrt operand. - if (!Subtarget->useSqrtEst()) - return SDValue(); - EVT VT = Op.getValueType(); + const char *RecipOp; - // SSE1 has rsqrtss and rsqrtps. + // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
- if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget->hasAVX() && VT == MVT::v8f32)) { - RefinementSteps = 1; - UseOneConstNR = false; - return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); - } - return SDValue(); + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "sqrtf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-sqrtf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } /// The minimum architected relative accuracy is 2^-12. We need one @@ -12928,14 +13622,8 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, SDValue X86TargetLowering::getRecipEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { - // FIXME: We should use instruction latency models to calculate the cost of - // each potential sequence, but this is very hard to do reliably because - // at least Intel's Core* chips have variable timing based on the number of - // significant digits in the divisor. - if (!Subtarget->useReciprocalEst()) - return SDValue(); - EVT VT = Op.getValueType(); + const char *RecipOp; // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). @@ -12944,12 +13632,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
- if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget->hasAVX() && VT == MVT::v8f32)) { - RefinementSteps = ReciprocalEstimateRefinementSteps; - return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); - } - return SDValue(); + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "divf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-divf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } /// If we have at least two divisions that use the same divisor, convert to @@ -12958,8 +13654,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. 
-bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { - return NumUsers > 1; +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; } static bool isAllOnes(SDValue V) { @@ -13129,13 +13825,13 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(-1, dl, VT)); switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: - // (x != y) -> ~(x ^ y) + case ISD::SETEQ: + // (x == y) -> ~(x ^ y) return DAG.getNode(ISD::XOR, dl, VT, DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), DAG.getConstant(-1, dl, VT)); - case ISD::SETEQ: - // (x == y) -> (x ^ y) + case ISD::SETNE: + // (x != y) -> (x ^ y) return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); case ISD::SETUGT: case ISD::SETGT: @@ -13329,8 +14025,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (hasMinMax) { switch (SetCCOpcode) { default: break; - case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; - case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; + case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } @@ -13380,8 +14076,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, assert(Subtarget->hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. - Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); - Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
The lower @@ -13415,7 +14111,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); - return DAG.getNode(ISD::BITCAST, dl, VT, Result); + return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { @@ -13424,8 +14120,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. - Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); - Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); @@ -13438,7 +14134,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); - return DAG.getNode(ISD::BITCAST, dl, VT, Result); + return DAG.getBitcast(VT, Result); } } @@ -13635,7 +14331,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); EVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; - VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp); + VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); @@ -13648,26 +14344,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - if (VT.isVector() && VT.getScalarType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getNode(ISD::BITCAST, DL, VT, newSelect); - SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); + if (VT.isVector() && VT.getScalarType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return 
DAG.getBitcast(VT, newSelect); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); } } @@ -13804,9 +14500,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate if the high bits are known zero. + // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -13866,7 +14562,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); @@ -13892,7 +14589,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16) + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { @@ -13915,6 +14612,62 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT VT = Op->getSimpleValueType(0); + MVT InVT = In.getSimpleValueType(); + assert(VT.getSizeInBits() == InVT.getSizeInBits()); + + MVT InSVT = InVT.getScalarType(); + assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); 
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + SDLoc dl(Op); + + // SSE41 targets can use the pmovsx* instructions directly. + if (Subtarget->hasSSE41()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. + SDValue Curr = In; + MVT CurrVT = InVT; + + // As SRAI is only available on i16/i32 types, we expand only up to i32 + // and handle i64 separately. + while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); + Curr = DAG.getBitcast(CurrVT, Curr); + } + + SDValue SignExt = Curr; + if (CurrVT != InVT) { + unsigned SignExtShift = + CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(SignExtShift, dl, MVT::i8)); + } + + if (CurrVT == VT) + return SignExt; + + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(31, dl, MVT::i8)); + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); + return DAG.getBitcast(VT, Ext); + } + + return SDValue(); +} + static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); @@ -14092,8 +14845,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = - DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { @@ -14118,7 +14871,7 @@ static 
SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); + SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { @@ -14143,7 +14896,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, SDValue Shuff = DAG.getVectorShuffle( WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + Shuff = DAG.getBitcast(RegVT, Shuff); // Build the arithmetic shift. unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - @@ -14165,7 +14918,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); // Bitcast to the requested type. - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + Shuff = DAG.getBitcast(RegVT, Shuff); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } @@ -14533,7 +15286,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = getPointerTy(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -14550,8 +15303,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, "have nested arguments."); } - const TargetRegisterClass *AddrRegClass = - getRegClassFor(getPointerTy()); + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -14586,16 +15338,17 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = 
DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. - SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } @@ -14615,8 +15368,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store fp_offset - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(4, DL)); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), @@ -14624,20 +15376,16 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to overflow_arg_area - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(4, DL)); - SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); + SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8), false, false, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. 
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(8, DL)); - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL)); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); @@ -14647,10 +15395,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && - "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. + return DAG.expandVAArg(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); @@ -14659,7 +15410,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. @@ -14678,8 +15429,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. 
assert(!Subtarget->useSoftFloat() && - !(DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::NoImplicitFloat)) && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -14688,7 +15438,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; - SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); + SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), @@ -14708,8 +15458,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); @@ -14849,13 +15605,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, MVT EltVT = VT.getVectorElementType(); EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); - ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); + ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. 
+/// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, @@ -14863,8 +15619,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + SDValue VMask = SDValue(); + unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); @@ -14872,11 +15628,20 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, if (isAllOnes(Mask)) return Op; - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + if (MaskVT.bitsGT(Mask.getValueType())) { + EVT newMaskVT = EVT::getIntegerVT(*DAG.getContext(), + MaskVT.getSizeInBits()); + VMask = DAG.getBitcast(MaskVT, + DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); + } else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } switch (Op.getOpcode()) { default: break; @@ -14885,19 +15650,27 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, case X86ISD::CMPM: case X86ISD::CMPMU: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. 
For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; } if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is comming as MVT::i8 and it should be truncated +/// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for -/// a scalar instruction. +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, @@ -14915,6 +15688,60 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } +static int getSEHRegistrationNodeSize(const Function *Fn) { + if (!Fn->hasPersonalityFn()) + report_fatal_error( + "querying registration node size for function without personality"); + // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See + // WinEHStatePass for the full struct definition. 
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) { + case EHPersonality::MSVC_X86SEH: return 24; + case EHPersonality::MSVC_CXX: return 16; + default: break; + } + report_fatal_error("can only recover FP for MSVC EH personality functions"); +} + +/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// function or when returning to a parent frame after catching an exception, we +/// recover the parent frame pointer by doing arithmetic on the incoming EBP. +/// Here's the math: +/// RegNodeBase = EntryEBP - RegNodeSize +/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// Subtracting RegNodeSize takes us to the offset of the registration node, and +/// subtracting the offset (negative on x86) takes us back to the parent FP. +static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, + SDValue EntryEBP) { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + + // It's possible that the parent function no longer has a personality function + // if the exceptional code was optimized away, in which case we just return + // the incoming EBP. + if (!Fn->hasPersonalityFn()) + return EntryEBP; + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); + + // Get an MCSymbol that will ultimately resolve to the frame offset of the EH + // registration. 
+ MCSymbol *OffsetSym = + MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); + SDValue RegNodeFrameOffset = + DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + + // RegNodeBase = EntryEBP - RegNodeSize + // ParentFP = RegNodeBase - RegNodeFrameOffset + SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, + DAG.getConstant(RegNodeSize, dl, PtrVT)); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); +} + static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -14928,17 +15755,59 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_4OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); - SDValue Src0 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue RoundingMode = Op.getOperand(4); + SDValue RoundingMode; + // We allways add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. 
+ if (Op.getNumOperands() == 4) + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + else + RoundingMode = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) + if (cast(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), Src, RoundingMode), + Mask, PassThru, Subtarget, DAG); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), - Mask, Src0, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_1OP_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); @@ -14946,7 +15815,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: - // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. 
if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); @@ -14985,11 +15854,101 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1,Src2), Mask, PassThru, Subtarget, DAG); } + case INTR_TYPE_2OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible modes for intrinsics, with/without rounding modes. + // First, we check if the intrinsic have rounding mode (6 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 6) + Rnd = Op.getOperand(5); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". 
+ SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(6); + unsigned Round = cast(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3), + Mask, PassThru, Subtarget, DAG); + } + case VPERM_3OP_MASKZ: + case VPERM_3OP_MASK: + case FMA_OP_MASK3: + case FMA_OP_MASKZ: case FMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + EVT VT = Op.getValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else if (IntrData->Type == FMA_OP_MASK3) + PassThru = Src3; + else + PassThru = Src1; + // We specify 2 possible opcodes for intrinsics with rounding modes. 
// First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -15001,12 +15960,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), - Mask, Src1, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), - Mask, Src1, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); } case CMP_MASK: case CMP_MASK_CC: { @@ -15056,7 +16015,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, DAG.getIntPtrConstant(0, dl)); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + return DAG.getBitcast(Op.getValueType(), Res); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -15085,18 +16044,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); if (isAllOnes(Mask)) // return data as is return Op.getOperand(1); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, - PassThru); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + DataToCompress), + Mask, PassThru, Subtarget, DAG); } case BLEND: { SDValue Mask = Op.getOperand(3); @@ -15107,7 +16058,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const 
X86Subtarget *Subtarget Mask.getValueType().getSizeInBits()); SDLoc dl(Op); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); @@ -15127,16 +16078,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); - case Intrinsic::x86_avx512_mask_valign_q_512: - case Intrinsic::x86_avx512_mask_valign_d_512: - // Vector source operands are swapped. - return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, - Op.getValueType(), Op.getOperand(2), - Op.getOperand(1), - Op.getOperand(3)), - Op.getOperand(5), Op.getOperand(4), - Subtarget, DAG); - // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -15205,8 +16146,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; - SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); @@ -15289,6 +16230,44 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + + case Intrinsic::x86_seh_lsda: { + // Compute the symbol for the LSDA. We know it'll get emitted later. + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op1 = Op.getOperand(1); + auto *Fn = cast(cast(Op1)->getGlobal()); + MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + + // Generate a simple absolute symbol reference. This intrinsic is only + // supported on 32-bit Windows, which isn't PIC. + SDValue Result = DAG.getMCSymbol(LSDASym, VT); + return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); + } + + case Intrinsic::x86_seh_recoverfp: { + SDValue FnOp = Op.getOperand(1); + SDValue IncomingFPOp = Op.getOperand(2); + GlobalAddressSDNode *GSD = dyn_cast(FnOp); + auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.x86.seh.recoverfp must take a function as the first argument"); + return recoverFramePointer(DAG, Fn, IncomingFPOp); + } + + case Intrinsic::localaddress: { + // Returns one of the stack, base, or frame pointer registers, depending on + // which is used to reference local variables. 
+ MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned Reg; + if (RegInfo->hasBasePointer(MF)) + Reg = RegInfo->getBaseRegister(); + else // This function handles the SP or FP case. + Reg = RegInfo->getPtrSizedFrameRegister(MF); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); + } } } @@ -15298,7 +16277,12 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, const X86Subtarget * Subtarget) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); - assert(C && "Invalid scale type"); + if (!C) + llvm_unreachable("Invalid scale type"); + unsigned ScaleVal = C->getZExtValue(); + if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) + llvm_unreachable("Valid scale values are 1, 2, 4, 8"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -15306,8 +16290,16 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -15324,7 +16316,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); - assert(C && "Invalid scale type"); + if (!C) + llvm_unreachable("Invalid scale type"); + unsigned ScaleVal = C->getZExtValue(); + if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) + llvm_unreachable("Valid scale values are 1, 2, 4, 8"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -15334,8 +16331,16 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, ConstantSDNode *MaskC = dyn_cast(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -15358,7 +16363,7 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else - MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + MaskInReg = DAG.getBitcast(MaskVT, Mask); //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); @@ -15473,14 +16478,115 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } +static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + const Function *Fn = MF.getFunction(); + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + + assert(Subtarget->getFrameLowering()->hasFP(MF) && + "using llvm.x86.seh.restoreframe requires a frame pointer"); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT VT = TLI.getPointerTy(DAG.getDataLayout()); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + unsigned SPReg = RegInfo->getStackRegister(); + unsigned SlotSize = RegInfo->getSlotSize(); + + // Get incoming EBP. + SDValue IncomingEBP = + DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + + // SP is saved in the first field of every registration node, so load + // [EBP-RegNodeSize] into SP. 
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn); + SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, + DAG.getConstant(-RegNodeSize, dl, VT)); + SDValue NewSP = + DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, + false, VT.getScalarSizeInBits() / 8); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); + + if (!RegInfo->needsStackRealignment(MF)) { + // Adjust EBP to point back to the original frame position. + SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); + Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); + } else { + assert(RegInfo->hasBasePointer(MF) && + "functions with Win32 EH must use frame or base pointer register"); + + // Reload the base pointer (ESI) with the adjusted incoming EBP. + SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); + Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); + + // Reload the spilled EBP value, now that the stack and base pointers are + // set up. + X86MachineFunctionInfo *X86FI = MF.getInfo(); + X86FI->setHasSEHFramePtrSave(true); + int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setSEHFramePtrSaveIndex(FI); + SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), + MachinePointerInfo(), false, false, false, + VT.getScalarSizeInBits() / 8); + Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); + } + + return Chain; +} + +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + EVT VT = DataToTruncate.getValueType(); + EVT SVT = EVT::getVectorVT(*DAG.getContext(), + ElementType, VT.getVectorNumElements()); + + if (isAllOnes(Mask)) // return just a truncate store + return 
DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); + + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); +} static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); - if (!IntrData) + if (!IntrData) { + if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) + return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); return SDValue(); + } SDLoc dl(Op); switch(IntrData->Type) { @@ -15591,49 +16697,44 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); + EVT VT = DataToCompress.getValueType(); if (isAllOnes(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, - MachinePointerInfo(), false, false, 0); - - EVT VT = DataToCompress.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); 
+ MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); - SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, - DataToCompress, DAG.getUNDEF(VT)); + SDValue Compressed = + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), + Mask, DAG.getUNDEF(VT), Subtarget, DAG); return DAG.getStore(Chain, dl, Compressed, Addr, - MachinePointerInfo(), false, false, 0); - } + MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); + } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); - SDValue PathThru = Op.getOperand(3); + SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); EVT VT = Op.getValueType(); if (isAllOnes(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, - false, 0); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + false, VT.getScalarSizeInBits()/8); SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), - false, false, false, 0); + false, false, false, + VT.getScalarSizeInBits()/8); SDValue Results[] = { - DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru), - Chain}; + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), + Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } } @@ -15649,7 +16750,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue 
Op, unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -15708,14 +16809,36 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned X86TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const MachineFunction &MF = DAG.getMachineFunction(); + unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) + .Case("ebp", X86::EBP) + .Case("rbp", X86::RBP) .Default(0); + + if (Reg == X86::EBP || Reg == X86::RBP) { + if (!TFI.hasFP(MF)) + report_fatal_error("register " + StringRef(RegName) + + " is allocatable: function has no frame pointer"); +#ifndef NDEBUG + else { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && + "Invalid Frame Register!"); + } +#endif + } + if (Reg) return Reg; + report_fatal_error("Invalid register name global variable"); } @@ -15731,7 +16854,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Handler = Op.getOperand(2); SDLoc dl (Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || @@ -15752,6 +16875,25 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 
DAG.getRegister(StoreAddrReg, PtrVT)); } +SDValue X86TargetLowering::LowerCATCHRET(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Dest = Op.getOperand(1); + SDLoc DL(Op); + + MVT PtrVT = getPointerTy(DAG.getDataLayout()); + unsigned ReturnReg = (PtrVT == MVT::i64 ? X86::RAX : X86::EAX); + + // Load the address of the destination block. + MachineBasicBlock *DestMBB = cast(Dest)->getBasicBlock(); + SDValue BlockPtr = DAG.getMCSymbol(DestMBB->getSymbol(), PtrVT); + unsigned WrapperKind = + Subtarget->isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; + SDValue WrappedPtr = DAG.getNode(WrapperKind, DL, PtrVT, BlockPtr); + Chain = DAG.getCopyToReg(Chain, DL, ReturnReg, WrappedPtr); + return DAG.getNode(X86ISD::CATCHRET, DL, MVT::Other, Chain, + DAG.getRegister(ReturnReg, PtrVT)); +} + SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -15863,9 +17005,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. 
- InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" @@ -15950,11 +17094,12 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + SDValue StackSlot = + DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, 2, 2); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, @@ -16099,6 +17244,9 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -16106,6 +17254,16 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ 
-16117,6 +17275,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + if (VT == MVT::i1) + return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); + // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntArith(Op, DAG); @@ -16163,8 +17324,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, -1, 4, -1, 5, -1, 6, -1, 7}; ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo); - BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo); + ALo = DAG.getBitcast(ExVT, ALo); + BLo = DAG.getBitcast(ExVT, BLo); ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); } @@ -16183,8 +17344,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, -1, 12, -1, 13, -1, 14, -1, 15}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi); - BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi); + AHi = DAG.getBitcast(ExVT, AHi); + BHi = DAG.getBitcast(ExVT, BHi); AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); } @@ -16212,8 +17373,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); - Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); - Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); + Evens = DAG.getBitcast(VT, Evens); + Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. 
@@ -16238,20 +17399,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + SDValue AhiBlo = Ahi; + SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; - A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); - B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); - Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); - Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); + A = DAG.getBitcast(MulVT, A); + B = DAG.getBitcast(MulVT, B); + Ahi = DAG.getBitcast(MulVT, Ahi); + Bhi = DAG.getBitcast(MulVT, Bhi); SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); - SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); - SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); - - AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); - AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + // After shifting right const values the result may be all-zero. 
+ if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + } + if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + } SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); @@ -16296,7 +17463,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), - getPointerTy()); + getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) @@ -16306,7 +17473,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); - return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); + return DAG.getBitcast(VT, CallInfo.first); } static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, @@ -16344,12 +17511,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> - SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> - SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); + SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. 
SDValue Highs, Lows; @@ -16368,9 +17533,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget->hasSSE41()) { - SDValue ShAmt = - DAG.getConstant(31, dl, - DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + SDValue ShAmt = DAG.getConstant( + 31, dl, + DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, @@ -16386,18 +17551,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } -// Return true if the requred (according to Opcode) shift-imm form is natively +// Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget -static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, +static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; - + if (VT.is512BitVector() && (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) return true; - bool LShift = VT.is128BitVector() || + bool LShift = VT.is128BitVector() || (VT.is256BitVector() && Subtarget->hasInt256()); bool AShift = LShift && (Subtarget->hasVLX() || @@ -16406,16 +17571,16 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, } // The shift amount is a variable, but it is the same for all vector lanes. -// These instrcutions are defined together with shift-immediate. -static -bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, +// These instructions are defined together with shift-immediate. 
+static +bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } -// Return true if the requred (according to Opcode) variable-shift form is +// Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget -static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, +static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) @@ -16443,6 +17608,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); + MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + SDValue Ex = DAG.getBitcast(ExVT, R); + + if (ShiftAmt >= 32) { + // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. + SDValue Upper = + getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); + SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt - 32, DAG); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {9, 1, 11, 3, 13, 5, 15, 7}); + } else { + // SRA upper i32, SHL whole i64 and select lower i32. 
+ SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt, DAG); + SDValue Lower = + getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); + Lower = DAG.getBitcast(ExVT, Lower); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {8, 1, 10, 3, 12, 5, 14, 7}); + } + return DAG.getBitcast(VT, Ex); + }; + // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { @@ -16451,15 +17648,24 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + // i64 SRA needs to be performed as partial shifts. + if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); + if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); if (Op.getOpcode() == ISD::SHL) { + // Simple i8 add case + if (ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); - SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. SmallVector V( NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); @@ -16470,7 +17676,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); - SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. 
SmallVector V( NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); @@ -16479,12 +17685,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 + // ashr(R, 7) === cmp_slt(R, 0) SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } - // R s>> a === ((R u>> a) ^ m) - m + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector V(NumElts, DAG.getConstant(128 >> ShiftAmt, dl, @@ -16501,36 +17707,57 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. + int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 
0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { - ConstantSDNode *C = dyn_cast(Amt.getOperand(i)); + ConstantSDNode *C = dyn_cast(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } - // Check remaining shift amounts. - for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { - uint64_t ShAmt = 0; - for (unsigned j = 0; j != Ratio; ++j) { - ConstantSDNode *C = - dyn_cast(Amt.getOperand(i + j)); - if (!C) + + // Check remaining shift amounts (if not a splat). + if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) return SDValue(); - // 6 == Log2(64) - ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } - if (ShAmt != ShiftAmt) - return SDValue(); } - return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); } return SDValue(); @@ -16612,7 +17839,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } - return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } @@ -16647,6 +17876,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, 
Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. @@ -16686,7 +17928,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, dl, VT)); - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getBitcast(MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } @@ -16756,57 +17998,258 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) CastVT = MVT::v2i64; - SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); - SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); + SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); + SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, BitCast1, DAG); - return DAG.getNode(ISD::BITCAST, dl, VT, Result); + return DAG.getBitcast(VT, Result); } } - if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - // Turn 'a' into a mask suitable for VSELECT: a = a << 5; - Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT)); - - SDValue VSelM = DAG.getConstant(0x80, dl, VT); - SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - - // r = 
VSELECT(r, shl(r, 4), a); - SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT)); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); - - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - - // r = VSELECT(r, shl(r, 2), a); - M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT)); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + // v4i32 Non Uniform Shifts. + // If the shift amount is constant we can shift each lane using the SSE2 + // immediate shifts, else we need to zero-extend each lane to the lower i64 + // and shift using the SSE2 variable shifts. + // The separate results can then be blended together. + if (VT == MVT::v4i32) { + unsigned Opc = Op.getOpcode(); + SDValue Amt0, Amt1, Amt2, Amt3; + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); + } else { + // ISD::SHL is handled above but we include it here for completeness. + switch (Opc) { + default: + llvm_unreachable("Unknown target vector shift node"); + case ISD::SHL: + Opc = X86ISD::VSHL; + break; + case ISD::SRL: + Opc = X86ISD::VSRL; + break; + case ISD::SRA: + Opc = X86ISD::VSRA; + break; + } + // The SSE2 shifts use the lower i64 as the same shift amount for + // all lanes and the upper i64 is ignored. These shuffle masks + // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. 
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); + } + + SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); + SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); + SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); + SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); + SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); + return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); + } + + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } + // On pre-SSE41 targets we test for the sign bit by comparing to + // zero - a negative value will set all bits of the lanes to true + // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. 
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); + SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); + return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + }; - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); - OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; + // We can safely do this using i16 shifts as we're only interested in + // the 3 lower bits of each byte. + Amt = DAG.getBitcast(ExtVT, Amt); + Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); + Amt = DAG.getBitcast(VT, Amt); + + if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { + // r = VSELECT(r, shift(r, 4), a); + SDValue M = + DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + return R; + } - // return VSELECT(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, - DAG.getNode(ISD::ADD, dl, VT, R, R), R); - return R; + if (Op->getOpcode() == ISD::SRA) { + // For SRA we need to unpack each byte to the higher byte of a i16 vector + // so we can correctly sign extend. We don't care what happens to the + // lower byte. 
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + + // r = VSELECT(r, shift(r, 4), a); + SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(4, dl, ExtVT)); + SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(4, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 2), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(2, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(2, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 1), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(1, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(1, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // Logical shift the result back to the lower byte, leaving a zero upper + // byte + // meaning that we can safely pack with PACKUSWB. 
+ RLo = + DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); + RHi = + DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } } // It's worth extending once and using the v8i32 shifts for 16-bit types, but // the extra overheads to get from v16i8 to v8i32 make the existing SSE // solution better. if (Subtarget->hasInt256() && VT == MVT::v8i16) { - MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16; + MVT ExtVT = MVT::v8i32; unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - R = DAG.getNode(ExtOpc, dl, NewVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + + if (Subtarget->hasInt256() && VT == MVT::v16i16) { + MVT ExtVT = MVT::v8i32; + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); + Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); + Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + } + + if (VT == MVT::v8i16) { + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that 
VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); + V0 = DAG.getBitcast(ExtVT, V0); + V1 = DAG.getBitcast(ExtVT, V1); + Sel = DAG.getBitcast(ExtVT, Sel); + return DAG.getBitcast( + VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + } + // On pre-SSE41 targets we splat the sign bit - a negative value will + // set all bits of the lanes to true and VSELECT uses that in + // its OR(AND(V0,C),AND(V1,~C)) lowering. + SDValue C = + DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); + return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 12; + if (Subtarget->hasSSE41()) { + // On SSE41 targets we need to replicate the shift mask in both + // bytes for PBLENDVB. + Amt = DAG.getNode( + ISD::OR, dl, VT, + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); + } else { + Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); + } + + // r = VSELECT(r, shift(r, 8), a); + SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 4), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(Amt, M, R); + return R; } // Decompose 256-bit shifts into smaller 128-bit shifts. 
@@ -16930,7 +18373,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) @@ -16955,7 +18398,7 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLoweringBase::AtomicRMWExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. @@ -17001,7 +18444,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. @@ -17029,7 +18472,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear - // otherwise, we might be able to be more agressive on relaxed idempotent + // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. 
if (SynchScope == SingleThread) @@ -17160,7 +18603,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); - SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); + SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0, dl)); } @@ -17182,141 +18625,241 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } -static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); +/// Compute the horizontal sum of bytes in V for the elements of VT. +/// +/// Requires V to be a byte vector and VT to be an integer vector type with +/// wider elements than V's type. The width of the elements of VT determines +/// how many bytes of V are summed horizontally to produce each element of the +/// result. +static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(V); + MVT ByteVecVT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + int NumElts = VT.getVectorNumElements(); + assert(ByteVecVT.getVectorElementType() == MVT::i8 && + "Expected value to have byte element type."); + assert(EltVT != MVT::i8 && + "Horizontal byte sum only makes sense for wider elements!"); + unsigned VecSize = VT.getSizeInBits(); + assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); + + // PSADBW instruction horizontally add all bytes and leave the result in i64 + // chunks, thus directly computes the pop count for v2i64 and v4i64. 
+ if (EltVT == MVT::i64) { + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + return DAG.getBitcast(VT, V); + } + + if (EltVT == MVT::i32) { + // We unpack the low half and high half into i32s interleaved with zeros so + // that we can use PSADBW to horizontally sum them. The most useful part of + // this is that it lines up the results of two PSADBW instructions to be + // two v2i64 vectors which concatenated are the 4 population counts. We can + // then use PACKUSWB to shrink and concatenate them into a v4i32 again. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); + SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); + SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); + + // Do the horizontal sums into two v2i64s. + Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getBitcast(ByteVecVT, Low), Zeros); + High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getBitcast(ByteVecVT, High), Zeros); + + // Merge them together. + MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); + V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, + DAG.getBitcast(ShortVecVT, Low), + DAG.getBitcast(ShortVecVT, High)); + + return DAG.getBitcast(VT, V); + } + + // The only element type left is i16. + assert(EltVT == MVT::i16 && "Unknown how to handle type"); + + // To obtain pop count for each i16 element starting from the pop count for + // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s + // right by 8. It is important to shift as i16s as i8 vector shift isn't + // directly supported. 
+ SmallVector Shifters(NumElts, DAG.getConstant(8, DL, EltVT)); + SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter); + V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), + DAG.getBitcast(ByteVecVT, V)); + return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter); +} + +static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned VecSize = VT.getSizeInBits(); - Op = Op.getOperand(0); - EVT VT = Op.getValueType(); - assert((VT.is128BitVector() || VT.is256BitVector()) && - "CTPOP lowering only implemented for 128/256-bit wide vector types"); + // Implement a lookup table in register by using an algorithm based on: + // http://wm.ite.pl/articles/sse-popcount.html + // + // The general idea is that every lower byte nibble in the input vector is an + // index into a in-register pre-computed pop count table. We then split up the + // input vector in two new ones: (1) a vector with only the shifted-right + // higher nibbles for each byte and (2) a vector with the lower nibbles (and + // masked out higher ones) for each byte. PSHUB is used separately with both + // to index the in-register table. Next, both are added and the result is a + // i8 vector where each element contains the pop count for input byte. + // + // To obtain the pop count for elements != i8, we follow up with the same + // approach and use additional tricks as described below. 
+ // + const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; + + int NumByteElts = VecSize / 8; + MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); + SDValue In = DAG.getBitcast(ByteVecVT, Op); + SmallVector LUTVec; + for (int i = 0; i < NumByteElts; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); + SmallVector Mask0F(NumByteElts, + DAG.getConstant(0x0F, DL, MVT::i8)); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + + // High nibbles + SmallVector Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); + SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + + // Low nibbles + SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + + // The input vector is used as the shuffle mask that index elements into the + // LUT. After counting low and high nibbles, add the vector to obtain the + // final pop count per i8 element. 
+ SDValue HighPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); + SDValue LowPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); + SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); - unsigned NumElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - unsigned Len = EltVT.getSizeInBits(); + if (EltVT == MVT::i8) + return PopCnt; + + return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); +} + +static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert(VT.is128BitVector() && + "Only 128-bit vector bitmath lowering supported."); + + int VecSize = VT.getSizeInBits(); + MVT EltVT = VT.getVectorElementType(); + int Len = EltVT.getSizeInBits(); // This is the vectorized version of the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // with a minor tweak to use a series of adds + shifts instead of vector - // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: - // - // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled - // v8i32 => Always profitable - // - // FIXME: There a couple of possible improvements: - // - // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). - // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html - // - assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && - "CTPOP not implemented for this vector element type."); + // multiplications. Implemented for all integer vector types. We only use + // this when we don't have SSSE3 which allows a LUT-based lowering that is + // much faster, even faster than using native popcnt instructions. 
+ + auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { + MVT VT = V.getSimpleValueType(); + SmallVector Shifters( + VT.getVectorNumElements(), + DAG.getConstant(Shifter, DL, VT.getVectorElementType())); + return DAG.getNode(OpCode, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters)); + }; + auto GetMask = [&](SDValue V, APInt Mask) { + MVT VT = V.getSimpleValueType(); + SmallVector Masks( + VT.getVectorNumElements(), + DAG.getConstant(Mask, DL, VT.getVectorElementType())); + return DAG.getNode(ISD::AND, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks)); + }; + + // We don't want to incur the implicit masks required to SRL vNi8 vectors on + // x86, so set the SRL type to have elements at least i16 wide. This is + // correct because all of our SRLs are followed immediately by a mask anyways + // that handles any bits that sneak into the high bits of the byte elements. + MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); - // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid - // extra legalization. - bool NeedsBitcast = EltVT == MVT::i32; - MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; + SDValue V = Op; + + // v = v - ((v >> 1) & 0x55555555...) + SDValue Srl = + DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); + SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); + V = DAG.getNode(ISD::SUB, DL, VT, V, And); - SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, - EltVT); - SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, - EltVT); - SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, - EltVT); + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 
+ SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); + SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); + V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); + V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); + + // At this point, V contains the byte-wise population count, and we are + // merely doing a horizontal sum if necessary to get the wider element + // counts. + if (EltVT == MVT::i8) + return V; - // v = v - ((v >> 1) & 0x55555555...) - SmallVector Ones(NumElts, DAG.getConstant(1, dl, EltVT)); - SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); - SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); - if (NeedsBitcast) - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - - SmallVector Mask55(NumElts, Cst55); - SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); - if (NeedsBitcast) - M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); - - SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); + return LowerHorizontalByteSum( + DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, + DAG); +} - // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) - SmallVector Mask33(NumElts, Cst33); - SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); - SmallVector Twos(NumElts, DAG.getConstant(2, dl, EltVT)); - SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); +static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + // FIXME: Need to add AVX-512 support here! 
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unknown CTPOP type to handle"); + SDLoc DL(Op.getNode()); + SDValue Op0 = Op.getOperand(0); - Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); - if (NeedsBitcast) { - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); - Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); + if (!Subtarget->hasSSSE3()) { + // We can't use the fast LUT approach, so fall back on vectorized bitmath. + assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); + return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); } - SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); - SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); - if (VT != AndRHS.getValueType()) { - AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); - AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); - } - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); + if (VT.is256BitVector() && !Subtarget->hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); - // v = (v + (v >> 4)) & 0x0F0F0F0F... - SmallVector Fours(NumElts, DAG.getConstant(4, dl, EltVT)); - SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - - SmallVector Mask0F(NumElts, Cst0F); - SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); - } - And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - - // The algorithm mentioned above uses: - // v = (v * 0x01010101...) >> (Len - 8) - // - // Change it to use vector adds + vector shifts which yield faster results on - // Haswell than using vector integer multiplication. 
- // - // For i32 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) - // - // For i64 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) - // v = v + (v >> 32) - // - Add = And; - SmallVector Csts; - for (unsigned i = 8; i <= Len/2; i *= 2) { - Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT)); - SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - Csts.clear(); - } + // Extract each 128-bit vector, compute pop count and concat the result. + SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); + SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); - // The result is on the least significant 6-bits on i32 and 7-bits on i64. - SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), dl, - EltVT); - SmallVector Cst3FV(NumElts, Cst3F); - SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), + LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); } - And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - return And; + return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); +} + +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Op.getValueType().isVector() && + "We only do custom lowering for vector population count."); + return LowerVectorCTPOP(Op, Subtarget, DAG); } static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { @@ -17412,7 +18955,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? 
"__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) @@ -17563,6 +19107,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -17587,6 +19133,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::CATCHRET: return LowerCATCHRET(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); @@ -17615,6 +19162,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); @@ -17668,17 +19219,9 @@ void 
X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: - // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert - // (FP_TO_SINT (load f16)) to FP_TO_INT*. - if (N->getOperand(0).getValueType() == MVT::f16) - break; - // fallthrough case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; - if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) - return; - std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; @@ -17705,8 +19248,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::f64); SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); - Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); + DAG.getBitcast(MVT::v2i64, VBias)); + Or = DAG.getBitcast(MVT::v2f64, Or); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; @@ -17829,7 +19372,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); - SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); if (ExperimentalVectorWideningLegalization) { // If we are legalizing vectors by widening, we already have the desired @@ -17859,7 +19402,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; - case X86ISD::FSRL: return "X86ISD::FSRL"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return 
"X86ISD::FP_TO_INT16_IN_MEM"; @@ -17910,10 +19452,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::UMAX: return "X86ISD::UMAX"; - case X86ISD::UMIN: return "X86ISD::UMIN"; - case X86ISD::SMAX: return "X86ISD::SMAX"; - case X86ISD::SMIN: return "X86ISD::SMIN"; + case X86ISD::ABS: return "X86ISD::ABS"; + case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -17922,12 +19462,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + case X86ISD::CATCHRET: return "X86ISD::CATCHRET"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; @@ -17939,10 +19482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return 
"X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; + case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -17978,6 +19524,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -17986,6 +19533,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; + case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; @@ -18008,8 +19556,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; + case X86ISD::PSADBW: return "X86ISD::PSADBW"; + case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -18018,10 +19570,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case 
X86ISD::SFENCE: return "X86ISD::SFENCE"; case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; - case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -18034,7 +19587,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; - case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE"; + case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; + case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -18049,16 +19604,26 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; + case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; + case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::MULHRS: return "X86ISD::MULHRS"; + case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; + case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; + case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; + case X86ISD::FP_TO_UINT_RND: 
return "X86ISD::FP_TO_UINT_RND"; } return nullptr; } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { +bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); Reloc::Model R = getTargetMachine().getRelocationModel(); @@ -18204,7 +19769,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) + if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) return false; VT = VT.getScalarType(); @@ -18713,7 +20278,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); - if (!Subtarget->isTargetWin64()) { + if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); @@ -18730,9 +20295,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // In the XMM save block, save all the XMM argument registers. 
for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = - F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand *MMO = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) @@ -18786,6 +20350,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, return true; } +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -18809,8 +20406,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); - // We also lower double CMOVs: + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. 
+ // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). + // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // // (CMOV (CMOV F, T, cc1), T, cc2) + // // to two successives branches. For that, we look for another CMOV as the // following instruction. // @@ -18876,19 +20506,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // .LBB5_4: // retq // - MachineInstr *NextCMOV = nullptr; + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); - if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. 
+ while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) - NextCMOV = &*NextMIIt; + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } MachineBasicBlock *jcc1MBB = nullptr; - // If we have a double CMOV, we lower it to two successive branches to + // If we have a cascaded CMOV, we lower it to two successive branches to // the same block. EFLAGS is used by both, so mark it as live in the second. - if (NextCMOV) { + if (CascadedCMOV) { jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, jcc1MBB); jcc1MBB->addLiveIn(X86::EFLAGS); @@ -18903,7 +20556,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -18912,12 +20565,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. 
- if (NextCMOV) { - // The fallthrough block may be jcc1MBB, if we have a double CMOV. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. BB->addSuccessor(jcc1MBB); // In that case, jcc1MBB will itself fallthrough the copy0MBB, and @@ -18932,13 +20585,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); - if (NextCMOV) { + if (CascadedCMOV) { unsigned Opc2 = X86::GetCondBranchFromCond( - (X86::CondCode)NextCMOV->getOperand(3).getImm()); + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); } @@ -18950,27 +20602,104 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - MachineInstrBuilder MIB = - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. 
+ // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); - // If we have a double CMOV, the second Jcc provides the same incoming + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). - if (NextCMOV) { + if (CascadedCMOV) { MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), - DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) .addReg(MI->getOperand(0).getReg()); - NextCMOV->eraseFromParent(); + CascadedCMOV->eraseFromParent(); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Now remove the CMOV(s). 
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + return sinkMBB; } +MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned MSrc = MI->getOperand(0).getReg(); + unsigned VSrc = MI->getOperand(5).getReg(); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addReg(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addImm(0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addReg(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addImm(/*Disp=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. 
+ return BB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -19009,7 +20738,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(getPointerTy()); + getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), @@ -19110,7 +20839,8 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, assert(!Subtarget->isTargetMachO()); - X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL); + Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, + DL); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -19203,7 +20933,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MemOpndSlot = CurOp; - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -19335,7 +21065,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -19388,6 +21118,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer // to remove extra copies in the loop. +// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). 
MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -19501,29 +21232,33 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); - case X86::CMOV_GR8: case X86::CMOV_FR32: case X86::CMOV_FR64: - case X86::CMOV_V4F32: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: - case X86::CMOV_V8F32: + case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: + case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: - case X86::CMOV_GR16: - case X86::CMOV_GR32: - case X86::CMOV_RFP32: - case X86::CMOV_RFP64: - case X86::CMOV_RFP80: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -19893,7 +21628,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue(ResNode.getNode(), 1)); } - return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + return DAG.getBitcast(VT, ResNode); } } @@ -19952,7 +21687,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, // Just remove no-op shuffle masks. if (Mask.size() == 1) { - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), /*AddTo*/ true); return true; } @@ -19988,14 +21723,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, } if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! 
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); if (Shuffle == X86ISD::MOVDDUP) Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); else Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } @@ -20006,11 +21741,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! - Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } @@ -20020,11 +21755,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) return false; // Nothing to do! 
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } @@ -20054,11 +21789,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, default: llvm_unreachable("Impossible mask size!"); }; - Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + Op = DAG.getBitcast(ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } @@ -20087,14 +21822,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); - Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input); + Op = DAG.getBitcast(ByteVT, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), /*AddTo*/ true); return true; } @@ -20266,7 +22001,7 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) - assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts && + assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit 
lanes!"); #endif Mask.resize(LaneElts); @@ -20397,7 +22132,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) - V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); + V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: @@ -20416,7 +22151,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, } } if (V.getValueType() != N.getValueType()) - V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); + V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; @@ -20533,12 +22268,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); - V = DAG.getNode(ISD::BITCAST, DL, DVT, V); + V = DAG.getBitcast(DVT, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); DCI.AddToWorklist(V.getNode()); - return DAG.getNode(ISD::BITCAST, DL, VT, V); + return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. @@ -20569,7 +22304,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. - V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0)); + V = DAG.getBitcast(VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? 
X86ISD::UNPCKL : X86ISD::UNPCKH, @@ -20713,8 +22448,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { - SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); - SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); + SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); } @@ -20728,8 +22463,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); - if (LD.getNode()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) return LD; if (isTargetShuffle(N->getOpcode())) { @@ -20776,7 +22510,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || + if (!BCVT.isVector() || BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); @@ -20830,7 +22564,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. 
unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) @@ -20846,7 +22580,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); - Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); + Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -20877,8 +22611,7 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { /// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); - if (NewOp.getNode()) + if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); @@ -20908,7 +22641,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, } EVT VT = N->getValueType(0); - + if (VT == MVT::i1 && dyn_cast(N->getOperand(1)) && InputVector.getOpcode() == ISD::BITCAST && dyn_cast(InputVector.getOperand(0))) { @@ -20966,15 +22699,16 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, SDValue Vals[4]; if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { - SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); - EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); + auto &DL = DAG.getDataLayout(); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, 
DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); - SDValue ShAmt = DAG.getConstant(32, dl, - DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + SDValue ShAmt = DAG.getConstant( + 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); @@ -20993,10 +22727,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; - SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy()); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + SDValue ScalarAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, @@ -21020,96 +22755,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 
-static std::pair -matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const X86Subtarget *Subtarget) { - if (!VT.isVector()) - return std::make_pair(0, false); - - bool NeedSplit = false; - switch (VT.getSimpleVT().SimpleTy) { - default: return std::make_pair(0, false); - case MVT::v4i64: - case MVT::v2i64: - if (!Subtarget->hasVLX()) - return std::make_pair(0, false); - break; - case MVT::v64i8: - case MVT::v32i16: - if (!Subtarget->hasBWI()) - return std::make_pair(0, false); - break; - case MVT::v16i32: - case MVT::v8i64: - if (!Subtarget->hasAVX512()) - return std::make_pair(0, false); - break; - case MVT::v32i8: - case MVT::v16i16: - case MVT::v8i32: - if (!Subtarget->hasAVX2()) - NeedSplit = true; - if (!Subtarget->hasAVX()) - return std::make_pair(0, false); - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - if (!Subtarget->hasSSE2()) - return std::make_pair(0, false); - } - - // SSE2 has only a small subset of the operations. - bool hasUnsigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v16i8); - bool hasSigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v8i16); - - ISD::CondCode CC = cast(Cond.getOperand(2))->get(); - - unsigned Opc = 0; - // Check for x CC y ? x : y. - if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && - DAG.isEqualTo(RHS, Cond.getOperand(1))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMIN : 0u; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; - } - // Check for x CC y ? y : x -- a min/max with reversed arms. 
- } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && - DAG.isEqualTo(RHS, Cond.getOperand(0))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMIN : 0u; break; - } - } - - return std::make_pair(Opc, NeedSplit); -} - static SDValue transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -21519,32 +23164,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to match a min/max vector operation. - if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { - std::pair ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); - unsigned Opc = ret.first; - bool NeedSplit = ret.second; - - if (Opc && NeedSplit) { - unsigned NumElems = VT.getVectorNumElements(); - // Extract the LHS vectors - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); - - // Extract the RHS vectors - SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); - SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); - - // Create min/max for each subvector - LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); - RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); - - // Merge the result - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); - } else if (Opc) - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - // Simplify vector selection if condition value type matches vselect // operand type if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { @@ -21560,7 +23179,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Check if the selector will be produced by CMPP*/PCMP* Cond.getOpcode() == 
ISD::SETCC && // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -21582,13 +23202,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (TValIsAllOnes && FValIsAllZeros) Ret = Cond; else if (TValIsAllOnes) - Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, - DAG.getNode(ISD::BITCAST, DL, CondVT, RHS)); + Ret = + DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); else if (FValIsAllZeros) Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, - DAG.getNode(ISD::BITCAST, DL, CondVT, LHS)); + DAG.getBitcast(CondVT, LHS)); - return DAG.getNode(ISD::BITCAST, DL, VT, Ret); + return DAG.getBitcast(VT, Ret); } } @@ -21605,7 +23225,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // know will be matched by LowerVECTOR_SHUFFLEtoBlend. if ((N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SHRUNKBLEND) && - !DCI.isBeforeLegalize()) { + !DCI.isBeforeLegalize() && !VT.is512BitVector()) { SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; @@ -22085,106 +23705,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); - // SSE/AVX/AVX2 blend intrinsics. - case Intrinsic::x86_avx2_pblendvb: - // Don't try to simplify this intrinsic if we don't have AVX2. 
- if (!Subtarget->hasAVX2()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx_blendv_ps_256: - // Don't try to simplify this intrinsic if we don't have AVX. - if (!Subtarget->hasAVX()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_sse41_pblendvb: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - SDValue Mask = N->getOperand(3); - - // Don't try to simplify this intrinsic if we don't have SSE4.1. - if (!Subtarget->hasSSE41()) - return SDValue(); - - // fold (blend A, A, Mask) -> A - if (Op0 == Op1) - return Op0; - // fold (blend A, B, allZeros) -> A - if (ISD::isBuildVectorAllZeros(Mask.getNode())) - return Op0; - // fold (blend A, B, allOnes) -> B - if (ISD::isBuildVectorAllOnes(Mask.getNode())) - return Op1; - - // Simplify the case where the mask is a constant i32 value. - if (ConstantSDNode *C = dyn_cast(Mask)) { - if (C->isNullValue()) - return Op0; - if (C->isAllOnesValue()) - return Op1; - } - - return SDValue(); - } - - // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 
- case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - EVT VT = Op0.getValueType(); - assert(VT.isVector() && "Expected a vector type!"); - - if (isa(Op1)) - Op1 = Op1.getOperand(0); - - if (!isa(Op1)) - return SDValue(); - - EVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - - ConstantSDNode *CND = cast(Op1); - const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - - // Don't try to convert this shift into a ISD::SRA if the shift - // count is bigger than or equal to the element size. - if (ShAmt >= SVTBits) - return SDValue(); - - // Trivial case: if the shift count is zero, then fold this - // into the first operand. - if (ShAmt == 0) - return Op0; - - // Replace this packed shift intrinsic with a target independent - // shift dag node. - SDLoc DL(N); - SDValue Splat = DAG.getConstant(C, DL, VT); - return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); - } - } -} - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. 
+ if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -22255,18 +23784,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY || - ((N00.getOpcode() == ISD::ANY_EXTEND || - N00.getOpcode() == ISD::ZERO_EXTEND) && - N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { - APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); - if (Mask != 0) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, - N00, DAG.getConstant(Mask, DL, VT)); - } + APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. 
+ // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } @@ -22280,7 +23825,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->getZExtValue() == 1) + if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } @@ -22321,16 +23866,14 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - if (N->getOpcode() == ISD::SHL) { - SDValue V = PerformSHLCombine(N, DAG); - if (V.getNode()) return V; - } + if (N->getOpcode() == ISD::SHL) + if (SDValue V = PerformSHLCombine(N, DAG)) + return V; - if (N->getOpcode() != ISD::SRA) { - // Try to fold this logical shift into a zero vector. - SDValue V = performShiftToAllZeros(N, DAG, Subtarget); - if (V.getNode()) return V; - } + // Try to fold this logical shift into a zero vector. 
+ if (N->getOpcode() != ISD::SRA) + if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) + return V; return SDValue(); } @@ -22419,15 +23962,13 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); - SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, - Vector64); + SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } - SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, - OnesOrZeroesF); + SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, @@ -22640,7 +24181,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); - return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle); + return DAG.getBitcast(N0.getValueType(), NewShuffle); } static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, @@ -22712,8 +24253,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; SDValue N0 = N->getOperand(0); @@ -22781,7 +24321,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); - return DAG.getNode(ISD::BITCAST, DL, VT, Mask); + return DAG.getBitcast(VT, Mask); } // PBLENDVB only available on SSE 4.1 if 
(!Subtarget->hasSSE41()) @@ -22789,11 +24329,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; - X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); - Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); - Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + X = DAG.getBitcast(BlendVT, X); + Y = DAG.getBitcast(BlendVT, Y); + Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); - return DAG.getNode(ISD::BITCAST, DL, VT, Mask); + return DAG.getBitcast(VT, Mask); } } @@ -22801,9 +24341,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -22901,18 +24439,64 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isa(N1) || !cast(N1)->isOne()) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. 
+ SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (Subtarget->hasCMov()) { - SDValue RV = performIntegerAbsCombine(N, DAG); - if (RV.getNode()) + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + + if (Subtarget->hasCMov()) + if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; - } return SDValue(); } @@ -22930,16 +24514,20 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. 
ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Increment = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); @@ -22994,7 +24582,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); // Convert Src0 value - SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0()); + SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) @@ -23011,7 +24599,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type - NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -23062,6 +24650,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = 
StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -23079,7 +24676,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue()); + SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -23096,7 +24693,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type - NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) @@ -23137,10 +24734,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. 
+ bool Fast; + unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - StVT == VT && !IsAligned) { + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -23148,7 +24747,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); - SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Stride = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); @@ -23172,6 +24772,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. 
@@ -23188,7 +24795,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); + SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -23219,10 +24826,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); + SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
@@ -23360,7 +24967,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue ExtOp0 = OldExtract.getOperand(0); unsigned VecSize = ExtOp0.getValueSizeInBits(); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); - SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtOp0); + SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), @@ -23666,39 +25273,97 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - SDLoc dl(N); + EVT SVT = VT.getScalarType(); + EVT InVT = N0.getValueType(); + EVT InSVT = InVT.getScalarType(); + SDLoc DL(N); // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) // This exposes the sext to the sdivrem lowering, so that it directly extends // from AH (which we otherwise need to do contortions to access). 
if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - N0.getValueType() == MVT::i8 && VT == MVT::i32) { + InVT == MVT::i8 && VT == MVT::i32) { SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); } if (!DCI.isBeforeLegalizeOps()) { - if (N0.getValueType() == MVT::i1) { - SDValue Zero = DAG.getConstant(0, dl, VT); + if (InVT == MVT::i1) { + SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); - return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero); + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } + if (VT.isVector() && Subtarget->hasSSE2()) { + auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { + EVT InVT = N.getValueType(); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + Size / InVT.getScalarSizeInBits()); + SmallVector Opnds(Size / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. 
+ if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG + // which ensures lowering to X86ISD::VSEXT (pmovsx*). + if (VT.getSizeInBits() == 128 && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + SDValue ExOp = ExtendVecSize(DL, N0, 128); + return DAG.getSignExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::SIGN_EXTEND_VECTOR_INREG. 
+ if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; + ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + } + if (!Subtarget->hasFp256()) return SDValue(); - if (VT.isVector() && VT.getSizeInBits() == 256) { - SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (VT.isVector() && VT.getSizeInBits() == 256) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - } return SDValue(); } @@ -23714,7 +25379,8 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) + (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && + !Subtarget->hasAVX512())) return SDValue(); SDValue A = N->getOperand(0); @@ -23780,11 +25446,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(1, dl, VT)); } } - if (VT.is256BitVector()) { - SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); - if (R.getNode()) + + if (VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - } // (i8,i32 zext (udivrem (i8 x, i8 y)) -> // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) @@ -23988,10 +25653,7 @@ static SDValue 
PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_B) return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - SDValue Flags; - - Flags = checkBoolTestSetCCCombine(EFLAGS, CC); - if (Flags.getNode()) { + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } @@ -24010,10 +25672,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); - SDValue Flags; - - Flags = checkBoolTestSetCCCombine(EFLAGS, CC); - if (Flags.getNode()) { + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, Flags); @@ -24058,51 +25717,79 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. 
- SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); + SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); - SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); + SDValue Res = DAG.getBitcast(VT, NewAnd); return Res; } return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) + // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + + if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; // Now move on to more general possibilities. 
SDValue Op0 = N->getOperand(0); - EVT InVT = Op0->getValueType(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); - // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) + // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { SDLoc dl(N); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); - return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); - EVT VT = Ld->getValueType(0); + EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 - if (N->getValueType(0) == MVT::f16) + if (VT == MVT::f16) return SDValue(); - if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget->is64Bit() && VT == MVT::i64) { + !Subtarget->is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( - SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } @@ -24261,12 +25948,11 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, // In this case, the inner vzext is completely dead because we're going to // only look at bits inside of the low element. 
Just do the outer vzext on // a bitcast of the input to the inner. - return DAG.getNode(X86ISD::VZEXT, DL, VT, - DAG.getNode(ISD::BITCAST, DL, OpVT, V)); + return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); } // Check if we can bypass extracting and re-inserting an element of an input - // vector. Essentialy: + // vector. Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && @@ -24284,7 +25970,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, DAG.getIntPtrConstant(0, DL)); } - Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); + Op = DAG.getBitcast(OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); } } @@ -24320,6 +26006,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -24355,8 +26042,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); case X86ISD::INSERTPS: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) return PerformINSERTPSCombine(N, DAG, Subtarget); @@ -24542,7 +26227,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", 
"${0:w}"}))) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -24556,7 +26241,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -24583,7 +26268,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType -X86TargetLowering::getConstraintType(const std::string &Constraint) const { +X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': @@ -24915,7 +26600,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. @@ -25065,71 +26750,40 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. - if (Res.second->hasType(VT)) + // MVT::Other is used to specify clobber names. 
+ if (Res.second->hasType(VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. - // All of the single-register GCC register classes map their values onto - // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we - // really want an 8-bit or 32-bit register, map to the appropriate register - // class and return the appropriate register. - if (Res.second == &X86::GR16RegClass) { - if (VT == MVT::i8 || VT == MVT::i1) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::AL; break; - case X86::DX: DestReg = X86::DL; break; - case X86::CX: DestReg = X86::CL; break; - case X86::BX: DestReg = X86::BL; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR8RegClass; - } - } else if (VT == MVT::i32 || VT == MVT::f32) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::EAX; break; - case X86::DX: DestReg = X86::EDX; break; - case X86::CX: DestReg = X86::ECX; break; - case X86::BX: DestReg = X86::EBX; break; - case X86::SI: DestReg = X86::ESI; break; - case X86::DI: DestReg = X86::EDI; break; - case X86::BP: DestReg = X86::EBP; break; - case X86::SP: DestReg = X86::ESP; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR32RegClass; - } - } else if (VT == MVT::i64 || VT == MVT::f64) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::RAX; break; - case X86::DX: DestReg = X86::RDX; break; - case X86::CX: DestReg = X86::RCX; break; - case X86::BX: DestReg = X86::RBX; break; - case X86::SI: DestReg = X86::RSI; break; - case X86::DI: DestReg = X86::RDI; break; - case X86::BP: DestReg = X86::RBP; break; - case X86::SP: DestReg = X86::RSP; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR64RegClass; - } - } - } else if (Res.second == &X86::FR32RegClass || - Res.second == &X86::FR64RegClass || - Res.second == &X86::VR128RegClass || - 
Res.second == &X86::VR256RegClass || - Res.second == &X86::FR32XRegClass || - Res.second == &X86::FR64XRegClass || - Res.second == &X86::VR128XRegClass || - Res.second == &X86::VR256XRegClass || - Res.second == &X86::VR512RegClass) { + // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should + // return "eax". This should even work for things like getting 64bit integer + // registers when given an f64 type. + const TargetRegisterClass *Class = Res.second; + if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || + Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { + unsigned Size = VT.getSizeInBits(); + MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 + : Size == 16 ? MVT::i16 + : Size == 32 ? MVT::i32 + : Size == 64 ? MVT::i64 + : MVT::Other; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (DestReg > 0) { + Res.first = DestReg; + Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass + : SimpleTy == MVT::i16 ? &X86::GR16RegClass + : SimpleTy == MVT::i32 ? &X86::GR32RegClass + : &X86::GR64RegClass; + assert(Res.second->contains(Res.first) && "Register in register class"); + } else { + // No register found/type mismatch. + Res.first = 0; + Res.second = nullptr; + } + } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass || + Class == &X86::VR128RegClass || Class == &X86::VR256RegClass || + Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass || + Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass || + Class == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. 
This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -25145,13 +26799,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::VR256RegClass; else if (X86::VR512RegClass.hasType(VT)) Res.second = &X86::VR512RegClass; + else { + // Type mismatch and not a clobber: Return an error; + Res.first = 0; + Res.second = nullptr; + } } return Res; } -int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { +int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 @@ -25170,13 +26830,22 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. - if (isLegalAddressingMode(AM, Ty)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } -bool X86TargetLowering::isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on x86 is expensive. However, when aggressively optimizing + // for code size, we prefer to use a div instruction, as it is usually smaller + // than the alternative sequence. + // The exception to this is vector division. Since x86 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. 
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); }