X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=7366697343e9e3bb94e717a877136ac0784b57b2;hp=607ce767be2f517e5834fab7eeedbc05f403e220;hb=824e14ddab883edf9af866352bfe6b34577f2e2d;hpb=ce2b45bd7dc5dc17de6b50234f8d24ef1fed1d3b diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 607ce767be2..7366697343e 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -60,11 +60,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); -cl::opt -EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions"), - cl::init(false)); - static cl::opt ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), @@ -147,6 +142,16 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + + if (VT.isInteger()) { + setOperationAction(ISD::SABSDIFF, VT, Legal); + setOperationAction(ISD::UABSDIFF, VT, Legal); + } + if (!VT.isFloatingPoint() && + VT != MVT::v2i64 && VT != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT, Legal); + } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -171,77 +176,78 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { - // Single-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); - setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); - setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); - setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); - - // Double-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); - setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); - setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); - setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); - - // Single-precision comparisons. - setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); - setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); - setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); - setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); - setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); - setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); - setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); - setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); - - // Double-precision comparisons. - setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); - setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); - setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); - setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); - setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); - setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); - setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); - setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); - - // Floating-point to integer conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); - - // Conversions between floating types. - setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, + { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, + { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, + { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, + { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, + { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, + { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, + { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, + { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, + { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, + { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, + { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, + { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + + // Conversions between floating types. + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } - // Integer to floating-point conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - // FIXME: There appears to be some naming inconsistency in ARM libgcc: - // e.g., __floatunsidf vs. __floatunssidfvfp. - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + // Set the correct calling convention for ARMv7k WatchOS. It's just + // AAPCS_VFP for functions as simple as libcalls. + if (Subtarget->isTargetWatchOS()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) + setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } } @@ -250,8 +256,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() && - !Subtarget->isTargetWindows()) { + // RTLIB + if (Subtarget->isAAPCS_ABI() && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; @@ -339,12 +347,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - - // Memory operations - // RTABI chapter 4.3.4 - { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { @@ -353,6 +355,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } + + // EABI dependent RTLIB + if (TM.Options.EABIVersion == EABI::EABI4 || + TM.Options.EABIVersion == EABI::EABI5) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } MemOpsLibraryCalls[] = { + // Memory operations + // RTABI chapter 4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : MemOpsLibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } } if (Subtarget->isTargetWindows()) { @@ -369,6 +395,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { @@ -378,8 +406,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. - if (Subtarget->getTargetTriple().isiOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { + if (Subtarget->isTargetWatchOS() || + (Subtarget->isTargetIOS() && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -397,6 +426,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } + // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have + // a __gnu_ prefix (which is the default). + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); + setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -548,6 +585,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // NEON does not have single instruction CTTZ for vectors. + setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); @@ -563,7 +621,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); @@ -589,7 +646,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADDC); if (Subtarget->isFPOnlySP()) { - // When targetting a floating-point unit with only single-precision + // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. @@ -673,7 +730,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() - || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP())) + || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -690,8 +747,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } + if (!Subtarget->isThumb1Only()) + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // ARM does not have ROTL. - setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) @@ -701,7 +765,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // @llvm.readcyclecounter requires the Performance Monitors extension. + // Default to the 0 expansion on unsupported platforms. + // FIXME: Technically there are older ARM CPUs that have + // implementation-specific ways of obtaining this information. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) @@ -710,15 +779,23 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { // These are expanded into libcalls if the cpu doesn't have HW divider. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, LibCall); + setOperationAction(ISD::UDIV, MVT::i32, LibCall); + } + + if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) { + setOperationAction(ISD::UDIV, MVT::i32, Custom); + + setOperationAction(ISD::UDIV, MVT::i64, Custom); } - // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) - if (Subtarget->isTargetAEABI()) { + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); @@ -746,7 +823,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); - setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); @@ -760,13 +836,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget->isTargetMachO()) { - // Non-MachO platforms may return values in these registers via the - // personality function. - setExceptionPointerRegister(ARM::R0); - setExceptionSelectorRegister(ARM::R1); - } - if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else @@ -833,11 +902,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->isTargetDarwin()) { - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); - } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -896,7 +965,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->getTargetTriple().isiOS()) { + if (Subtarget->isTargetWatchOS()) { + setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); + } + if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); @@ -912,6 +985,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -919,8 +999,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } + + if (Subtarget->hasNEON()) { + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + } + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); @@ -943,11 +1037,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. @@ -1038,8 +1132,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1053,7 +1145,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; - case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -1066,6 +1159,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; @@ -1117,14 +1211,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; - case ARMISD::FMAX: return "ARMISD::FMAX"; - case ARMISD::FMIN: return "ARMISD::FMIN"; - case ARMISD::VMAXNM: return "ARMISD::VMAX"; - case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -1149,8 +1240,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return getPointerTy(); +EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } @@ -1429,10 +1522,12 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, @@ -1453,7 +1548,8 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, + getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, @@ -1526,7 +1622,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector MemOpChains; @@ -1607,7 +1704,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); @@ -1628,12 +1726,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (Flags.getByValSize() > 4*offset) { + auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, @@ -1693,8 +1791,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = false; bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); + auto PtrVt = getPointerTy(DAG.getDataLayout()); - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { assert((Subtarget->isTargetWindows() || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); @@ -1709,12 +1808,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); @@ -1724,12 +1823,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); @@ -1743,31 +1842,33 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); - Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(), - DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), - 0, ARMII::MO_NONLAZY)); - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, true, 0); + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0, - TargetFlags); + Callee = + DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(), - Callee), MachinePointerInfo::getGOT(), - false, false, false, 0); + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; @@ -1781,28 +1882,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - Callee = DAG.getNode(ARMISD::PIC_ADD, dl, - getPointerTy(), Callee, PICLabel); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { unsigned OpFlags = 0; // On ELF targets for PIC code, direct calls should go through the PLT if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); } } // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1812,8 +1911,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRAS() && - // Emit regular call when code size is the priority - !HasMinSizeAttr) + // Emit regular call when code size is the priority + !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -1995,6 +2094,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + assert(Subtarget->supportsTailCall()); + // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -2014,26 +2115,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: - // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. - // - // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take - // LR. This means if we need to reload LR, it takes an extra instructions, - // which outweighs the value of the tail call; but here we don't know yet - // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in - // emitEpilogue if LR is used. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - if (Subtarget->isThumb1Only()) - return false; - // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls @@ -2381,7 +2462,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - return !Subtarget->isThumb1Only(); + return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, @@ -2433,7 +2514,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; @@ -2448,9 +2529,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = + DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); if (RelocM == Reloc::Static) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); @@ -2462,7 +2544,7 @@ SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2472,9 +2554,10 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Argument = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2508,7 +2591,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -2524,17 +2607,19 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2542,9 +2627,10 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } // The address of the thread local variable is the add of the thread @@ -2558,6 +2644,8 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2574,26 +2662,35 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, - UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue Chain = Result.getValue(1); - SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); - Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); - if (!UseGOTOFF) + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - MachinePointerInfo::getGOT(), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } @@ -2609,15 +2706,16 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + return DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2635,7 +2733,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -2648,7 +2747,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, const GlobalValue *GV = cast(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); @@ -2661,32 +2760,11 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } -SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && - "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); - SDLoc dl(Op); - unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; - ARMConstantPoolValue *CPV = - ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", - ARMPCLabelIndex, PCAdj); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); -} - SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -2703,6 +2781,13 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -2713,17 +2798,17 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_rbit: { assert(Op.getOperand(1).getValueType() == MVT::i32 && "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); + return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; unsigned PCAdj = (RelocM != Reloc::PIC_) @@ -2733,10 +2818,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2751,6 +2836,36 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vminnm: + case Intrinsic::arm_neon_vmaxnm: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) + ? ISD::FMINNUM : ISD::FMAXNUM; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminu: + case Intrinsic::arm_neon_vmaxu: { + if (Op.getValueType().isFloatingPoint()) + return SDValue(); + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) + ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vmins: + case Intrinsic::arm_neon_vmaxs: { + // v{min,max}s is overloaded between signed integers and floats. + if (!Op.getValueType().isFloatingPoint()) { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::FMINNAN : ISD::FMAXNAN; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2820,7 +2935,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2850,10 +2965,11 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + ArgValue2 = DAG.getLoad( + MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); @@ -2904,8 +3020,9 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, if (REnd != RBegin) ArgOffset = -4 * (ARM::R4 - RBegin); + auto PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); - SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); SmallVector MemOps; const TargetRegisterClass *RC = @@ -2918,8 +3035,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, dl, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } if (!MemOps.empty()) @@ -3013,6 +3129,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); + auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3035,10 +3152,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3119,10 +3237,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); - int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, VA.getLocMemOffset(), - Flags.getByValSize()); - InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, + VA.getLocMemOffset(), Flags.getByValSize()); + InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); @@ -3130,10 +3248,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, FIOffset, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(DAG.getLoad( + VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0)); } lastInsIndex = index; } @@ -3168,13 +3287,9 @@ static bool isFloatingPointZero(SDValue Op) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). SDValue BitcastOp = Op->getOperand(0); - if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { - SDValue MoveOp = BitcastOp->getOperand(0); - if (MoveOp->getOpcode() == ISD::TargetConstant && - cast(MoveOp)->getZExtValue() == 0) { - return true; - } - } + if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(BitcastOp->getOperand(0))) + return true; } return false; } @@ -3539,113 +3654,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { - // We can use VMAXNM/VMINNM for a compare followed by a select with the - // same operands, as follows: - // c = fcmp [?gt, ?ge, ?lt, ?le] a, b - // select c, a, b - // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'. - bool swapSides = false; - if (!getTargetMachine().Options.NoNaNsFPMath) { - // transformability may depend on which way around we compare - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETOLT: - case ISD::SETOLE: - // the non-NaN should be RHS - swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS); - break; - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETULT: - case ISD::SETULE: - // the non-NaN should be LHS - swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS); - break; - } - } - swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal); - if (swapSides) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(LHS, RHS); - } - if (LHS == TrueVal && RHS == FalseVal) { - bool canTransform = true; - // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here - if (!getTargetMachine().Options.UnsafeFPMath && - !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { - const ConstantFPSDNode *Zero; - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETUGT: - case ISD::SETGT: - // RHS must not be -0 - canTransform = (Zero = dyn_cast(RHS)) && - !Zero->isNegative(); - break; - case ISD::SETOGE: - case ISD::SETUGE: - case ISD::SETGE: - // LHS must not be -0 - canTransform = (Zero = dyn_cast(LHS)) && - !Zero->isNegative(); - break; - case ISD::SETOLT: - case ISD::SETULT: - case ISD::SETLT: - // RHS must not be +0 - canTransform = (Zero = dyn_cast(RHS)) && - Zero->isNegative(); - break; - case ISD::SETOLE: - case ISD::SETULE: - case ISD::SETLE: - // LHS must not be +0 - canTransform = (Zero = dyn_cast(LHS)) && - Zero->isNegative(); - break; - } - } - if (canTransform) { - // Note: If one of the elements in a pair is a number and the other - // element is NaN, the corresponding result element is the number. - // This is consistent with the IEEE 754-2008 standard. - // Therefore, a > b ? a : b <=> vmax(a,b), if b is constant and a is NaN - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETUGT: - case ISD::SETUGE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETGT: - case ISD::SETGE: - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETOLT: - case ISD::SETOLE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - case ISD::SETULT: - case ISD::SETULE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETLT: - case ISD::SETLE: - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - } - } - } - bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -3855,7 +3863,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Index = Op.getOperand(2); SDLoc dl(Op); - EVT PTy = getPointerTy(); + EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); @@ -3870,16 +3878,18 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { - Addr = DAG.getLoad(PTy, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } @@ -3916,7 +3926,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -3968,7 +3978,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4102,8 +4112,8 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned ARMTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", ARM::SP) .Default(0); @@ -4163,7 +4173,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; - if (TLI.isBigEndian() && SrcVT.isVector() && + if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), @@ -4283,13 +4293,87 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. + SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } if (!ST->hasV6T2Ops()) return SDValue(); - SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } @@ -4450,8 +4534,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isa(N->getOperand(1)) || - cast(N->getOperand(1))->getZExtValue() != 1) + if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. @@ -4489,6 +4572,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); + if (CmpVT.getVectorElementType() == MVT::i64) + // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, + // but it's possible that our operands are 64-bit but our result is 32-bit. + // Bail in this case. + return SDValue(); + if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); @@ -4730,7 +4819,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getTargetLoweringInfo().isBigEndian()) + if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); @@ -4936,18 +5025,56 @@ static bool isVTBLMask(ArrayRef M, EVT VT) { return VT == MVT::v8i8 && M.size() == 8; } +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. +// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case static bool isVTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the input vector then we need to check the + // upper and lower parts of the mask with a matching value for WhichResult + // FIXME: A mask with only even values will be rejected in case the first + // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only + // M[0] is used to determine WhichResult + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } @@ -4960,28 +5087,55 @@ static bool isVTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } +// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking +// that the mask elements are either all even and in steps of size 2 or all odd +// and in steps of size 2. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with +// respect the how results are returned. static bool isVUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) continue; // ignore UNDEF indices - if ((unsigned) M[i] != 2 * i + WhichResult) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; ++j) { + if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) + return false; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -4997,18 +5151,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ if (EltSz == 64) return false; - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned) MIdx != Idx) - return false; - Idx += 2; + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + unsigned Half = NumElts / 2; + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += Half) { + unsigned Idx = WhichResult; + for (unsigned k = 0; k < Half; ++k) { + int MIdx = M[i + j + k]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5016,21 +5179,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return true; } +// Checks whether the shuffle mask represents a vector zip (VZIP) by checking +// that pairs of elements of the shufflemask represent the same index in each +// vector incrementing sequentially through the vectors. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with respect the how results +// are returned. static bool isVZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5047,15 +5226,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5229,16 +5416,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. + ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(Value->getOperand(1))) { + (constIndex = dyn_cast(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { - ConstantSDNode *constIndex; - constIndex = dyn_cast(Value->getOperand(1)); - assert(constIndex && "The index is not a constant!"); unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, @@ -5337,14 +5522,35 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -5353,127 +5559,166 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); - } else if (V.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // This code doesn't know how to handle shuffles where the vector - // element types do not match (this happens because type legalization - // promotes the return type of EXTRACT_VECTOR_ELT). - // FIXME: It might be appropriate to extend this code to handle - // mismatched types. + } else if (!isa(V.getOperand(1))) { + // Furthermore, shuffles require a constant mask, whereas extractelts + // accept variable indices. return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - // If the element number isn't a constant, we can't effectively - // analyze what's going on. - if (!isa(V.getOperand(1))) - return SDValue(); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = {0, 0}; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && - "unexpected vector sizes in ReconstructShuffle"); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], dl, - MVT::i32)); - } - } - - SmallVector Mask; - - for (unsigned i = 0; i < NumElts; ++i) { + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Src.MinElt, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. + for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = cast(Op.getOperand(i) - .getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only @@ -5868,7 +6113,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; - unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast(BVN->getOperand(HiElt)); @@ -6013,7 +6258,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); - unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } @@ -6135,6 +6380,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -6165,6 +6412,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -6237,6 +6486,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); @@ -6342,52 +6592,64 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); - - // Create stack object for sret. - const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); - const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); - int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + auto &DL = DAG.getDataLayout(); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = SRet; - Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isSRet = true; - Args.push_back(Entry); + bool ShouldUseSRet = Subtarget->isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); + + ArgListEntry Entry; + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); - const char *LibcallName = (ArgVT == MVT::f64) - ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = + (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + CallingConv::ID CC = getLibcallCallingConv(LC); + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args), 0) - .setDiscardResult(); - + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setDiscardResult(ShouldUseSRet); std::pair CallResult = LowerCallTo(CLI); + if (!ShouldUseSRet) + return CallResult.first; + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo(), false, false, false, 0); // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo(), false, false, false, 0); @@ -6397,6 +6659,81 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { LoadSin.getValue(0), LoadCos.getValue(0)); } +SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, + SDValue &Chain) const { + EVT VT = Op.getValueType(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + const char *Name = nullptr; + Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; + + SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); + + ARMTargetLowering::ArgListTy Args; + + for (auto AI : {1, 0}) { + ArgListEntry Arg; + Arg.Node = Op.getOperand(AI); + Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Args.push_back(Arg); + } + + CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), + ES, std::move(Args), 0); + + return LowerCallTo(CLI).first; +} + +SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::i32 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, + DAG.getEntryNode(), Op.getOperand(1)); + + return LowerWindowsDIVLibCall(Op, DAG, DBZCHK); +} + +void ARMTargetLowering::ExpandDIV_Windows( + SDValue Op, SelectionDAG &DAG, + SmallVectorImpl &Results) const { + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + assert(Op.getValueType() == MVT::i64 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(1, dl, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); + + SDValue DBZCHK = + DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); + + SDValue Result = LowerWindowsDIVLibCall(Op, DAG, DBZCHK); + + SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); + SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, + DAG.getConstant(32, dl, TLI.getPointerTy(DL))); + Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); + + Results.push_back(Lower); + Results.push_back(Upper); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast(Op)->getOrdering() <= Monotonic) @@ -6412,36 +6749,22 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); - SDValue Cycles32, OutChain; - - if (Subtarget->hasPerfMon()) { - // Under Power Management extensions, the cycle-count is: - // mrc p15, #0, , c9, c13, #0 - SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) - }; - - Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(MVT::i32, MVT::Other), Ops); - OutChain = Cycles32.getValue(1); - } else { - // Intrinsic is defined to return 0 on unsupported platforms. Technically - // there are older ARM CPUs that have implementation-specific ways of - // obtaining this information (FIXME!). - Cycles32 = DAG.getConstant(0, DL, MVT::i32); - OutChain = DAG.getEntryNode(); - } - + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, , c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) + }; - SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, - Cycles32, DAG.getConstant(0, DL, MVT::i32)); - Results.push_back(Cycles64); - Results.push_back(OutChain); + SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(Cycles32.getValue(1)); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -6475,19 +6798,22 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SREM: return LowerREM(Op.getNode(), DAG); + case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); @@ -6499,7 +6825,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::UDIV: return LowerUDIV(Op, DAG); + case ISD::UDIV: + if (Subtarget->isTargetWindows()) + return LowerDIV_Windows(Op, DAG); + return LowerUDIV(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: @@ -6520,13 +6849,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ARMISD::WIN__DBZCHK: return SDValue(); } } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl&Results, + SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { @@ -6542,9 +6872,16 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; + case ISD::SREM: + case ISD::UREM: + Res = LowerREM(N, DAG); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::UDIV: + assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); + return ExpandDIV_Windows(SDValue(N, 0), DAG, Results); } if (Res.getNode()) Results.push_back(Res); @@ -6581,12 +6918,12 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = - MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(), - MachineMemOperand::MOLoad, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -6690,7 +7027,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineModuleInfo &MMI = MF->getMMI(); for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { - if (!BB->isLandingPad()) continue; + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. @@ -6705,7 +7042,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, for (SmallVectorImpl::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(BB); + CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; @@ -6738,7 +7075,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); - DispatchBB->setIsLandingPad(); + DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; @@ -6762,10 +7099,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // context. SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); - MachineMemOperand *FIMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad | - MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -6845,9 +7181,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6880,9 +7216,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -6935,9 +7270,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6964,9 +7299,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) @@ -7007,7 +7341,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); - if (SMBB->isLandingPad()) { + if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } @@ -7055,7 +7389,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // landing pad now. for (SmallVectorImpl::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsLandingPad(false); + (*I)->setIsEHPad(false); // The instruction is gone now. MI->eraseFromParent(); @@ -7178,8 +7512,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Otherwise, we will generate unrolled scalar copies. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned src = MI->getOperand(1).getReg(); @@ -7313,9 +7646,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb1) @@ -7471,6 +7804,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, return MBB; } +MachineBasicBlock * +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); + MF->push_back(ContBB); + ContBB->splice(ContBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + MBB->addSuccessor(ContBB); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); + MBB->addSuccessor(TrapBB); + + BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) + .addReg(MI->getOperand(0).getReg()) + .addMBB(TrapBB); + + MI->eraseFromParent(); + return ContBB; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -7541,8 +7900,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -7639,6 +7997,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; @@ -7657,8 +8018,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator BBI = BB; - ++BBI; + MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); @@ -7722,11 +8082,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); + case ARM::WIN__DBZCHK: + return EmitLowered__dbzchk(MI, BB); + } +} + +/// \brief Attaches vregs to MEMCPY that it will use as scratch registers +/// when it is expanded into LDM/STM. This is done as a post-isel lowering +/// instead of as a custom inserter because we need the use list from the SDNode. +static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, + MachineInstr *MI, const SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstrBuilder MIB(*MF, MI); + + // If the new dst/src is unused mark it as dead. + if (!Node->hasAnyUseOfValue(0)) { + MI->getOperand(0).setIsDead(true); + } + if (!Node->hasAnyUseOfValue(1)) { + MI->getOperand(1).setIsDead(true); + } + + // The MEMCPY both defines and kills the scratch registers. + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MEMCPY) { + attachMEMCPYScratchRegs(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional @@ -7796,10 +8191,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { - ConstantSDNode *C = dyn_cast(N); - if (!C) - return false; - return AllOnes ? C->isAllOnesValue() : C->isNullValue(); + return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. @@ -8001,7 +8393,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, // Build operand list. SmallVector Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, - TLI.getPointerTy())); + TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); @@ -8621,12 +9013,88 @@ static SDValue PerformXORCombine(SDNode *N, return SDValue(); } -/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff -/// the bits being cleared by the AND are not demanded by the BFI. +// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, +// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and +// their position in "to" (Rd). +static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { + assert(N->getOpcode() == ARMISD::BFI); + + SDValue From = N->getOperand(1); + ToMask = ~cast(N->getOperand(2))->getAPIntValue(); + FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); + + // If the Base came from a SHR #C, we can deduce that it is really testing bit + // #C in the base of the SHR. + if (From->getOpcode() == ISD::SRL && + isa(From->getOperand(1))) { + APInt Shift = cast(From->getOperand(1))->getAPIntValue(); + assert(Shift.getLimitedValue() < 32 && "Shift too large!"); + FromMask <<= Shift.getLimitedValue(31); + From = From->getOperand(0); + } + + return From; +} + +// If A and B contain one contiguous set of bits, does A | B == A . B? +// +// Neither A nor B must be zero. +static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { + unsigned LastActiveBitInA = A.countTrailingZeros(); + unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; + return LastActiveBitInA - 1 == FirstActiveBitInB; +} + +static SDValue FindBFIToCombineWith(SDNode *N) { + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. + APInt ToMask, FromMask; + SDValue From = ParseBFI(N, ToMask, FromMask); + SDValue To = N->getOperand(0); + + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). + SDValue V = To; + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } + + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); + + // Are the new bits contiguous when combined with the old bits? + if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; + + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } + + return SDValue(); +} + static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { + // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff + // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); @@ -8642,6 +9110,38 @@ static SDValue PerformBFICombine(SDNode *N, return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); + + // We've found a BFI. + APInt ToMask1, FromMask1; + SDValue From1 = ParseBFI(N, ToMask1, FromMask1); + + APInt ToMask2, FromMask2; + SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); + assert(From1 == From2); + (void)From2; + + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. + APInt NewFromMask = FromMask1 | FromMask2; + APInt NewToMask = ToMask1 | ToMask2; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (NewFromMask[0] == 0) + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -8681,7 +9181,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); - if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; @@ -9312,7 +9812,9 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() + ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); @@ -9339,8 +9841,8 @@ static SDValue PerformSTORECombine(SDNode *N, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); SmallVector Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, DL, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); SDValue BasePtr = St->getBasePtr(); // Perform one or more big stores into memory. @@ -9367,7 +9869,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; - bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, @@ -9417,32 +9919,6 @@ static SDValue PerformSTORECombine(SDNode *N, return SDValue(); } -// isConstVecPow2 - Return true if each vector element is a power of 2, all -// elements are the same constant, C, and Log2(C) ranges from 1 to 32. -static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) -{ - integerPart cN; - integerPart c0 = 0; - for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements(); - I != E; I++) { - ConstantFPSDNode *C = dyn_cast(ConstVec.getOperand(I)); - if (!C) - return false; - - bool isExact; - APFloat APF = C->getValueAPF(); - if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact) - != APFloat::opOK || !isExact) - return false; - - c0 = (I == 0) ? cN : c0; - if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32) - return false; - } - C = c0; - return true; -} - /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. @@ -9452,30 +9928,25 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 -static SDValue PerformVCVTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); - if (!Subtarget->hasNEON() || !Op.getValueType().isVector() || - Op.getOpcode() != ISD::FMUL) + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); - uint64_t C; - SDValue N0 = Op->getOperand(0); SDValue ConstVec = Op->getOperand(1); - bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || - NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions @@ -9483,16 +9954,22 @@ static SDValue PerformVCVTCombine(SDNode *N, return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, - NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, - DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - N0, - DAG.getConstant(Log2_64(C), dl, MVT::i32)); + SDValue FixConv = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), + DAG.getConstant(C, dl, MVT::i32)); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; @@ -9507,38 +9984,44 @@ static SDValue PerformVCVTCombine(SDNode *N, /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 -static SDValue PerformVDIVCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); - - if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || + if (!N->getValueType(0).isVector() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); - uint64_t C; SDValue ConstVec = N->getOperand(1); - bool isSigned = OpOpcode == ISD::SINT_TO_FP; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + uint32_t IntBits = IntTy.getSizeInBits(); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); - unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); @@ -9548,7 +10031,7 @@ static SDValue PerformVDIVCombine(SDNode *N, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32)); + ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9576,7 +10059,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); @@ -9591,12 +10074,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + return true; + } + return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. @@ -9607,6 +10094,15 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Don't do anything for most intrinsics. break; + case Intrinsic::arm_neon_vabds: + if (!N->getValueType(0).isInteger()) + return SDValue(); + return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::arm_neon_vabdu: + return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + // Vector shifts: check for immediate versions and lower them. // Note: This is done during DAG combining instead of DAG legalizing because // the build_vectors for 64-bit vector element shift counts are generally @@ -9835,89 +10331,123 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match f32 max/min patterns to use NEON vmax/vmin instructions. -static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, - // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is - // a NaN; only do the transformation when it matches that behavior. - - // For now only do this when using NEON for FP operations; if using VFP, it - // is not obvious that the benefit outweighs the cost of switching to the - // NEON pipeline. - if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || - N->getValueType(0) != MVT::f32) +static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, + APInt &KnownOne) { + if (Op.getOpcode() == ARMISD::BFI) { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast(Op.getOperand(2)); + APInt Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } + if (Op.getOpcode() == ARMISD::CMOV) { + APInt KZ2(KnownZero.getBitWidth(), 0); + APInt KO2(KnownOne.getBitWidth(), 0); + computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); + computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); + + KnownZero &= KZ2; + KnownOne &= KO2; + return; + } + return DAG.computeKnownBits(Op, KnownZero, KnownOne); +} + +SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { + // If we have a CMOV, OR and AND combination such as: + // if (x & CN) + // y |= CM; + // + // And: + // * CN is a single bit; + // * All bits covered by CM are known zero in y + // + // Then we can convert this into a sequence of BFI instructions. This will + // always be a win if CM is a single bit, will always be no worse than the + // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is + // three bits (due to the extra IT instruction). + + SDValue Op0 = CMOV->getOperand(0); + SDValue Op1 = CMOV->getOperand(1); + auto CCNode = cast(CMOV->getOperand(2)); + auto CC = CCNode->getAPIntValue().getLimitedValue(); + SDValue CmpZ = CMOV->getOperand(4); + + // The compare must be against zero. + if (!isNullConstant(CmpZ->getOperand(1))) return SDValue(); - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast(N->getOperand(4))->get(); - - unsigned Opcode = 0; - bool IsReversed; - if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { - IsReversed = false; // x CC y ? x : y - } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { - IsReversed = true ; // x CC y ? y : x - } else { + assert(CmpZ->getOpcode() == ARMISD::CMPZ); + SDValue And = CmpZ->getOperand(0); + if (And->getOpcode() != ISD::AND) + return SDValue(); + ConstantSDNode *AndC = dyn_cast(And->getOperand(1)); + if (!AndC || !AndC->getAPIntValue().isPowerOf2()) return SDValue(); + SDValue X = And->getOperand(0); + + if (CC == ARMCC::EQ) { + // We're performing an "equal to zero" compare. Swap the operands so we + // canonicalize on a "not equal to zero" compare. + std::swap(Op0, Op1); + } else { + assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } + + if (Op1->getOpcode() != ISD::OR) + return SDValue(); - bool IsUnordered; - switch (CC) { - default: break; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin - // will return -0, so vmin can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; - break; + ConstantSDNode *OrC = dyn_cast(Op1->getOperand(1)); + if (!OrC) + return SDValue(); + SDValue Y = Op1->getOperand(0); - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax - // will return +0, so vmax can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; - break; - } + if (Op0 != Y) + return SDValue(); + + // Now, is it profitable to continue? + APInt OrCI = OrC->getAPIntValue(); + unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; + if (OrCI.countPopulation() > Heuristic) + return SDValue(); - if (!Opcode) + // Lastly, can we determine that the bits defined by OrCI + // are zero in Y? + APInt KnownZero, KnownOne; + computeKnownBits(DAG, Y, KnownZero, KnownOne); + if ((OrCI & KnownZero) != OrCI) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); + + // OK, we can do the combine. + SDValue V = Y; + SDLoc dl(X); + EVT VT = X.getValueType(); + unsigned BitInX = AndC->getAPIntValue().logBase2(); + + if (BitInX != 0) { + // We must shift X first. + X = DAG.getNode(ISD::SRL, dl, VT, X, + DAG.getConstant(BitInX, dl, VT)); + } + + for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); + BitInY < NumActiveBits; ++BitInY) { + if (OrCI[BitInY] == 0) + continue; + APInt Mask(VT.getSizeInBits(), 0); + Mask.setBit(BitInY); + V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, + // Confusingly, the operand is an *inverted* mask. + DAG.getConstant(~Mask, dl, VT)); + } + + return V; } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. @@ -9938,6 +10468,13 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); + // BFI is only available on V6T2+. + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { + SDValue R = PerformCMOVToBFICombine(N, DAG); + if (R) + return R; + } + // Simplify // mov r1, r0 // cmp r1, x @@ -10004,8 +10541,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget); - case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget); + case ISD::FP_TO_UINT: + return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FDIV: + return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -10013,7 +10552,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: @@ -10078,7 +10616,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses - if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; @@ -10317,10 +10855,10 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { - EVT VT = getValueType(Ty, true); + EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; @@ -10939,40 +11477,64 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); - unsigned Opcode = Op->getOpcode(); - assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && - "Invalid opcode for Div/Rem lowering"); - bool isSigned = (Opcode == ISD::SDIVREM); - EVT VT = Op->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - +static RTLIB::Libcall getDivRemLibcall( + const SDNode *N, MVT::SimpleValueType SVT) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemLibcall"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; - switch (VT.getSimpleVT().SimpleTy) { + switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } + return LC; +} - SDValue InChain = DAG.getEntryNode(); - +static TargetLowering::ArgListTy getDivRemArgList( + const SDNode *N, LLVMContext *Context) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemArgList"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op->getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op->getOperand(i); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + EVT ArgVT = N->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*Context); + Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; Entry.isZExt = !isSigned; Args.push_back(Entry); } + return Args; +} + +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), + VT.getSimpleVT().SimpleTy); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), + DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), - getPointerTy()); + getPointerTy(DAG.getDataLayout())); Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); @@ -10986,6 +11548,47 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { return CallInfo.first; } +// Lowers REM using divmod helpers +// see RTABI section 4.2/4.3 +SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + // Build return types (div and rem) + std::vector RetTyParams; + Type *RetTyElement; + + switch (N->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; + case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; + case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; + case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; + } + + RetTyParams.push_back(RetTyElement); + RetTyParams.push_back(RetTyElement); + ArrayRef ret = ArrayRef(RetTyParams); + Type *RetTy = StructType::get(*DAG.getContext(), ret); + + RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). + SimpleTy); + SDValue InChain = DAG.getEntryNode(); + TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); + bool isSigned = N->getOpcode() == ISD::SREM; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + // Lower call + CallLoweringInfo CLI(DAG); + CLI.setChain(InChain) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); + std::pair CallResult = LowerCallTo(CLI); + + // Return second (rem) result operand (first contains div) + SDNode *ResNode = CallResult.first.getNode(); + assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); + return ResNode->getOperand(1); +} + SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); @@ -11020,8 +11623,8 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -11033,8 +11636,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } bool @@ -11081,7 +11684,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vld4lane: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -11101,12 +11705,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vst4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -11120,12 +11725,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; @@ -11133,12 +11739,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; @@ -11187,8 +11794,6 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } - Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11284,19 +11889,26 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) -bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64) && !Subtarget->isMClass(); + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return (Size <= (Subtarget->isMClass() ? 32U : 64U)) - ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; +} + +bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; } // This has so far only been implemented for MachO. @@ -11311,7 +11923,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; // Floating point values and vector values map to the same register file. - // Therefore, althought we could do a store extract of a vector type, this is + // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. if (VectorTy->isFPOrFPVectorTy()) @@ -11333,6 +11945,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } +bool ARMTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget->hasV6T2Ops(); +} + +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget->hasV6T2Ops(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11369,6 +11989,14 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast(Addr->getType())->getElementType()); } +void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + if (!Subtarget->hasV7Ops()) + return; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); +} + Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -11425,28 +12053,25 @@ bool ARMTargetLowering::lowerInterleavedLoad( VectorType *VecTy = Shuffles[0]->getType(); Type *EltTy = VecTy->getVectorElementType(); - const DataLayout *DL = getDataLayout(); - unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy); - bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64; + const DataLayout &DL = LI->getModule()->getDataLayout(); + unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; - // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't - // support i64/f64 element). - if ((VecSize != 64 && VecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vldN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. if (EltTy->isPointerTy()) - VecTy = VectorType::get(DL->getIntPtrType(EltTy), - VecTy->getVectorNumElements()); + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); - IRBuilder<> Builder(LI); SmallVector Ops; @@ -11454,6 +12079,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded @@ -11515,13 +12143,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); - const DataLayout *DL = getDataLayout(); - unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy); - bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64; + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; - // Skip illegal sub vector types and vector types of i64/f64 element (vstN - // doesn't support i64/f64 element). - if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vstN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || + EltIs64Bits) return false; Value *Op0 = SVI->getOperand(0); @@ -11531,7 +12160,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { - Type *IntTy = DL->getIntPtrType(EltTy); + Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. Type *IntVecTy = @@ -11542,17 +12171,18 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, NumSubElts); } - static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], SubVecTy); - + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; SmallVector Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( @@ -11573,14 +12203,14 @@ enum HABaseType { static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { - if (const StructType *ST = dyn_cast(Ty)) { + if (auto *ST = dyn_cast(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } - } else if (const ArrayType *AT = dyn_cast(Ty)) { + } else if (auto *AT = dyn_cast(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; @@ -11595,7 +12225,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return false; Members = 1; Base = HA_DOUBLE; - } else if (const VectorType *VT = dyn_cast(Ty)) { + } else if (auto *VT = dyn_cast(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: @@ -11639,3 +12269,17 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } + +unsigned ARMTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; +} + +unsigned ARMTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; +}