diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2ef6ab48c5f..b512db5551e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -14,7 +14,6 @@
#define DEBUG_TYPE "arm-isel"
#include "ARMISelLowering.h"
-#include "ARM.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
@@ -46,20 +45,14 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
+#include
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARMTailCalls("arm-tail-calls", cl::Hidden,
- cl::desc("Generate tail calls (TEMPORARY OPTION)."),
- cl::init(false));
-
cl::opt<bool>
EnableARMLongCalls("arm-long-calls", cl::Hidden,
 cl::desc("Generate calls via indirect call instructions"),
@@ -74,7 +67,7 @@ namespace {
class ARMCCState : public CCState {
public:
ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
- const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+ const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
LLVMContext &C, ParmContext PC)
: CCState(CC, isVarArg, MF, TM, locs, C) {
assert(((PC == Call) || (PC == Prologue)) &&
@@ -155,12 +148,12 @@ void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
}
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::DPairRegClass);
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
- if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+ if (TM.getSubtarget<ARMSubtarget>().isTargetMachO())
return new TargetLoweringObjectFileMachO();
return new ARMElfTargetObjectFile();
@@ -174,9 +167,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget->isTargetMachO()) {
// Uses VFP for Thumb libfuncs if available.
- if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+ if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+ Subtarget->hasARMOps()) {
// Single-precision floating-point arithmetic.
setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -256,7 +250,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setLibcallName(RTLIB::SRL_I128, 0);
setLibcallName(RTLIB::SRA_I128, 0);
- if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
+ if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO()) {
// Double-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 2
setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
@@ -421,7 +415,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
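A caller-level sketch of what the divmod libcalls wired up just below buy. The signature of __divmodsi4 is taken from compiler-rt's builtins (quotient returned, remainder written through the out-pointer); the example itself is illustrative and links only where compiler-rt provides the symbol:

    #include <cstdio>

    // compiler-rt's combined divide/remainder helper.
    extern "C" int __divmodsi4(int a, int b, int *rem);

    int main() {
      int rem;
      int quot = __divmodsi4(7, 3, &rem); // one runtime call computes both
      std::printf("7 / 3 = %d, 7 %% 3 = %d\n", quot, rem);
      return 0;
    }

With these names registered, a division and a remainder of the same operands can share a single call instead of hitting __divsi3 and __modsi3 separately.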
- if (Subtarget->getTargetTriple().getOS() == Triple::IOS && + if (Subtarget->getTargetTriple().isiOS() && !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); @@ -452,6 +446,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f64, Custom); if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); @@ -671,6 +666,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); @@ -681,10 +678,36 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::SDIV, MVT::i32, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); } + + // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod"); + + setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + + setOperationAction(ISD::SDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + } else { + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + } setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); @@ -702,11 +725,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget->isTargetDarwin()) { - // Non-Darwin platforms may return values in these registers via the + if (!Subtarget->isTargetMachO()) { + // Non-MachO platforms may return values in these registers via the // personality function. 
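Returning to the register-based DivRem setup earlier in this hunk: the RTABI 4.2 helpers hand back the quotient in r0 and the remainder in r1, which is exactly how an 8-byte struct is returned under AAPCS. A minimal sketch of that contract (the struct name is illustrative; the RTABI only fixes the registers):

    struct IDivModResult { int quot; int rem; }; // returned in {r0, r1}

    extern "C" IDivModResult __aeabi_idivmod(int numerator, int denominator);

    int gcd(int a, int b) {
      while (b != 0) {
        IDivModResult r = __aeabi_idivmod(a, b); // one call, both results
        a = b;
        b = r.rem;
      }
      return a;
    }

Marking ISD::SDIVREM/UDIVREM as Custom rather than Expand is what lets a single DAG node become this one call with the ARM_AAPCS convention attached.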
- setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
- setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
setExceptionPointerRegister(ARM::R0);
setExceptionSelectorRegister(ARM::R1);
}
@@ -714,13 +735,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
- // FIXME: This should be checking for v6k, not just v6.
- if (Subtarget->hasDataBarrier() ||
- (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
- // membarrier needs custom lowering; the rest are legal and handled
- // normally.
- setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+ // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and
+ // handled normally.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Custom lowering for 64-bit ops
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
@@ -733,12 +751,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
- // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
- setInsertFencesForAtomic(true);
+ // On v8, we have particularly efficient implementations of atomic fences
+ // if they can be combined with nearby atomic loads and stores.
+ if (!Subtarget->hasV8Ops()) {
+ // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
+ setInsertFencesForAtomic(true);
+ }
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
} else {
+ // If there's anything we can use as a barrier, go through custom lowering
+ // for ATOMIC_FENCE.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
+ Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
// Set them all for expansion, which will force libcalls.
- setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
@@ -755,8 +781,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
// Unordered/Monotonic case.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
- // Since the libcalls include locking, fold in the fences
- setShouldFoldAtomicFences(true);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
@@ -837,6 +861,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
}
}
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+ // For iOS, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret.
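The FSINCOS actions set just below route a paired sin/cos through one call. As a source-level illustration, using the sincosf name registered above (the signature matches the glibc and Darwin extension):

    extern "C" void sincosf(float x, float *sinp, float *cosp);

    void polar_to_cartesian(float r, float theta, float &x, float &y) {
      float s, c;
      sincosf(theta, &s, &c); // one libcall instead of sinf() plus cosf()
      x = r * c;
      y = r * s;
    }

On iOS the custom lowering goes one step further and calls __sincos_stret, which returns both results in registers rather than through pointers.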
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
// We have target-specific dag combine patterns for the following nodes:
// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
@@ -870,14 +906,50 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
// are at least 4 bytes aligned.
setMinStackArgumentAlignment(4);
- BenefitFromCodePlacementOpt = true;
-
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->isLikeA9();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
+static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
+ bool isThumb2, unsigned &LdrOpc,
+ unsigned &StrOpc) {
+ static const unsigned LoadBares[4][2] = {{ARM::LDREXB, ARM::t2LDREXB},
+ {ARM::LDREXH, ARM::t2LDREXH},
+ {ARM::LDREX, ARM::t2LDREX},
+ {ARM::LDREXD, ARM::t2LDREXD}};
+ static const unsigned LoadAcqs[4][2] = {{ARM::LDAEXB, ARM::t2LDAEXB},
+ {ARM::LDAEXH, ARM::t2LDAEXH},
+ {ARM::LDAEX, ARM::t2LDAEX},
+ {ARM::LDAEXD, ARM::t2LDAEXD}};
+ static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB},
+ {ARM::STREXH, ARM::t2STREXH},
+ {ARM::STREX, ARM::t2STREX},
+ {ARM::STREXD, ARM::t2STREXD}};
+ static const unsigned StoreRels[4][2] = {{ARM::STLEXB, ARM::t2STLEXB},
+ {ARM::STLEXH, ARM::t2STLEXH},
+ {ARM::STLEX, ARM::t2STLEX},
+ {ARM::STLEXD, ARM::t2STLEXD}};
+
+ const unsigned (*LoadOps)[2], (*StoreOps)[2];
+ if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ LoadOps = LoadAcqs;
+ else
+ LoadOps = LoadBares;
+
+ if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ StoreOps = StoreRels;
+ else
+ StoreOps = StoreBares;
+
+ assert(isPowerOf2_32(Size) && Size <= 8 &&
+ "unsupported size for atomic binary op!");
+
+ LdrOpc = LoadOps[Log2_32(Size)][isThumb2];
+ StrOpc = StoreOps[Log2_32(Size)][isThumb2];
+}
+
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2.
Consequently,
@@ -929,7 +1001,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default: return 0;
case ARMISD::Wrapper: return "ARMISD::Wrapper";
- case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN";
case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
case ARMISD::CALL: return "ARMISD::CALL";
@@ -940,6 +1011,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
case ARMISD::CMN: return "ARMISD::CMN";
@@ -979,7 +1051,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
- case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER";
case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
@@ -999,10 +1070,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VSHL: return "ARMISD::VSHL";
case ARMISD::VSHRs: return "ARMISD::VSHRs";
case ARMISD::VSHRu: return "ARMISD::VSHRu";
- case ARMISD::VSHLLs: return "ARMISD::VSHLLs";
- case ARMISD::VSHLLu: return "ARMISD::VSHLLu";
- case ARMISD::VSHLLi: return "ARMISD::VSHLLi";
- case ARMISD::VSHRN: return "ARMISD::VSHRN";
case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
@@ -1038,6 +1105,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";
+ case ARMISD::VMAXNM: return "ARMISD::VMAXNM";
+ case ARMISD::VMINNM: return "ARMISD::VMINNM";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
@@ -1065,7 +1134,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector()) return getPointerTy();
return VT.changeVectorElementTypeToInteger();
}
@@ -1229,8 +1298,9 @@ SDValue ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ bool isThisReturn, SDValue ThisVal) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
@@ -1244,6 +1314,15 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
+ // Pass 'this' value directly from the argument to return value, to avoid
+ // register unit interference.
+ if (i == 0 && isThisReturn) {
+ assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
+ "unexpected return calling convention register assignment");
+ InVals.push_back(ThisVal);
+ continue;
+ }
+
SDValue Val;
if (VA.needsCustom()) {
// Handle f64 or half of a v2f64.
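The isThisReturn/ThisVal plumbing added above models LLVM's 'returned' argument attribute. A minimal C++ sketch of the pattern being exploited (hypothetical functions; under AAPCS the first argument and the return value both live in r0):

    char *append(char *dst, char c) { // contract: always returns 'dst'
      *dst = c;
      return dst;                     // r0 out is the same value as r0 in
    }

    void twice(char *p) {
      append(p, 'a'); // after the call, p is known to still be in r0,
      append(p, 'b'); // so no copy to a callee-saved register is needed
    }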
@@ -1300,7 +1379,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); @@ -1311,12 +1390,12 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, false, false, 0); } -void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, +void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, - SmallVector &MemOpChains, + SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, @@ -1343,10 +1422,10 @@ SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; @@ -1355,21 +1434,24 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool IsSibCall = false; + bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool isThisReturn = false; + bool isSibCall = false; + // Disable tail calls if they're not supported. - if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) + if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls) isTailCall = false; + if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { ++NumTailCalls; - IsSibCall = true; + isSibCall = true; } } @@ -1385,13 +1467,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned NumBytes = CCInfo.getNextStackOffset(); // For tail calls, memory operands are available in our caller's stack. - if (IsSibCall) + if (isSibCall) NumBytes = 0; // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!isSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -1452,6 +1535,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { + assert(VA.getLocVT() == MVT::i32 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i32 && + "unexpected use of 'returned'"); + isThisReturn = true; + } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); @@ -1459,23 +1549,35 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // True if this byval aggregate will be split between registers // and memory. - if (CCInfo.isFirstByValRegValid()) { + unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); + unsigned CurByValIdx = CCInfo.getInRegsParamsProceed(); + + if (CurByValIdx < ByValArgsCount) { + + unsigned RegBegin, RegEnd; + CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); unsigned int i, j; - for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { + for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), - false, false, false, 0); + false, false, false, + DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } - offset = ARM::R4 - CCInfo.getFirstByValReg(); - CCInfo.clearFirstByValReg(); + + // If parameter size outsides register area, "offset" value + // helps us to calculate stack slot for remained part properly. + offset = RegEnd - RegBegin; + + CCInfo.nextInRegsParam(); } - if (Flags.getByValSize() - 4*offset > 0) { + if (Flags.getByValSize() > 4*offset) { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, @@ -1491,7 +1593,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops, array_lengthof(Ops))); } - } else if (!IsSibCall) { + } else if (!isSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1531,7 +1633,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } - InFlag =SDValue(); + InFlag = SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1582,25 +1684,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const GlobalValue *GV = G->getGlobal(); isDirect = true; bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); - bool isStub = (isExt && Subtarget->isTargetDarwin()) && + bool isStub = (isExt && Subtarget->isTargetMachO()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. 
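Context for the predicability note above, which the isLocalARMFunc flag on the next line records: a direct BL to a known ARM-mode local function can itself carry a condition code, so a guarded call needs no branch around it. A rough sketch (assuming local_fn is defined in the same module and compiled as ARM):

    void local_fn(); // assumed local, ARM mode

    void maybe_call(int x) {
      if (x != 0)
        local_fn(); // can fold to: cmp r0, #0 ; blne local_fn
    }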
isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. - if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - Callee = DAG.getNode(ARMISD::PIC_ADD, dl, - getPointerTy(), Callee, PICLabel); + if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); + Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(), + DAG.getTargetGlobalAddress(GV, dl, getPointerTy())); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; @@ -1611,7 +1704,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; - bool isStub = Subtarget->isTargetDarwin() && + bool isStub = Subtarget->isTargetMachO() && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // tBX takes a register source operand. @@ -1642,8 +1735,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + bool HasMinSizeAttr = Subtarget->isMinSize(); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1672,10 +1764,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
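AAPCS background for the mask selection that follows: r0-r3 and r12 are call-clobbered while r4-r11 are callee-saved, and the 'this'-return variant of the mask used below additionally treats r0 as preserved because the callee promises to hand it back. An illustration of why the mask matters to the register allocator:

    void helper();

    int keep_across_call(int x) {
      helper();     // per AAPCS, the call may overwrite r0-r3 and r12
      return x + 1; // x must survive in r4-r11 or in a stack slot
    }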
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + if (!isTailCall) { + const uint32_t *Mask; + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const ARMBaseRegisterInfo *ARI = static_cast(TRI); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; + Mask = ARI->getCallPreservedMask(CallConv); + } + } else + Mask = ARI->getCallPreservedMask(CallConv); + + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -1689,14 +1797,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, - dl, DAG, InVals); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, isThisReturn, + isThisReturn ? OutVals[0] : SDValue()); } /// HandleByVal - Every parameter *after* a byval parameter is passed @@ -1710,8 +1819,8 @@ ARMTargetLowering::HandleByVal( assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); - if ((!State->isFirstByValRegValid()) && - (ARM::R0 <= reg) && (reg <= ARM::R3)) { + + if ((ARM::R0 <= reg) && (reg <= ARM::R3)) { if (Subtarget->isAAPCS_ABI() && Align > 4) { unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - reg) % AlignInRegs; @@ -1719,22 +1828,42 @@ ARMTargetLowering::HandleByVal( reg = State->AllocateReg(GPRArgRegs, 4); } if (reg != 0) { - State->setFirstByValReg(reg); - // At a call site, a byval parameter that is split between - // registers and memory needs its size truncated here. In a - // function prologue, such byval parameters are reassembled in - // memory, and are not truncated. - if (State->getCallOrPrologue() == Call) { - unsigned excess = 4 * (ARM::R4 - reg); - assert(size >= excess && "expected larger existing stack allocation"); - size -= excess; + unsigned excess = 4 * (ARM::R4 - reg); + + // Special case when NSAA != SP and parameter size greater than size of + // all remained GPR regs. In that case we can't split parameter, we must + // send it to stack. We also must set NCRN to R4, so waste all + // remained registers. + const unsigned NSAAOffset = State->getNextStackOffset(); + if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) { + while (State->AllocateReg(GPRArgRegs, 4)) + ; + return; } + + // First register for byval parameter is the first register that wasn't + // allocated before this method call, so it would be "reg". 
+ // If parameter is small enough to be saved in range [reg, r4), then + // the end (first after last) register would be reg + param-size-in-regs, + // else parameter would be splitted between registers and stack, + // end register would be r4 in this case. + unsigned ByValRegBegin = reg; + unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4; + State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); + // Note, first register is allocated in the beginning of function already, + // allocate remained amount of registers we need. + for (unsigned i = reg+1; i != ByValRegEnd; ++i) + State->AllocateReg(GPRArgRegs, 4); + // A byval parameter that is split between registers and memory needs its + // size truncated here. + // In the case where the entire structure fits in registers, we set the + // size in memory to zero. + if (size < excess) + size = 0; + else + size -= excess; } } - // Confiscate any remaining parameter registers to preclude their - // assignment to subsequent parameters. - while (State->AllocateReg(GPRArgRegs, 4)) - ; } /// MatchingStackOffset - Return true if the given stack call argument is @@ -1806,6 +1935,12 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isVarArg && !Outs.empty()) return false; + // Exception-handling functions need a special set of instructions to indicate + // a return to the hardware. Tail-calling another function would probably + // break this. + if (CallerF->hasFnAttribute("interrupt")) + return false; + // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) @@ -1866,7 +2001,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // local frame. const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). getInfo(); - if (AFI_Caller->getVarArgsRegSaveSize()) + if (AFI_Caller->getArgRegsSaveSize()) return false; // If the callee takes no arguments then go on to check the results of the @@ -1934,12 +2069,45 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, isVarArg)); } +static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, + SDLoc DL, SelectionDAG &DAG) { + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *F = MF.getFunction(); + + StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); + + // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset + // version of the "preferred return address". These offsets affect the return + // instruction if this is a return from PL1 without hypervisor extensions. + // IRQ/FIQ: +4 "subs pc, lr, #4" + // SWI: 0 "subs pc, lr, #0" + // ABORT: +4 "subs pc, lr, #4" + // UNDEF: +4/+2 "subs pc, lr, #0" + // UNDEF varies depending on where the exception came from ARM or Thumb + // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. + + int64_t LROffset; + if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || + IntKind == "ABORT") + LROffset = 4; + else if (IntKind == "SWI" || IntKind == "UNDEF") + LROffset = 0; + else + report_fatal_error("Unsupported interrupt attribute. 
If present, value " + "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); + + RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false)); + + return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, + RetOps.data(), RetOps.size()); +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -2019,6 +2187,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); + // CPUs which aren't M-class use a special sequence to return from + // exceptions (roughly, any instruction setting pc and cpsr simultaneously, + // though we use "subs pc, lr, #N"). + // + // M-class CPUs actually use a normal return sequence with a special + // (hardware-provided) value in LR, so the normal code path works. + if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && + !Subtarget->isMClass()) { + if (Subtarget->isThumb1Only()) + report_fatal_error("interrupt attribute is not supported in Thumb1"); + return LowerInterruptReturn(RetOps, dl, DAG); + } + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps.data(), RetOps.size()); } @@ -2067,7 +2248,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { Copy = *Copy->use_begin(); if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; - Chain = Copy->getOperand(0); + TCChain = Copy->getOperand(0); } else { return false; } @@ -2075,7 +2256,8 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { - if (UI->getOpcode() != ARMISD::RET_FLAG) + if (UI->getOpcode() != ARMISD::RET_FLAG && + UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } @@ -2088,10 +2270,10 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { } bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) + if (!Subtarget->supportsTailCall()) return false; - if (!CI->isTailCall()) + if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; return !Subtarget->isThumb1Only(); @@ -2106,7 +2288,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); ConstantPoolSDNode *CP = cast(Op); SDValue Res; if (CP->isMachineConstantPoolEntry()) @@ -2127,7 +2309,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT PtrVT = getPointerTy(); const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2156,7 +2338,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); EVT PtrVT = getPointerTy(); unsigned char PCAdj = Subtarget->isThumb() 
? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); @@ -2199,7 +2381,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(); @@ -2269,7 +2451,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); @@ -2312,60 +2494,24 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); - // FIXME: Enable this for static codegen when tool issues are fixed. Also - // update ARMFastISel::ARMMaterializeGV. - if (Subtarget->useMovt() && RelocM != Reloc::Static) { + if (Subtarget->useMovt()) ++NumMovwMovt; - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes. - if (RelocM == Reloc::Static) - return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT)); - - unsigned Wrapper = (RelocM == Reloc::PIC_) - ? ARMISD::WrapperPIC : ARMISD::WrapperDYN; - SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT)); - if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) - Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), - false, false, false, 0); - return Result; - } - unsigned ARMPCLabelIndex = 0; - SDValue CPAddr; - if (RelocM == Reloc::Static) { - CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); - } else { - ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo(); - ARMPCLabelIndex = AFI->createPICLabelUId(); - unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, - PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - } - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into multiple nodes + unsigned Wrapper = + RelocM == Reloc::PIC_ ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue Chain = Result.getValue(1); - - if (RelocM == Reloc::PIC_) { - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); - } + SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); + SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) - Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(), - false, false, false, 0); - + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } @@ -2377,7 +2523,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", @@ -2393,7 +2539,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Val = DAG.getConstant(0, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), @@ -2402,7 +2548,7 @@ ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(0, MVT::i32)); } @@ -2411,7 +2557,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::arm_thread_pointer: { @@ -2447,57 +2593,42 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) ? ARMISD::VMULLs : ARMISD::VMULLu; - return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(), + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } } } -static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { - DebugLoc dl = Op.getDebugLoc(); - if (!Subtarget->hasDataBarrier()) { - // Some ARMv6 cpus can support data barriers with an mcr instruction. - // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get - // here. - assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && - "Unexpected ISD::MEMBARRIER encountered. 
Should be libcall!"); - return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } - - SDValue Op5 = Op.getOperand(5); - bool isDeviceBarrier = cast(Op5)->getZExtValue() != 0; - unsigned isLL = cast(Op.getOperand(1))->getZExtValue(); - unsigned isLS = cast(Op.getOperand(2))->getZExtValue(); - bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0); - - ARM_MB::MemBOpt DMBOpt; - if (isDeviceBarrier) - DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY; - else - DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH; - return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(DMBOpt, MVT::i32)); -} - - static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // FIXME: handle "fence singlethread" more efficiently. - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && - "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, MVT::i32)); } - return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(ARM_MB::ISH, MVT::i32)); + ConstantSDNode *OrdN = cast(Op.getOperand(1)); + AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); + unsigned Domain = ARM_MB::ISH; + if (Subtarget->isMClass()) { + // Only a full system barrier exists in the M-class architectures. + Domain = ARM_MB::SY; + } else if (Subtarget->isSwift() && Ord == Release) { + // Swift happens to implement ISHST barriers in a way that's compatible with + // Release semantics but weaker than ISH so we'd be fools not to use + // it. Beware: other processors probably don't! + Domain = ARM_MB::ISHST; + } + + return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(Intrinsic::arm_dmb, MVT::i32), + DAG.getConstant(Domain, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, @@ -2508,7 +2639,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, // Just preserve the chain. return Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) @@ -2533,7 +2664,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. 
- DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); @@ -2544,7 +2675,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2578,12 +2709,17 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, void ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned &VARegSize, unsigned &VARegSaveSize) + unsigned InRegsParamRecordIdx, + unsigned ArgSize, + unsigned &ArgRegsSize, + unsigned &ArgRegsSaveSize) const { unsigned NumGPRs; - if (CCInfo.isFirstByValRegValid()) - NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); - else { + if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { + unsigned RBegin, REnd; + CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); + NumGPRs = REnd - RBegin; + } else { unsigned int firstUnalloced; firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, sizeof(GPRArgRegs) / @@ -2592,8 +2728,29 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, } unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); - VARegSize = NumGPRs * 4; - VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); + ArgRegsSize = NumGPRs * 4; + + // If parameter is split between stack and GPRs... + if (NumGPRs && Align > 4 && + (ArgRegsSize < ArgSize || + InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) { + // Add padding for part of param recovered from GPRs. For example, + // if Align == 8, its last byte must be at address K*8 - 1. + // We need to do it, since remained (stack) part of parameter has + // stack alignment, and we need to "attach" "GPRs head" without gaps + // to it: + // Stack: + // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes... + // [ [padding] [GPRs head] ] [ Tail passed via stack .... + // + ARMFunctionInfo *AFI = MF.getInfo(); + unsigned Padding = + OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align); + ArgRegsSaveSize = ArgRegsSize + Padding; + } else + // We don't need to extend regs save size for byval parameters if they + // are passed via GPRs only. + ArgRegsSaveSize = ArgRegsSize; } // The remaining GPRs hold either the beginning of variable-argument @@ -2603,40 +2760,78 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, // If this is a variadic function, the va_list pointer will begin with // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. -void -ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, - DebugLoc dl, SDValue &Chain, - const Value *OrigArg, - unsigned OffsetFromOrigArg, - unsigned ArgOffset, - bool ForceMutable) const { +// Return: The frame index registers were stored into. +int +ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, + SDLoc dl, SDValue &Chain, + const Value *OrigArg, + unsigned InRegsParamRecordIdx, + unsigned OffsetFromOrigArg, + unsigned ArgOffset, + unsigned ArgSize, + bool ForceMutable, + unsigned ByValStoreOffset, + unsigned TotalArgRegsSaveSize) const { + + // Currently, two use-cases possible: + // Case #1. 
Non-var-args function, and we meet first byval parameter. + // Setup first unallocated register as first byval register; + // eat all remained registers + // (these two actions are performed by HandleByVal method). + // Then, here, we initialize stack frame with + // "store-reg" instructions. + // Case #2. Var-args function, that doesn't contain byval parameters. + // The same: eat all remained unallocated registers, + // initialize stack frame. + MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); - unsigned firstRegToSaveIndex; - if (CCInfo.isFirstByValRegValid()) - firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; - else { + unsigned firstRegToSaveIndex, lastRegToSaveIndex; + unsigned RBegin, REnd; + if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { + CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); + firstRegToSaveIndex = RBegin - ARM::R0; + lastRegToSaveIndex = REnd - ARM::R0; + } else { firstRegToSaveIndex = CCInfo.getFirstUnallocated - (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); - } - - unsigned VARegSize, VARegSaveSize; - computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); - if (VARegSaveSize) { - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing - // the result of va_next. - AFI->setVarArgsRegSaveSize(VARegSaveSize); - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, - ArgOffset + VARegSaveSize - - VARegSize, - false)); - SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), - getPointerTy()); + (GPRArgRegs, array_lengthof(GPRArgRegs)); + lastRegToSaveIndex = 4; + } + + unsigned ArgRegsSize, ArgRegsSaveSize; + computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize, + ArgRegsSize, ArgRegsSaveSize); + + // Store any by-val regs to their spots on the stack so that they may be + // loaded by deferencing the result of formal parameter pointer or va_next. + // Note: once stack area for byval/varargs registers + // was initialized, it can't be initialized again. + if (ArgRegsSaveSize) { + unsigned Padding = ArgRegsSaveSize - ArgRegsSize; + + if (Padding) { + assert(AFI->getStoredByValParamsPadding() == 0 && + "The only parameter may be padded."); + AFI->setStoredByValParamsPadding(Padding); + } + + int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize, + Padding + + ByValStoreOffset - + (int64_t)TotalArgRegsSaveSize, + false); + SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); + if (Padding) { + MFI->CreateFixedObject(Padding, + ArgOffset + ByValStoreOffset - + (int64_t)ArgRegsSaveSize, + false); + } SmallVector MemOps; - for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) { + for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex; + ++firstRegToSaveIndex, ++i) { const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) RC = &ARM::tGPRRegClass; @@ -2653,13 +2848,45 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); } + + AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize()); + if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], MemOps.size()); - } else + return FrameIndex; + } else { + if (ArgSize == 0) { + // We cannot allocate a zero-byte object for the first variadic argument, + // so just make up a size. 
+ ArgSize = 4; + } // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex( - MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); + return MFI->CreateFixedObject( + ArgSize, ArgOffset, !ForceMutable); + } +} + +// Setup stack frame, the va_list pointer will start from. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + SDLoc dl, SDValue &Chain, + unsigned ArgOffset, + unsigned TotalArgRegsSaveSize, + bool ForceMutable) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo(); + + // Try to store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing + // the result of va_next. + // If there is no regs to be stored, just point address after last + // argument passed via stack. + int FrameIndex = + StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(), + 0, ArgOffset, 0, ForceMutable, 0, TotalArgRegsSaveSize); + + AFI->setVarArgsFrameIndex(FrameIndex); } SDValue @@ -2667,7 +2894,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -2682,12 +2909,63 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); - + SmallVector ArgValues; int lastInsIndex = -1; SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; + + // Initially ArgRegsSaveSize is zero. + // Then we increase this value each time we meet byval parameter. + // We also increase this value in case of varargs function. + AFI->setArgRegsSaveSize(0); + + unsigned ByValStoreOffset = 0; + unsigned TotalArgRegsSaveSize = 0; + unsigned ArgRegsSaveSizeMaxAlign = 4; + + // Calculate the amount of stack space that we need to allocate to store + // byval and variadic arguments that are passed in registers. + // We need to know this before we allocate the first byval or variadic + // argument, as they will be allocated a stack slot below the CFA (Canonical + // Frame Address, the stack pointer at entry to the function). + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isMemLoc()) { + int index = VA.getValNo(); + if (index != lastInsIndex) { + ISD::ArgFlagsTy Flags = Ins[index].Flags; + if (Flags.isByVal()) { + unsigned ExtraArgRegsSize; + unsigned ExtraArgRegsSaveSize; + computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(), + Flags.getByValSize(), + ExtraArgRegsSize, ExtraArgRegsSaveSize); + + TotalArgRegsSaveSize += ExtraArgRegsSaveSize; + if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign) + ArgRegsSaveSizeMaxAlign = Flags.getByValAlign(); + CCInfo.nextInRegsParam(); + } + lastInsIndex = index; + } + } + } + CCInfo.rewindByValRegsInfo(); + lastInsIndex = -1; + if (isVarArg) { + unsigned ExtraArgRegsSize; + unsigned ExtraArgRegsSaveSize; + computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0, + ExtraArgRegsSize, ExtraArgRegsSaveSize); + TotalArgRegsSaveSize += ExtraArgRegsSaveSize; + } + // If the arg regs save area contains N-byte aligned values, the + // bottom of it must be at least N-byte aligned. 
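As a worked example of the register/stack split these sizes describe (AAPCS; the types are hypothetical, and r0/r1 are assumed taken by earlier arguments):

    struct Payload { int v[5]; }; // 20-byte aggregate passed byval

    int consume(int a, int b, Payload p);
    // a -> r0, b -> r1; p is split: v[0] and v[1] travel in r2 and r3,
    // v[2..4] (12 bytes) go on the stack. The prologue stores r2/r3 into
    // the save area directly below the incoming stack arguments so the
    // callee sees one contiguous 20-byte object.

The RoundUpToAlignment call on the next line enforces exactly this alignment requirement on the combined save area.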
+ TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign); + TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx); @@ -2785,23 +3063,26 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - ARMFunctionInfo *AFI = MF.getInfo(); - if (!AFI->getVarArgsFrameIndex()) { - VarArgStyleRegisters(CCInfo, DAG, - dl, Chain, CurOrigArg, - Ins[VA.getValNo()].PartOffset, - VA.getLocMemOffset(), - true /*force mutable frames*/); - int VAFrameIndex = AFI->getVarArgsFrameIndex(); - InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy())); - } else { - int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), false); - InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); - } + unsigned CurByValIndex = CCInfo.getInRegsParamsProceed(); + + ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign()); + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, CurOrigArg, + CurByValIndex, + Ins[VA.getValNo()].PartOffset, + VA.getLocMemOffset(), + Flags.getByValSize(), + true /*force mutable frames*/, + ByValStoreOffset, + TotalArgRegsSaveSize); + ByValStoreOffset += Flags.getByValSize(); + ByValStoreOffset = std::min(ByValStoreOffset, 16U); + InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); + CCInfo.nextInRegsParam(); } else { + unsigned FIOffset = VA.getLocMemOffset(); int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, - VA.getLocMemOffset(), true); + FIOffset, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -2816,8 +3097,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // varargs if (isVarArg) - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0, - CCInfo.getNextStackOffset()); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, + CCInfo.getNextStackOffset(), + TotalArgRegsSaveSize); return Chain; } @@ -2843,7 +3125,7 @@ static bool isFloatingPointZero(SDValue Op) { SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate(C)) { @@ -2901,7 +3183,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) const { + SDLoc dl) const { SDValue Cmp; if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); @@ -2915,7 +3197,7 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDValue ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { unsigned Opc = Cmp.getOpcode(); - DebugLoc DL = Cmp.getDebugLoc(); + SDLoc DL(Cmp); if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); @@ -2935,7 +3217,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Convert: // @@ -2983,6 +3265,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) { + if (CC == ISD::SETNE) + return ISD::SETEQ; + return ISD::getSetCCInverse(CC, true); +} + +static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + bool &swpCmpOps, bool &swpVselOps) { + // Start by selecting the GE condition code for opcodes that return true for + // 'equality' + if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || + CC == ISD::SETULE) + CondCode = ARMCC::GE; + + // and GT for opcodes that return false for 'equality'. + else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || + CC == ISD::SETULT) + CondCode = ARMCC::GT; + + // Since we are constrained to GE/GT, if the opcode contains 'less', we need + // to swap the compare operands. + if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || + CC == ISD::SETULT) + swpCmpOps = true; + + // Both GT and GE are ordered comparisons, and return false for 'unordered'. + // If we have an unordered opcode, we need to swap the operands to the VSEL + // instruction (effectively negating the condition). + // + // This also has the effect of swapping which one of 'less' or 'greater' + // returns true, so we also swap the compare operands. It also switches + // whether we return true for 'equality', so we compensate by picking the + // opposite condition code to our original choice. + if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETUGT) { + swpCmpOps = !swpCmpOps; + swpVselOps = !swpVselOps; + CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; + } + + // 'ordered' is 'anything but unordered', so use the VS condition code and + // swap the VSEL operands. + if (CC == ISD::SETO) { + CondCode = ARMCC::VS; + swpVselOps = true; + } + + // 'unordered or not equal' is 'anything but equal', so use the EQ condition + // code and swap the VSEL operands. + if (CC == ISD::SETUNE) { + CondCode = ARMCC::EQ; + swpVselOps = true; + } +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); @@ -2990,18 +3327,69 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (LHS.getValueType() == MVT::i32) { + // Try to generate VSEL on ARMv8. 
+ // The VSEL instruction can't use all the usual ARM condition + // codes: it only has two bits to select the condition code, so it's + // constrained to use only GE, GT, VS and EQ. + // + // To implement all the various ISD::SETXXX opcodes, we sometimes need to + // swap the operands of the previous compare instruction (effectively + // inverting the compare condition, swapping 'less' and 'greater') and + // sometimes need to swap the operands to the VSEL (which inverts the + // condition in the sense of firing whenever the previous condition didn't) + if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || + CondCode == ARMCC::VC || CondCode == ARMCC::NE) { + CC = getInverseCCForVSEL(CC); + std::swap(TrueVal, FalseVal); + } + } + SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); - return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp); + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, + Cmp); } ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); + // Try to generate VSEL on ARMv8. + if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + // We can select VMAXNM/VMINNM from a compare followed by a select with the + // same operands, as follows: + // c = fcmp [ogt, olt, ugt, ult] a, b + // select c, a, b + // We only do this in unsafe-fp-math, because signed zeros and NaNs are + // handled differently than the original code sequence. + if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal && + RHS == FalseVal) { + if (CC == ISD::SETOGT || CC == ISD::SETUGT) + return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal); + if (CC == ISD::SETOLT || CC == ISD::SETULT) + return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal); + } + + bool swpCmpOps = false; + bool swpVselOps = false; + checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); + + if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || + CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { + if (swpCmpOps) + std::swap(LHS, RHS); + if (swpVselOps) + std::swap(TrueVal, FalseVal); + } + } + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); @@ -3045,7 +3433,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { return DAG.getConstant(0, MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) - return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), Ld->getAlignment()); @@ -3063,7 +3451,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); - RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3071,9 +3459,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); - SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(), + SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op), PtrType, Ptr, DAG.getConstant(4, PtrType)); - 
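
// ---------------------------------------------------------------------
// Aside: an illustrative scalar model (not part of this patch) of the
// VSEL constraint solving above. VSEL can only encode GE, GT, EQ and
// VS, so every other condition is rewritten by swapping the compare
// operands, the select operands, or both. Worked example for SETULT
// ("unordered or less-than"):
//   'less'      -> prefer GT and swap the compare operands;
//   'unordered' -> swap the compare operands back, flip GT to GE, and
//                  swap the select operands instead.
// Net effect: select (setult a, b), t, f  ==  vselge f, t  after
// cmp a, b, since !(a >= b) is exactly ult for IEEE floats.
static float modelVSELForSETULT(float a, float b, float t, float f) {
  bool ge = (a >= b);   // the GE condition VSEL actually tests
  return ge ? f : t;    // select operands were swapped, so GE picks f
}
// ---------------------------------------------------------------------
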
RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3093,7 +3481,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); bool LHSSeenZero = false; bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); @@ -3143,7 +3531,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; @@ -3184,7 +3572,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PTy = getPointerTy(); JumpTableSDNode *JT = cast(Table); @@ -3220,7 +3608,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getValueType().getVectorElementType() == MVT::i32) { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) @@ -3242,7 +3630,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Opc; switch (Op.getOpcode()) { @@ -3260,7 +3648,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { if (VT.getVectorElementType() == MVT::f32) @@ -3296,7 +3684,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Opc; switch (Op.getOpcode()) { @@ -3317,7 +3705,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. 
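
// ---------------------------------------------------------------------
// Aside: a host-side sketch (not part of this patch) of what
// expandf64Toi32 does to a loaded f64: the same memory is re-read as
// two i32 loads, the second at byte offset 4 with its alignment
// clamped by MinAlign(OrigAlign, 4). Word order shown is little-endian.
#include <cstdint>
#include <cstring>

static void splitDoubleWords(const double *P, uint32_t &Lo, uint32_t &Hi) {
  uint32_t Words[2];
  std::memcpy(Words, P, sizeof Words); // two 32-bit loads, offsets 0 and 4
  Lo = Words[0];
  Hi = Words[1];
}
// ---------------------------------------------------------------------
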
SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || @@ -3400,8 +3788,11 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -3421,9 +3812,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin()) + unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetMachO()) ? ARM::R7 : ARM::R11; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) @@ -3440,7 +3831,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { /// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue Op = N->getOperand(0); // This function is only supposed to be called for i64 types, either as the @@ -3477,7 +3868,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. -static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); @@ -3493,7 +3884,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); @@ -3529,7 +3920,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); @@ -3562,7 +3953,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. 
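
// ---------------------------------------------------------------------
// Aside (illustrative, not from this patch): checking the FLT_ROUNDS
// mapping above. The comment's "(((FPSCR + 1 << 22) >> 22) & 3)" is
// meant to read FPSCR + (1 << 22): adding one to the RMode field in
// FPSCR bits 23:22 modulo 4 turns ARM's {RN=0, RP=1, RM=2, RZ=3} into
// C99 FLT_ROUNDS' {1, 2, 3, 0}.
#include <cassert>
#include <cstdint>

static unsigned fltRoundsFromFPSCR(uint32_t FPSCR) {
  return ((FPSCR + (1u << 22)) >> 22) & 3;
}

static void checkFltRoundsMapping() {
  for (uint32_t RMode = 0; RMode != 4; ++RMode)
    assert(fltRoundsFromFPSCR(RMode << 22) == ((RMode + 1) & 3));
}
// ---------------------------------------------------------------------
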
- DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, DAG.getConstant(Intrinsic::arm_get_fpscr, MVT::i32)); @@ -3577,7 +3968,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); if (!ST->hasV6T2Ops()) return SDValue(); @@ -3594,14 +3985,14 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) -/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] +/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] /// [b0 b1 b2 b3 b4 b5 b6 b7] /// +[b1 b0 b3 b2 b5 b4 b7 b6] /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); @@ -3615,7 +4006,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { /// bit-count for each 16-bit element from the operand. We need slightly /// different sequencing for v4i16 and v8i16 to stay within NEON's available /// 64/128-bit registers. -/// +/// /// Trace for v4i16: /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) @@ -3623,7 +4014,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { /// v4i16:Extracted = [k0 k1 k2 k3 ] static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue BitCounts = getCTPOP16BitCounts(N, DAG); if (VT.is64BitVector()) { @@ -3646,7 +4037,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { /// input = [v0 v1 ] (vi: 32-bit elements) /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) -/// vrev: N0 = [k1 k0 k3 k2 ] +/// vrev: N0 = [k1 k0 k3 k2 ] /// [k0 k1 k2 k3 ] /// N1 =+[k1 k0 k3 k2 ] /// [k0 k2 k1 k3 ] @@ -3658,7 +4049,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { /// static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT16Bit = VT.is64BitVector() ? 
MVT::v4i16 : MVT::v8i16; @@ -3697,7 +4088,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); if (!VT.isVector()) return SDValue(); @@ -3732,7 +4123,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) @@ -3778,7 +4169,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue CC = Op.getOperand(2); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getOperand(1).getValueType().isFloatingPoint()) { switch (SetCCOpcode) { @@ -3981,7 +4372,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, // Value = 0x0000nnff: Op=x, Cmode=1100. OpCmode = 0xc; Imm = SplatBits >> 8; - SplatBits |= 0xff; break; } @@ -3990,7 +4380,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, // Value = 0x00nnffff: Op=x, Cmode=1101. OpCmode = 0xd; Imm = SplatBits >> 16; - SplatBits |= 0xffff; break; } @@ -4021,7 +4410,6 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, } // Op=1, Cmode=1110. OpCmode = 0x1e; - SplatBits = Val; VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } @@ -4036,18 +4424,26 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { - if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16()) + if (!ST->hasVFP3()) return SDValue(); + bool IsDouble = Op.getValueType() == MVT::f64; ConstantFPSDNode *CFP = cast(Op); - assert(Op.getValueType() == MVT::f32 && - "ConstantFP custom lowering should only occur for f32."); // Try splatting with a VMOV.f32... APFloat FPVal = CFP->getValueAPF(); - int ImmVal = ARM_AM::getFP32Imm(FPVal); + int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); + if (ImmVal != -1) { - DebugLoc DL = Op.getDebugLoc(); + if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { + // We have code in place to select a valid ConstantFP already, no need to + // do any mangling. + return Op; + } + + // It's a float and we are trying to use NEON operations where + // possible. Lower it to a splat followed by an extract. + SDLoc DL(Op); SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); @@ -4055,15 +4451,31 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, DAG.getConstant(0, MVT::i32)); } - // If that fails, try a VMOV.i32 + // The rest of our options are NEON only, make sure that's allowed before + // proceeding.. + if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) + return SDValue(); + EVT VMovVT; - unsigned iVal = FPVal.bitcastToAPInt().getZExtValue(); - SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false, - VMOVModImm); + uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); + + // It wouldn't really be worth bothering for doubles except for one very + // important value, which does happen to match: 0.0. So make sure we don't do + // anything stupid. 
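
// ---------------------------------------------------------------------
// Aside: a minimal sketch (not from this patch) of the guard that
// follows. An f64 can only be materialized from a 32-bit vector splat
// (VMOV.i32 / VMVN.i32) when both halves of its bit pattern agree,
// which conveniently covers the one double that really matters: +0.0.
#include <cstdint>
#include <cstring>

static bool f64HalvesMatch(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof Bits);
  return (Bits & 0xffffffffu) == (Bits >> 32); // true for +0.0
}
// ---------------------------------------------------------------------
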
+ if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) + return SDValue(); + + // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). + SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT, + false, VMOVModImm); if (NewVal != SDValue()) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, NewVal); + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, @@ -4071,11 +4483,16 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false, - VMVNModImm); + NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT, + false, VMVNModImm); if (NewVal != SDValue()) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); + + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, @@ -4334,7 +4751,7 @@ static bool isReverseMask(ArrayRef M, EVT VT) { // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, - const ARMSubtarget *ST, DebugLoc dl) { + const ARMSubtarget *ST, SDLoc dl) { uint64_t Val; if (!isa(N)) return SDValue(); @@ -4355,7 +4772,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { BuildVectorSDNode *BVN = cast(Op.getNode()); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); APInt SplatBits, SplatUndef; @@ -4424,7 +4841,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, ValueCounts.insert(std::make_pair(V, 0)); unsigned &Count = ValueCounts[V]; - + // Is this value dominant? (takes up more than half of the lanes) if (++Count > (NumElts / 2)) { hasDominantValue = true; @@ -4439,7 +4856,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); - if (isOnlyLowElement) + // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. + // Keep going if we are hitting this case. + if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); @@ -4452,8 +4871,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we are VDUPing a value that comes directly from a vector, that will // cause an unnecessary move to and from a GPR, where instead we could - // just use VDUPLANE. - if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // just use VDUPLANE. We can only do this if the lane being extracted + // is at a constant index, as the VDUP from lane instructions only have + // constant-index forms. 
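
// ---------------------------------------------------------------------
// Aside: the dominant-value scan in LowerBUILD_VECTOR above, reduced to
// plain integers (illustrative only). A value occupying more than half
// the lanes is worth a single VDUP, with the stragglers patched in via
// INSERT_VECTOR_ELT afterwards.
#include <map>
#include <vector>

static bool findDominantLaneValue(const std::vector<int> &Lanes, int &Out) {
  std::map<int, unsigned> Counts;
  for (int V : Lanes)
    if (++Counts[V] > Lanes.size() / 2) { // strictly more than half
      Out = V;
      return true;
    }
  return false;
}
// ---------------------------------------------------------------------
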
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa(Value->getOperand(1))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element @@ -4468,12 +4890,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), Value, DAG.getConstant(index, MVT::i32)), DAG.getConstant(index, MVT::i32)); - } else { + } else N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, Value->getOperand(0), Value->getOperand(1)); - } - } - else + } else N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); if (!usesOnlyOneValue) { @@ -4505,7 +4925,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (usesOnlyOneValue) { SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); if (isConstant && Val.getNode()) - return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } } @@ -4537,6 +4957,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + return SDValue(); } @@ -4544,7 +4982,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -4733,7 +5171,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); @@ -4813,7 +5251,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, // Check to see if we can use the VTBL instruction. 
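
// ---------------------------------------------------------------------
// Aside: the shape of the new lane-by-lane BUILD_VECTOR fallback above,
// modelled on scalars (illustrative only). Starting from an undef
// vector, each defined operand is inserted at its constant lane index;
// undef lanes are skipped, which is still cheaper than the generic
// store-to-stack-and-reload expansion.
static void buildVectorByInsertion(const int *Ops, const bool *IsUndef,
                                   int *Vec, unsigned NumElts) {
  for (unsigned i = 0; i != NumElts; ++i)
    if (!IsUndef[i])      // ISD::UNDEF operands contribute nothing
      Vec[i] = Ops[i];    // one INSERT_VECTOR_ELT per defined lane
}
// ---------------------------------------------------------------------
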
SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SmallVector VTBLMask; for (ArrayRef::iterator @@ -4832,7 +5270,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG) { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue OpLHS = Op.getOperand(0); EVT VT = OpLHS.getValueType(); @@ -4850,7 +5288,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); @@ -5014,7 +5452,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue Vec = Op.getOperand(0); if (Op.getValueType() == MVT::i32 && Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); } @@ -5026,7 +5464,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { // two 64-bit vectors are concatenated to a 128-bit vector. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && "unexpected CONCAT_VECTORS"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -5115,6 +5553,23 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return false; } +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. @@ -5130,19 +5585,9 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. - MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy; - EVT NewVT; - switch (OrigSimpleTy) { - default: llvm_unreachable("Unexpected Orig Vector Type"); - case MVT::v2i8: - case MVT::v2i16: - NewVT = MVT::v2i32; - break; - case MVT::v4i8: - NewVT = MVT::v4i16; - break; - } - return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); + EVT NewVT = getExtensionTo64Bits(OrigTy); + + return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } /// SkipLoadExtensionForVMULL - return a load of the original vector size that @@ -5151,22 +5596,22 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, /// reach a total size of 64 bits. We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { - SDValue NonExtendingLoad = - DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), + EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); + + // The load already has the right type. 
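
// ---------------------------------------------------------------------
// Aside (illustrative): the widening rule getExtensionTo64Bits encodes
// above keeps the lane count and grows the element type until the whole
// vector fills a 64-bit D register, as VMULL requires: v2i8 and v2i16
// both become v2i32, v4i8 becomes v4i16, and types already >= 64 bits
// are returned unchanged.
static unsigned widenedElemBits(unsigned NumElts, unsigned ElemBits) {
  unsigned Total = NumElts * ElemBits;
  return Total >= 64 ? ElemBits      // already fills a D register
                     : 64 / NumElts; // grow elements to reach 64 bits
}
// ---------------------------------------------------------------------
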
+ if (ExtendedTy == LD->getMemoryVT()) + return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); - unsigned ExtOp = 0; - switch (LD->getExtensionType()) { - default: llvm_unreachable("Unexpected LoadExtType"); - case ISD::EXTLOAD: - case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break; - } - MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy; - MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy; - return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG, - MemType, ExtType, ExtOp); + + // We need to create a zextload/sextload. We cannot just create a load + // followed by a zext/zext node because LowerMUL is also run during normal + // operation legalization where we can't create illegal types. + return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, + LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), + LD->getMemoryVT(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, @@ -5192,7 +5637,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; - return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } // Construct a new BUILD_VECTOR with elements truncated to half the size. @@ -5209,7 +5654,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); } @@ -5281,7 +5726,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { } // Legalize to a VMULL instruction. - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Op0; SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { @@ -5311,7 +5756,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { } static SDValue -LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { +LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -5340,7 +5785,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { } static SDValue -LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { +LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { SDValue N2; // Convert to float. 
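
// ---------------------------------------------------------------------
// Aside: a rough scalar model (not part of this patch) of the NEON
// division trick used by LowerSDIV_v4i8/_v4i16: convert to f32,
// estimate the reciprocal (vrecpe), refine it with Newton-Raphson
// steps (vrecps computes 2 - x*y), multiply, and truncate back. The
// real lowering sizes the number of refinement steps to the element
// range, so treat this only as the general shape.
static int approxDivViaRecip(int X, int Y) {
  float XF = (float)X, YF = (float)Y;
  float R = 1.0f / YF;          // stands in for the vrecpe estimate
  R = R * (2.0f - YF * R);      // one vrecps refinement step
  R = R * (2.0f - YF * R);      // second step, as in the v4i16 path
  return (int)(XF * R);         // vcvt back to integer, truncating
}
// ---------------------------------------------------------------------
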
// float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -5381,7 +5826,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; @@ -5416,7 +5861,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; @@ -5500,63 +5945,160 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { } if (!ExtraOp) - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } +SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin()); + + // For iOS, we want to call an alternative entry point: __sincos_stret, + // return values are passed via sret. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Pair of floats / doubles used to pass the result. + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + + // Create stack object for sret. + const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); + const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = (ArgVT == MVT::f64) + ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, + CallingConv::C, /*isTaillCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed*/false, + Callee, Args, DAG, dl); + std::pair CallResult = LowerCallTo(CLI); + + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, + MachinePointerInfo(), false, false, false, 0); + + // Address of cos field. 
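
// ---------------------------------------------------------------------
// Aside: a C-level picture (illustrative; the struct name is made up)
// of the call LowerFSINCOS builds here. The ABI point is that
// {sin, cos} come back through the sret stack slot and are then loaded
// from offsets 0 and sizeof(double).
struct SinCosD { double Sin, Cos; };       // hypothetical layout helper
extern "C" SinCosD __sincos_stret(double);

static void sincosViaStret(double X, double &S, double &C) {
  SinCosD R = __sincos_stret(X); // callee fills the sret slot
  S = R.Sin;                     // load at offset 0
  C = R.Cos;                     // load at offset sizeof(double)
}
// ---------------------------------------------------------------------
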
+ SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize())); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + MachinePointerInfo(), false, false, false, 0); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast(Op)->getOrdering() <= Monotonic) return Op; - // Aquire/Release load/store is not legal for targets without a + // Acquire/Release load/store is not legal for targets without a // dmb or equivalent available. return SDValue(); } - static void ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl& Results, - SelectionDAG &DAG, unsigned NewOp) { - DebugLoc dl = Node->getDebugLoc(); + SelectionDAG &DAG) { + SDLoc dl(Node); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); + AtomicSDNode *AN = cast(Node); SmallVector Ops; Ops.push_back(Node->getOperand(0)); // Chain Ops.push_back(Node->getOperand(1)); // Ptr - // Low part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0))); - // High part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1))); - if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) { - // High part of Val1 + for(unsigned i=2; igetNumOperands(); i++) { + // Low part Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(3), DAG.getIntPtrConstant(0))); - // High part of Val2 + Node->getOperand(i), DAG.getIntPtrConstant(0))); + // High part Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(3), DAG.getIntPtrConstant(1))); + Node->getOperand(i), DAG.getIntPtrConstant(1))); } SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64, - cast(Node)->getMemOperand()); + SDValue Result = DAG.getAtomic( + Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(), + cast(Node)->getMemOperand(), AN->getSuccessOrdering(), + AN->getFailureOrdering(), AN->getSynchScope()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) }; Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); Results.push_back(Result.getValue(2)); } +static void ReplaceREADCYCLECOUNTER(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + SDLoc DL(N); + SDValue Cycles32, OutChain; + + if (Subtarget->hasPerfMon()) { + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, , c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, MVT::i32), + DAG.getConstant(15, MVT::i32), + DAG.getConstant(0, MVT::i32), + DAG.getConstant(9, MVT::i32), + DAG.getConstant(13, MVT::i32), + DAG.getConstant(0, MVT::i32) + }; + + Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), &Ops[0], + array_lengthof(Ops)); + OutChain = Cycles32.getValue(1); + } else { + // Intrinsic is defined to return 0 on unsupported platforms. Technically + // there are older ARM CPUs that have implementation-specific ways of + // obtaining this information (FIXME!). 
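
// ---------------------------------------------------------------------
// Aside: a rough user-space equivalent (not part of this patch) of the
// READCYCLECOUNTER expansion above. Reading PMCCNTR from user mode only
// works if the kernel has enabled user access to the performance
// counters; the intrinsic's documented fallback is the same constant 0.
#include <cstdint>

static uint64_t readCycleCounter() {
#if defined(__arm__)
  uint32_t Cycles32;
  asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(Cycles32));
  return Cycles32;   // BUILD_PAIR with a zero high word
#else
  return 0;          // unsupported: defined to return 0
#endif
}
// ---------------------------------------------------------------------
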
+ Cycles32 = DAG.getConstant(0, MVT::i32); + OutChain = DAG.getEntryNode(); + } + + + SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, + Cycles32, DAG.getConstant(0, MVT::i32)); + Results.push_back(Cycles64); + Results.push_back(OutChain); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: - return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) : + return Subtarget->isTargetMachO() ? LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); @@ -5564,7 +6106,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: @@ -5605,6 +6146,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::SDIVREM: + case ISD::UDIVREM: return LowerDivRem(Op, DAG); } } @@ -5624,41 +6168,24 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; - case ISD::ATOMIC_LOAD_ADD: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG); + case ISD::READCYCLECOUNTER: + ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_AND: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG); - return; case ISD::ATOMIC_LOAD_NAND: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG); - return; case ISD::ATOMIC_LOAD_OR: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG); - return; case ISD::ATOMIC_LOAD_SUB: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG); - return; case ISD::ATOMIC_LOAD_XOR: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG); - return; case ISD::ATOMIC_SWAP: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG); - return; case ISD::ATOMIC_CMP_SWAP: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG); - return; case ISD::ATOMIC_LOAD_MIN: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG); - return; case ISD::ATOMIC_LOAD_UMIN: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG); - return; case ISD::ATOMIC_LOAD_MAX: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG); - return; case ISD::ATOMIC_LOAD_UMAX: - ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG); + ReplaceATOMIC_OP_64(N, Results, DAG); return; } if (Res.getNode()) @@ -5678,6 +6205,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, unsigned oldval = MI->getOperand(2).getReg(); unsigned newval = MI->getOperand(3).getReg(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + AtomicOrdering Ord = 
static_cast(MI->getOperand(4).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -5693,21 +6221,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, } unsigned ldrOpc, strOpc; - switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); - case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; - break; - case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; - break; - case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; - break; - } + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -5723,8 +6237,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: @@ -5787,6 +6300,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned dest = MI->getOperand(0).getReg(); unsigned ptr = MI->getOperand(1).getReg(); unsigned incr = MI->getOperand(2).getReg(); + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -5794,24 +6308,11 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, if (isThumb2) { MRI.constrainRegClass(dest, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(incr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc; - switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); - case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; - break; - case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; - break; - case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; - break; - } + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -5820,8 +6321,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); const TargetRegisterClass *TRC = isThumb2 ? 
@@ -5895,6 +6395,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, unsigned ptr = MI->getOperand(1).getReg(); unsigned incr = MI->getOperand(2).getReg(); unsigned oldval = dest; + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -5902,24 +6403,20 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, if (isThumb2) { MRI.constrainRegClass(dest, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(incr, &ARM::rGPRRegClass); } unsigned ldrOpc, strOpc, extendOpc; + getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc); switch (Size) { - default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!"); case 1: - ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; - strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB; break; case 2: - ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; - strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH; break; case 4: - ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; - strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; extendOpc = 0; break; } @@ -5931,8 +6428,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); const TargetRegisterClass *TRC = isThumb2 ? @@ -5963,7 +6459,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // Sign extend the value, if necessary. if (signExtend && extendOpc) { - oldval = MRI.createVirtualRegister(&ARM::GPRRegClass); + oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass + : &ARM::GPRnopcRegClass); + if (!isThumb2) + MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass); AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval) .addReg(dest) .addImm(0)); @@ -6001,7 +6500,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, unsigned Op1, unsigned Op2, bool NeedsCarry, bool IsCmpxchg, bool IsMinMax, ARMCC::CondCodes CC) const { - // This also handles ATOMIC_SWAP, indicated by Op1==0. + // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0. const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -6014,6 +6513,8 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, unsigned ptr = MI->getOperand(2).getReg(); unsigned vallo = MI->getOperand(3).getReg(); unsigned valhi = MI->getOperand(4).getReg(); + AtomicOrdering Ord = + static_cast(MI->getOperand(IsCmpxchg ? 
7 : 5).getImm()); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -6022,8 +6523,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + MRI.constrainRegClass(vallo, &ARM::rGPRRegClass); + MRI.constrainRegClass(valhi, &ARM::rGPRRegClass); } + unsigned ldrOpc, strOpc; + getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc); + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *contBB = 0, *cont2BB = 0; if (IsCmpxchg || IsMinMax) @@ -6039,8 +6545,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); const TargetRegisterClass *TRC = isThumb2 ? @@ -6065,19 +6570,20 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Load if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD)) - .addReg(destlo, RegState::Define) - .addReg(desthi, RegState::Define) - .addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) + .addReg(destlo, RegState::Define) + .addReg(desthi, RegState::Define) + .addReg(ptr)); } else { unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD)) - .addReg(GPRPair0, RegState::Define).addReg(ptr)); + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) + .addReg(GPRPair0, RegState::Define) + .addReg(ptr)); // Copy r2/r3 into dest. (This copy will normally be coalesced.) BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) - .addReg(GPRPair0, 0, ARM::gsub_0); + .addReg(GPRPair0, 0, ARM::gsub_0); BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) - .addReg(GPRPair0, 0, ARM::gsub_1); + .addReg(GPRPair0, 0, ARM::gsub_1); } unsigned StoreLo, StoreHi; @@ -6129,7 +6635,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Store if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess) + MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass); + MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) .addReg(StoreLo).addReg(StoreHi).addReg(ptr)); } else { // Marshal a pair... 
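
// ---------------------------------------------------------------------
// Aside: the control shape EmitAtomicBinary64 emits (illustrative).
// Each 64-bit atomic RMW becomes a retry loop around the exclusive
// pair instructions -- roughly:
//
//   loop: ldrexd rLo, rHi, [ptr]     ; exclusive load of both halves
//         <op>                       ; ADD/ADC, SUB/SBC, AND, ORR, ...
//         strexd ok, sLo, sHi, [ptr] ; try to publish the new value
//         cmp ok, #0 ; bne loop      ; lost the reservation -> retry
//
// and in ARM mode the halves must live in an even/odd GPRPair, hence
// the marshalling copies. The same retry structure, modelled with a
// CAS standing in for the ldrexd/strexd reservation:
#include <atomic>
#include <cstdint>

static uint64_t atomicAdd64(std::atomic<uint64_t> &V, uint64_t Inc) {
  uint64_t Old = V.load();                         // "ldrexd"
  while (!V.compare_exchange_weak(Old, Old + Inc)) // "strexd" + retry
    ;                                              // Old is refreshed
  return Old;
}
// ---------------------------------------------------------------------
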
@@ -6147,7 +6655,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, .addImm(ARM::gsub_1); // ...and store it - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess) + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) .addReg(StorePair).addReg(ptr)); } // Cmp+jump @@ -6168,6 +6676,51 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, return BB; } +MachineBasicBlock * +ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI, MachineBasicBlock *BB) const { + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + unsigned destlo = MI->getOperand(0).getReg(); + unsigned desthi = MI->getOperand(1).getReg(); + unsigned ptr = MI->getOperand(2).getReg(); + AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(destlo, &ARM::rGPRRegClass); + MRI.constrainRegClass(desthi, &ARM::rGPRRegClass); + MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); + } + unsigned ldrOpc, strOpc; + getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc); + + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc)); + + if (isThumb2) { + MIB.addReg(destlo, RegState::Define) + .addReg(desthi, RegState::Define) + .addReg(ptr); + + } else { + unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + MIB.addReg(GPRPair0, RegState::Define).addReg(ptr); + + // Copy GPRPair0 into dest. (This copy will normally be coalesced.) + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); + } + AddDefaultPred(MIB); + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering:: @@ -6699,8 +7252,109 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { llvm_unreachable("Expecting a BB with two successors!"); } -MachineBasicBlock *ARMTargetLowering:: -EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { +/// Return the load opcode for a given load size. If load size >= 8, +/// neon opcode will be returned. +static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { + if (LdSize >= 8) + return LdSize == 16 ? ARM::VLD1q32wb_fixed + : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; + if (IsThumb1) + return LdSize == 4 ? ARM::tLDRi + : LdSize == 2 ? ARM::tLDRHi + : LdSize == 1 ? ARM::tLDRBi : 0; + if (IsThumb2) + return LdSize == 4 ? ARM::t2LDR_POST + : LdSize == 2 ? ARM::t2LDRH_POST + : LdSize == 1 ? ARM::t2LDRB_POST : 0; + return LdSize == 4 ? ARM::LDR_POST_IMM + : LdSize == 2 ? ARM::LDRH_POST + : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; +} + +/// Return the store opcode for a given store size. If store size >= 8, +/// neon opcode will be returned. +static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { + if (StSize >= 8) + return StSize == 16 ? ARM::VST1q32wb_fixed + : StSize == 8 ? ARM::VST1d32wb_fixed : 0; + if (IsThumb1) + return StSize == 4 ? ARM::tSTRi + : StSize == 2 ? ARM::tSTRHi + : StSize == 1 ? ARM::tSTRBi : 0; + if (IsThumb2) + return StSize == 4 ? ARM::t2STR_POST + : StSize == 2 ? ARM::t2STRH_POST + : StSize == 1 ? ARM::t2STRB_POST : 0; + return StSize == 4 ? 
ARM::STR_POST_IMM + : StSize == 2 ? ARM::STRH_POST + : StSize == 1 ? ARM::STRB_POST_IMM : 0; +} + +/// Emit a post-increment load operation with given size. The instructions +/// will be added to BB at Pos. +static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned LdSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); + assert(LdOpc != 0 && "Should have a load opcode"); + if (LdSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(0)); + } else if (IsThumb1) { + // load + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(LdSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(LdSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addReg(0).addImm(LdSize)); + } +} + +/// Emit a post-increment store operation with given size. The instructions +/// will be added to BB at Pos. +static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned StSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); + assert(StOpc != 0 && "Should have a store opcode"); + if (StSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(AddrIn).addImm(0).addReg(Data)); + } else if (IsThumb1) { + // store + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(StSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addImm(StSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addReg(0) + .addImm(StSize)); + } +} + +MachineBasicBlock * +ARMTargetLowering::EmitStructByval(MachineInstr *MI, + MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. @@ -6715,23 +7369,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { unsigned Align = MI->getOperand(3).getImm(); DebugLoc dl = MI->getDebugLoc(); - bool isThumb2 = Subtarget->isThumb2(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned ldrOpc, strOpc, UnitSize = 0; + unsigned UnitSize = 0; + const TargetRegisterClass *TRC = 0; + const TargetRegisterClass *VecTRC = 0; - const TargetRegisterClass *TRC = isThumb2 ? 
- (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; - const TargetRegisterClass *TRC_Vec = 0; + bool IsThumb1 = Subtarget->isThumb1Only(); + bool IsThumb2 = Subtarget->isThumb2(); if (Align & 1) { - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; UnitSize = 1; } else if (Align & 2) { - ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST; - strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST; UnitSize = 2; } else { // Check whether we can use NEON instructions. @@ -6739,27 +7388,27 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Align % 16 == 0) && SizeVal >= 16) { - ldrOpc = ARM::VLD1q32wb_fixed; - strOpc = ARM::VST1q32wb_fixed; + if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; - TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass; - } - else if ((Align % 8 == 0) && SizeVal >= 8) { - ldrOpc = ARM::VLD1d32wb_fixed; - strOpc = ARM::VST1d32wb_fixed; + else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; - TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass; - } } // Can't use NEON instructions. - if (UnitSize == 0) { - ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; - strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM; + if (UnitSize == 0) UnitSize = 4; - } } + // Select the correct opcode and register class for unit size load/store + bool IsNeon = UnitSize >= 8; + TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass + : (const TargetRegisterClass *)&ARM::GPRRegClass; + if (IsNeon) + VecTRC = UnitSize == 16 + ? (const TargetRegisterClass *)&ARM::DPairRegClass + : UnitSize == 8 + ? (const TargetRegisterClass *)&ARM::DPRRegClass + : 0; + unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; @@ -6770,34 +7419,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { - unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (UnitSize >= 8) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(destIn).addImm(0).addReg(scratch)); - } else if (isThumb2) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addImm(UnitSize)); - } else { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc), scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0) - .addImm(UnitSize)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(UnitSize)); - } + unsigned scratch = MRI.createVirtualRegister(IsNeon ? 
VecTRC : TRC); + emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -6805,30 +7433,14 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Handle the leftover bytes with LDRB and STRB. // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned scratch = MRI.createVirtualRegister(TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (isThumb2) { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } else { - AddDefaultPred(BuildMI(*BB, MI, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn) - .addReg(0).addImm(1)); - - AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } + unsigned scratch = MRI.createVirtualRegister(TRC); + emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -6863,23 +7475,21 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. 
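
// ---------------------------------------------------------------------
// Aside: a C model (illustrative only) of the copy structure
// EmitStructByval expands to once SizeVal exceeds the inline threshold:
// a counted main loop advancing by UnitSize (16/8 via NEON when
// alignment allows, else 4/2/1), then an LDRB/STRB epilogue for the
// remainder. varEnd below holds LoopSize, the main loop's trip count
// in bytes.
#include <cstddef>

static void structByvalCopy(char *Dst, const char *Src, size_t SizeVal,
                            size_t UnitSize) {
  size_t BytesLeft = SizeVal % UnitSize;
  size_t LoopSize = SizeVal - BytesLeft;   // counted down via varPhi
  for (size_t I = 0; I != LoopSize; I += UnitSize)
    for (size_t J = 0; J != UnitSize; ++J) // one post-inc load/store
      Dst[I + J] = Src[I + J];
  for (size_t I = 0; I != BytesLeft; ++I)  // byte-sized epilogue
    Dst[LoopSize + I] = Src[LoopSize + I];
}
// ---------------------------------------------------------------------
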
unsigned varEnd = MRI.createVirtualRegister(TRC); - if (isThumb2) { - unsigned VReg1 = varEnd; + if (IsThumb2) { + unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) - VReg1 = MRI.createVirtualRegister(TRC); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1) - .addImm(LoopSize & 0xFFFF)); + Vtmp = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp) + .addImm(LoopSize & 0xFFFF)); if ((LoopSize & 0xFFFF0000) != 0) AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) - .addReg(VReg1) - .addImm(LoopSize >> 16)); + .addReg(Vtmp).addImm(LoopSize >> 16)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); @@ -6891,10 +7501,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp)) - .addReg(varEnd, RegState::Define) - .addConstantPoolIndex(Idx) - .addImm(0)); + if (IsThumb1) + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg( varEnd, RegState::Define).addConstantPoolIndex(Idx)); + else + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg( varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0)); } BB->addSuccessor(loopMBB); @@ -6923,39 +7535,30 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSize) - unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); - if (UnitSize >= 8) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(destPhi).addImm(0).addReg(scratch)); - } else if (isThumb2) { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(scratch).addReg(destPhi) - .addImm(UnitSize)); - } else { - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) - .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0) - .addImm(UnitSize)); - - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) - .addReg(scratch).addReg(destPhi) - .addReg(0).addImm(UnitSize)); - } + unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, + IsThumb1, IsThumb2); + emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, + IsThumb1, IsThumb2); // Decrement loop variable by UnitSize. - MachineInstrBuilder MIB = BuildMI(BB, dl, - TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); - AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); - MIB->getOperand(5).setReg(ARM::CPSR); - MIB->getOperand(5).setIsDef(true); - - BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) - .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + if (IsThumb1) { + MachineInstrBuilder MIB = + BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(varPhi).addImm(UnitSize); + AddDefaultPred(MIB); + } else { + MachineInstrBuilder MIB = + BuildMI(*BB, BB->end(), dl, + TII->get(IsThumb2 ?
ARM::t2SUBri : ARM::SUBri), varLoop); + AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); + MIB->getOperand(5).setReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } + BuildMI(*BB, BB->end(), dl, + TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); // loopMBB can loop back to loopMBB or fall through to exitMBB. BB->addSuccessor(loopMBB); @@ -6964,34 +7567,19 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { // Add epilogue to handle BytesLeft. BB = exitMBB; MachineInstr *StartOfExit = exitMBB->begin(); - ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; - strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { - unsigned scratch = MRI.createVirtualRegister(TRC); unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); - if (isThumb2) { - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); - - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addImm(1)); - } else { - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, - TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); - - AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) - .addReg(scratch).addReg(destIn) - .addReg(0).addImm(1)); - } + unsigned scratch = MRI.createVirtualRegister(TRC); + emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, + IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } @@ -7141,46 +7729,48 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); + case ARM::ATOMIC_LOAD_I64: + return EmitAtomicLoad64(MI, BB); - case ARM::ATOMADD6432: + case ARM::ATOMIC_LOAD_ADD_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr, isThumb2 ? ARM::t2ADCrr : ARM::ADCrr, /*NeedsCarry*/ true); - case ARM::ATOMSUB6432: + case ARM::ATOMIC_LOAD_SUB_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true); - case ARM::ATOMOR6432: + case ARM::ATOMIC_LOAD_OR_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); - case ARM::ATOMXOR6432: + case ARM::ATOMIC_LOAD_XOR_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr, isThumb2 ? ARM::t2EORrr : ARM::EORrr); - case ARM::ATOMAND6432: + case ARM::ATOMIC_LOAD_AND_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); - case ARM::ATOMSWAP6432: + case ARM::ATOMIC_SWAP_I64: return EmitAtomicBinary64(MI, BB, 0, 0, false); - case ARM::ATOMCMPXCHG6432: + case ARM::ATOMIC_CMP_SWAP_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ false, /*IsCmpxchg*/true); - case ARM::ATOMMIN6432: + case ARM::ATOMIC_LOAD_MIN_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? 
ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::LT); - case ARM::ATOMMAX6432: + case ARM::ATOMIC_LOAD_MAX_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::GE); - case ARM::ATOMUMIN6432: + case ARM::ATOMIC_LOAD_UMIN_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, /*IsMinMax*/ true, ARMCC::LO); - case ARM::ATOMUMAX6432: + case ARM::ATOMIC_LOAD_UMAX_I64: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, @@ -7210,8 +7800,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(copy0MBB); @@ -7244,7 +7833,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::BCCi64: case ARM::BCCZi64: { // If there is an unconditional branch to the other successor, remove it. - BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end()); + BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); // Compare both parts that make up the double comparison separately for // equality. @@ -7329,8 +7918,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); + std::next(MachineBasicBlock::iterator(MI)), BB->end()); SinkBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RSBBB); @@ -7558,13 +8146,13 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, // Slct is now known to be the desired identity constant when CC is true. SDValue TrueVal = OtherOp; - SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, + SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal); // Unless SwapSelectOps says CC should be false. if (SwapSelectOps) std::swap(TrueVal, FalseVal); - return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT, + return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal); } @@ -7671,9 +8259,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, llvm_unreachable("Invalid vector element type for padd optimization."); } - SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, &Ops[0], Ops.size()); - return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp); } static SDValue findMUL_LOHI(SDValue V) { @@ -7716,8 +8304,11 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, assert(AddcNode->getNumValues() == 2 && AddcNode->getValueType(0) == MVT::i32 && - AddcNode->getValueType(1) == MVT::Glue && - "Expect ADDC with two result values: i32, glue"); + "Expect ADDC with two result values. First: i32"); + + // Check that we have a glued ADDC node.
+ if (AddcNode->getValueType(1) != MVT::Glue) + return SDValue(); // Check that the ADDC adds the low result of the S/UMUL_LOHI. if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && @@ -7798,7 +8389,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, Ops.push_back(*LowAdd); Ops.push_back(*HiAdd); - SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(), + SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), DAG.getVTList(MVT::i32, MVT::i32), &Ops[0], Ops.size()); @@ -7886,6 +8477,13 @@ static SDValue PerformSUBCombine(SDNode *N, /// is faster than /// vadd d3, d0, d1 /// vmul d3, d3, d2 +// However, for (A + B) * (A + B), +// vadd d2, d0, d1 +// vmul d3, d0, d2 +// vmla d3, d1, d2 +// is slower than +// vadd d2, d0, d1 +// vmul d3, d2, d2 static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -7905,8 +8503,11 @@ static SDValue PerformVMULCombine(SDNode *N, std::swap(N0, N1); } + if (N0 == N1) + return SDValue(); + EVT VT = N->getValueType(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); return DAG.getNode(Opcode, DL, VT, @@ -7936,11 +8537,11 @@ static SDValue PerformMULCombine(SDNode *N, return SDValue(); int64_t MulAmt = C->getSExtValue(); - unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + unsigned ShiftAmt = countTrailingZeros(MulAmt); ShiftAmt = ShiftAmt & (32 - 1); SDValue V = N->getOperand(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Res; MulAmt >>= ShiftAmt; @@ -8004,7 +8605,7 @@ static SDValue PerformANDCombine(SDNode *N, // Attempt to use immediate-form VBIC BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; @@ -8047,7 +8648,7 @@ static SDValue PerformORCombine(SDNode *N, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VORR BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; @@ -8096,22 +8697,29 @@ static SDValue PerformORCombine(SDNode *N, unsigned SplatBitSize; bool HasAnyUndefs; + APInt SplatBits0, SplatBits1; BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); - APInt SplatBits0; + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); + // Ensure that the second operands of both ANDs are constants if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, - HasAnyUndefs) && !HasAnyUndefs) { - BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); - APInt SplatBits1; - if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, - HasAnyUndefs) && !HasAnyUndefs && - SplatBits0 == ~SplatBits1) { - // Canonicalize the vector type to make instruction selection simpler. - EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, - N0->getOperand(1), N0->getOperand(0), - N1->getOperand(0)); - return DAG.getNode(ISD::BITCAST, dl, VT, Result); - } + HasAnyUndefs) && !HasAnyUndefs) { + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs) { + // Ensure that the bit widths of the constants are the same and that + // the splat arguments are logical inverses as per the pattern we + // are trying to simplify.
+ if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && + SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection + // simpler. + EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, + N0->getOperand(1), + N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } } } @@ -8122,7 +8730,7 @@ static SDValue PerformORCombine(SDNode *N, if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) return SDValue(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // 1) or (and A, mask), val => ARMbfi A, val, mask // iff (val & mask) == val // @@ -8157,7 +8765,7 @@ static SDValue PerformORCombine(SDNode *N, return SDValue(); if (ARM::isBitFieldInvertedMask(Mask)) { - Val >>= CountTrailingZeros_32(~Mask); + Val >>= countTrailingZeros(~Mask); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, DAG.getConstant(Val, MVT::i32), @@ -8184,7 +8792,7 @@ static SDValue PerformORCombine(SDNode *N, (Mask == 0xffff || Mask == 0xffff0000)) return SDValue(); // 2a - unsigned amt = CountTrailingZeros_32(Mask2); + unsigned amt = countTrailingZeros(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), DAG.getConstant(amt, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, @@ -8200,7 +8808,7 @@ static SDValue PerformORCombine(SDNode *N, (Mask2 == 0xffff || Mask2 == 0xffff0000)) return SDValue(); // 2b - unsigned lsb = CountTrailingZeros_32(Mask); + unsigned lsb = countTrailingZeros(Mask); Res = DAG.getNode(ISD::SRL, DL, VT, N00, DAG.getConstant(lsb, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, @@ -8218,7 +8826,7 @@ static SDValue PerformORCombine(SDNode *N, // where lsb(mask) == #shamt and masked bits of B are known zero. 
SDValue ShAmt = N00.getOperand(1); unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); - unsigned LSB = CountTrailingZeros_32(Mask); + unsigned LSB = countTrailingZeros(Mask); if (ShAmtC != LSB) return SDValue(); @@ -8261,12 +8869,12 @@ static SDValue PerformBFICombine(SDNode *N, if (!N11C) return SDValue(); unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - unsigned LSB = CountTrailingZeros_32(~InvMask); - unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB; + unsigned LSB = countTrailingZeros(~InvMask); + unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; unsigned Mask = (1 << Width)-1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) - return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0), + return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); } @@ -8292,7 +8900,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, LoadSDNode *LD = cast<LoadSDNode>(InNode); SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = LD->getDebugLoc(); + SDLoc DL(LD); SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->isVolatile(), @@ -8329,7 +8937,7 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { if (Op0.getOpcode() == ARMISD::VMOVRRD && Op0.getNode() == Op1.getNode() && Op0.getResNo() == 0 && Op1.getResNo() == 1) - return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0.getOperand(0)); return SDValue(); } @@ -8371,7 +8979,7 @@ static SDValue PerformSTORECombine(SDNode *N, NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - DebugLoc DL = St->getDebugLoc(); + SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; @@ -8432,7 +9040,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; - DebugLoc DL = St->getDebugLoc(); + SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, StVal.getNode()->getOperand(0), BasePtr, @@ -8454,14 +9062,14 @@ static SDValue PerformSTORECombine(SDNode *N, // Bitcast an i64 store extracted from a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = StVal.getDebugLoc(); + SDLoc dl(StVal); SDValue IntVec = StVal.getOperand(0); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, IntVec.getValueType().getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Vec, StVal.getOperand(1)); - dl = N->getDebugLoc(); + dl = SDLoc(N); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); // Make the DAGCombiner fold the bitcasts.
DCI.AddToWorklist(Vec.getNode()); @@ -8507,7 +9115,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, EVT VT = N->getValueType(0); if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) return SDValue(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SmallVector<SDValue, 8> Ops; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { @@ -8521,6 +9129,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, BV); } +/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. +static SDValue +PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. + // At that time, we may have inserted bitcasts from integer to float. + // If these bitcasts have survived DAGCombine, change the lowering of this + // BUILD_VECTOR into something more vector friendly, i.e., that does not + // force the use of floating point types. + + // Make sure we can change the type of the vector. + // This is possible iff: + // 1. The vector is only used in a bitcast to an integer type. I.e., + // 1.1. Vector is used only once. + // 1.2. Use is a bit convert to an integer type. + // 2. The size of its operands is 32 bits (64 bits is not legal). + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + + // Check 1.1. and 2. + if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) + return SDValue(); + + // By construction, the input type must be float. + assert(EltVT == MVT::f32 && "Unexpected type!"); + + // Check 1.2. + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() != ISD::BITCAST || + Use->getValueType(0).isFloatingPoint()) + return SDValue(); + + // Check profitability. + // Model is, if more than half of the relevant operands are bitcast from + // i32, turn the build_vector into a sequence of insert_vector_elt. + // Relevant operands are everything that is not statically + // (i.e., at compile time) bitcasted. + unsigned NumOfBitCastedElts = 0; + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumOfRelevantElts = NumElts; + for (unsigned Idx = 0; Idx < NumElts; ++Idx) { + SDValue Elt = N->getOperand(Idx); + if (Elt->getOpcode() == ISD::BITCAST) { + // Assume only bit cast to i32 will go away. + if (Elt->getOperand(0).getValueType() == MVT::i32) + ++NumOfBitCastedElts; + } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt)) + // Constants are statically casted, thus do not count them as + // relevant operands. + --NumOfRelevantElts; + } + + // Check if more than half of the elements require a non-free bitcast. + if (NumOfBitCastedElts <= NumOfRelevantElts / 2) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + // Create the new vector type. + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + // Check if the type is legal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VecVT)) + return SDValue(); + + // Combine: + // ARMISD::BUILD_VECTOR E1, E2, ..., EN. + // => BITCAST INSERT_VECTOR_ELT + // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), + // (BITCAST EN), N. + SDValue Vec = DAG.getUNDEF(VecVT); + SDLoc dl(N); + for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { + SDValue V = N->getOperand(Idx); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (V.getOpcode() == ISD::BITCAST && + V->getOperand(0).getValueType() == MVT::i32) + // Fold obvious case.
+ V = V.getOperand(0); + else { + V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(V.getNode()); + } + SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); + } + Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + return Vec; +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -8534,7 +9234,7 @@ static SDValue PerformInsertEltCombine(SDNode *N, return SDValue(); SelectionDAG &DAG = DCI.DAG; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VT.getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); @@ -8580,7 +9280,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { !TLI.isTypeLegal(Concat1Op1.getValueType())) return SDValue(); - SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Op0.getOperand(0), Op1.getOperand(0)); // Translate the shuffle mask. SmallVector<int, 8> NewMask; @@ -8596,7 +9296,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { NewElt = HalfElts + MaskElt - NumElts; NewMask.push_back(NewElt); } - return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat, + return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, DAG.getUNDEF(VT), NewMask.data()); } @@ -8713,7 +9413,7 @@ static SDValue CombineBaseUpdate(SDNode *N, Ops.push_back(N->getOperand(i)); } MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys, + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops.data(), Ops.size(), MemInt->getMemoryVT(), MemInt->getMemOperand()); @@ -8787,7 +9487,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); - SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys, + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, VLDMemInt->getMemoryVT(), VLDMemInt->getMemOperand()); @@ -8842,7 +9542,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N, if (EltSize > VT.getVectorElementType().getSizeInBits()) return SDValue(); - return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); + return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } // isConstVecPow2 - Return true if each vector element is a power of 2, all @@ -8899,12 +9599,27 @@ static SDValue PerformVCVTCombine(SDNode *N, !isConstVecPow2(ConstVec, isSigned, C)) return SDValue(); + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + // These instructions only exist converting from f32 to i32. We can handle + // smaller integers by generating an extra truncate, but larger ones would + // be lossy. + return SDValue(); + } + unsigned IntrinsicOpcode = isSigned ?
Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), - N->getValueType(0), - DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, - DAG.getConstant(Log2_64(C), MVT::i32)); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), + NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, + DAG.getConstant(Log2_64(C), MVT::i32)); + + if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv); + + return FixConv; } /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) @@ -8935,12 +9650,28 @@ static SDValue PerformVDIVCombine(SDNode *N, !isConstVecPow2(ConstVec, isSigned, C)) return SDValue(); + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + // These instructions only exist converting from i32 to f32. We can handle + // smaller integers by generating an extra extend, but larger ones would + // be lossy. + return SDValue(); + } + + SDValue ConvInput = Op.getOperand(0); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + ConvInput); + unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : Intrinsic::arm_neon_vcvtfxu2fp; - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), Op.getValueType(), DAG.getConstant(IntrinsicOpcode, MVT::i32), - Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32)); + ConvInput, DAG.getConstant(Log2_64(C), MVT::i32)); } /// getVShiftImm - Check if this is a valid build_vector for the immediate @@ -9006,9 +9737,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // loads from a constant pool. case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: - case Intrinsic::arm_neon_vshiftls: - case Intrinsic::arm_neon_vshiftlu: - case Intrinsic::arm_neon_vshiftn: case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: case Intrinsic::arm_neon_vrshiftn: @@ -9039,12 +9767,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { } return SDValue(); - case Intrinsic::arm_neon_vshiftls: - case Intrinsic::arm_neon_vshiftlu: - if (isVShiftLImm(N->getOperand(2), VT, true, Cnt)) - break; - llvm_unreachable("invalid shift count for vshll intrinsic"); - case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) @@ -9062,7 +9784,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { break; llvm_unreachable("invalid shift count for vqshlu intrinsic"); - case Intrinsic::arm_neon_vshiftn: case Intrinsic::arm_neon_vrshiftn: case Intrinsic::arm_neon_vqshiftns: case Intrinsic::arm_neon_vqshiftnu: @@ -9085,16 +9806,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vshiftu: // Opcode already set above.
break; - case Intrinsic::arm_neon_vshiftls: - case Intrinsic::arm_neon_vshiftlu: - if (Cnt == VT.getVectorElementType().getSizeInBits()) - VShiftOpc = ARMISD::VSHLLi; - else - VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ? - ARMISD::VSHLLs : ARMISD::VSHLLu); - break; - case Intrinsic::arm_neon_vshiftn: - VShiftOpc = ARMISD::VSHRN; break; case Intrinsic::arm_neon_vrshifts: VShiftOpc = ARMISD::VRSHRs; break; case Intrinsic::arm_neon_vrshiftu: @@ -9121,7 +9832,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { VShiftOpc = ARMISD::VQRSHRNsu; break; } - return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); } @@ -9138,7 +9849,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } - return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), DAG.getConstant(Cnt, MVT::i32)); } @@ -9169,7 +9880,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && DAG.MaskedValueIsZero(N0.getOperand(0), APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1); + return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); } } @@ -9186,7 +9897,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) - return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0), + return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0), DAG.getConstant(Cnt, MVT::i32)); break; @@ -9195,7 +9906,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRs : ARMISD::VSHRu); - return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0), + return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0), DAG.getConstant(Cnt, MVT::i32)); } } @@ -9235,7 +9946,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, Opc = ARMISD::VGETLANEu; break; } - return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane); + return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); } } @@ -9324,7 +10035,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, if (!Opcode) return SDValue(); - return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS); + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 
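
A quick sanity check of the SRL-of-BSWAP rewrite in PerformShiftCombine above: when the high 16 bits of the input are known zero, shifting bswap(x) right by 16 and rotating it right by 16 produce the same value, so the combine can safely emit ISD::ROTR, which is cheaper to select on ARM. A minimal standalone sketch (helper names are ours; __builtin_bswap32 is a GCC/Clang builtin):

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t x) { return __builtin_bswap32(x); } // byte reverse
static uint32_t rotr16(uint32_t x) { return (x >> 16) | (x << 16); } // rotate right by 16

int main() {
  // With the high 16 bits of x zero, the low 16 bits of bswap32(x) are zero,
  // so the right shift discards nothing and matches the rotation.
  for (uint32_t x = 0; x <= 0xffff; ++x)
    assert((bswap32(x) >> 16) == rotr16(bswap32(x)));
  return 0;
}
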
@@ -9336,7 +10047,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { return SDValue(); EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); SDValue FalseVal = N->getOperand(0); @@ -9426,6 +10137,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: return CombineBaseUpdate(N, DCI); + case ARMISD::BUILD_VECTOR: + return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -9456,7 +10169,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } -bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned, + bool *Fast) const { // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); @@ -9478,7 +10192,7 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const case MVT::v2f64: { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. - // A big-endian target may also explictly support unaligned accesses + // A big-endian target may also explicitly support unaligned accesses if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { if (Fast) *Fast = true; @@ -9510,11 +10224,11 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) { + (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) { return MVT::v2f64; } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || - (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) { + (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) { return MVT::f64; } } @@ -9550,6 +10264,21 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + + static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; @@ -9949,9 +10678,19 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); + unsigned BitWidth = KnownOne.getBitWidth(); + KnownZero = KnownOne = APInt(BitWidth, 0); switch (Op.getOpcode()) { default: break; + case ARMISD::ADDC: + case ARMISD::ADDE: + case ARMISD::SUBC: + case ARMISD::SUBE: + // These nodes' second result is a boolean. + if (Op.getResNo() == 0) + break; + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS.
DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); @@ -10065,7 +10804,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight( typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; RCPair ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters switch (Constraint[0]) { @@ -10080,6 +10819,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'r': return RCPair(0U, &ARM::GPRRegClass); case 'w': + if (VT == MVT::Other) + break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) @@ -10088,6 +10829,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return RCPair(0U, &ARM::QPRRegClass); break; case 'x': + if (VT == MVT::Other) + break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) @@ -10274,6 +11017,54 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + } + + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Op->getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); + + Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL); + + SDLoc dl(Op); + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true, + 0, getLibcallCallingConv(LC), /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + + return CallInfo.first; +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets.
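
The LowerDivRem hunk above turns an ISD::SDIVREM/UDIVREM node into one register-based __aeabi_idivmod-family call (RTABI 4.2) that yields quotient and remainder together, instead of two separate libcalls. A user-level analogue of that node shape, sketched with the standard library (the function name is ours, purely illustrative):

#include <cassert>
#include <cstdlib>

// One division producing both results, the same shape as ISD::SDIVREM; on
// AEABI targets the backend now emits a single __aeabi_idivmod call for this
// rather than separate divide and modulo libcalls.
static int quotPlusRem(int n, int d) {
  assert(d != 0);
  std::div_t qr = std::div(n, d); // qr.quot == n / d, qr.rem == n % d
  return qr.quot + qr.rem;
}

int main() {
  assert(quotPlusRem(7, 3) == 3); // quotient 2 plus remainder 1
  return 0;
}
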
@@ -10282,17 +11073,15 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool ARM::isBitFieldInvertedMask(unsigned v) { if (v == 0xffffffff) - return 0; + return false; + // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's - unsigned int lsb = 0, msb = 31; - while (v & (1 << msb)) --msb; - while (v & (1 << lsb)) ++lsb; - for (unsigned int i = lsb; i <= msb; ++i) { - if (v & (1 << i)) - return 0; - } - return 1; + unsigned TO = CountTrailingOnes_32(v); + unsigned LO = CountLeadingOnes_32(v); + v = (v >> TO) << TO; + v = (v << LO) >> LO; + return v == 0; } /// isFPImmLegal - Returns true if the target can instruction select the @@ -10361,6 +11150,30 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.writeMem = true; return true; } + case Intrinsic::arm_ldrex: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_strex: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } case Intrinsic::arm_strexd: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; @@ -10389,3 +11202,15 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; } + +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned Bits = Ty->getPrimitiveSizeInBits(); + if (Bits == 0 || Bits > 32) + return false; + return true; +}
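
The rewritten ARM::isBitFieldInvertedMask above strips the trailing and leading runs of ones and requires that nothing remains. The same logic in a standalone, portable sketch (names are ours; __builtin_ctz/__builtin_clz are GCC/Clang builtins, well defined here because the all-ones case is rejected before ~v can be zero):

#include <cassert>
#include <cstdint>

// A bit-field inverted mask may have runs of 1s at either end, but every
// interior bit must be 0, and not all 32 bits may be set.
static bool isBitFieldInvertedMaskRef(uint32_t v) {
  if (v == 0xffffffffu)
    return false;
  unsigned TO = __builtin_ctz(~v); // length of the trailing run of 1s in v
  unsigned LO = __builtin_clz(~v); // length of the leading run of 1s in v
  v = (v >> TO) << TO;             // clear the trailing ones
  v = (v << LO) >> LO;             // clear the leading ones
  return v == 0;                   // nothing may remain in the middle
}

int main() {
  assert(isBitFieldInvertedMaskRef(0xff0000ffu));  // ones only on the outsides
  assert(isBitFieldInvertedMaskRef(0x000000ffu));
  assert(!isBitFieldInvertedMaskRef(0x00ff00ffu)); // interior ones
  assert(!isBitFieldInvertedMaskRef(0xffffffffu));
  return 0;
}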