X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=159d4df00bb42b9936fb61661894d4fa8c0f61ff;hb=8163ca76f0b0d336c5436364ffb3b85be1162e7a;hp=7e47242412743f07711a434805b8b9c9e039e9ce;hpb=acf2077ca497980a066e8e7bb81ceec0de82d5da;p=oota-llvm.git

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 7e472424127..159d4df00bb 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -19,19 +19,13 @@
 #include "ARMConstantPoolValue.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMPerfectShuffle.h"
-#include "ARMRegisterInfo.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Type.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -41,18 +35,24 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
 #include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Type.h"
 using namespace llvm;

 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");

 // This option should go away when tail calls fully work.
 static cl::opt<bool>
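
A note on the counter added above: STATISTIC (from llvm/ADT/Statistic.h) declares a named counter that is dumped by -stats. A minimal sketch of the usage pattern, assuming a translation unit that defines DEBUG_TYPE as this file does; the helper function here is hypothetical, not part of the patch:

    #define DEBUG_TYPE "arm-isel"
    #include "llvm/ADT/Statistic.h"
    using namespace llvm;

    STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");

    // Hypothetical caller: counters behave like integers and are printed
    // at shutdown when -stats is passed.
    void noteByValCopy(bool NeedsLoop) {
      if (NeedsLoop)
        ++NumLoopByVals;
    }
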
@@ -90,76 +90,72 @@ static const uint16_t GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };

-void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
-                                       EVT PromotedBitwiseVT) {
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
   if (VT != PromotedLdStVT) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
-                       PromotedLdStVT.getSimpleVT());
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
-                       PromotedLdStVT.getSimpleVT());
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
   }

-  EVT ElemTy = VT.getVectorElementType();
+  MVT ElemTy = VT.getVectorElementType();
   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
-    setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   if (ElemTy == MVT::i32) {
-    setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Custom);
-    setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Custom);
-    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
-    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
   } else {
-    setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
-  }
-  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal);
-  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+  }
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+  setOperationAction(ISD::SELECT, VT, Expand);
+  setOperationAction(ISD::SELECT_CC, VT, Expand);
+  setOperationAction(ISD::VSELECT, VT, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
   if (VT.isInteger()) {
-    setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
-    setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
-    setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
   }

   // Promote all bit-wise operations.
   if (VT.isInteger() && VT != PromotedBitwiseVT) {
-    setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
-    AddPromotedToType (ISD::AND, VT.getSimpleVT(),
-                       PromotedBitwiseVT.getSimpleVT());
-    setOperationAction(ISD::OR,  VT.getSimpleVT(), Promote);
-    AddPromotedToType (ISD::OR,  VT.getSimpleVT(),
-                       PromotedBitwiseVT.getSimpleVT());
-    setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
-    AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
-                       PromotedBitwiseVT.getSimpleVT());
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR,  VT, Promote);
+    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
   }

   // Neon does not support vector divide/remainder operations.
-  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::FDIV, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
 }

-void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
-  addRegisterClass(VT, ARM::DPRRegisterClass);
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPRRegClass);
   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
 }

-void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
-  addRegisterClass(VT, ARM::QPRRegisterClass);
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::QPRRegClass);
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }

@@ -432,14 +428,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }

   if (Subtarget->isThumb1Only())
-    addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
+    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
-    addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
+    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
   if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
       !Subtarget->isThumb1Only()) {
-    addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
+    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
     if (!Subtarget->isFPOnlySP())
-      addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
+      addRegisterClass(MVT::f64, &ARM::DPRRegClass);

     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   }
@@ -508,7 +504,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
-    
+
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
@@ -519,6 +515,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

     // Neon does not support some operations on v1i64 and v2i64 types.
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
@@ -542,6 +543,17 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);

+    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
+
+    // NEON does not have single instruction CTPOP for vectors with element
+    // types wider than 8-bits. However, custom lowering can leverage the
+    // v8i8/v16i8 vcnt instruction.
+    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
+
     setTargetDAGCombine(ISD::INTRINSIC_VOID);
     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -571,6 +583,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     }
   }

+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+
   computeRegisterProperties();

   // ARM does not have f32 extending load.
@@ -634,9 +651,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (!Subtarget->hasV6Ops())
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);

-  // These are expanded into libcalls.
-  if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
-    // v7M has a hardware divider
+  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+    // These are expanded into libcalls if the cpu doesn't have HW divider.
     setOperationAction(ISD::SDIV, MVT::i32, Expand);
     setOperationAction(ISD::UDIV, MVT::i32, Expand);
   }
@@ -686,7 +703,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);

     // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
     setInsertFencesForAtomic(true);
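
The new i64 ATOMIC_LOAD_MIN/MAX/UMIN/UMAX entries are custom-lowered to ldrexd/strexd loops (see the EmitAtomicBinary64 changes further down in this patch). Functionally that is a compare-exchange retry loop; a standalone sketch of the semantics being implemented, using std::atomic rather than the actual lowering:

    // Semantics sketch only (my illustration, not LLVM code): a 64-bit
    // atomic signed max without a dedicated instruction retries until
    // the update sticks. On ARM the loop body becomes ldrexd/strexd
    // plus a conditional move, which is what the custom lowering emits.
    #include <algorithm>
    #include <atomic>
    #include <cstdint>

    int64_t atomic_load_max(std::atomic<int64_t> &A, int64_t Val) {
      int64_t Old = A.load();
      while (!A.compare_exchange_weak(Old, std::max(Old, Val))) {
        // Old now holds the current value; retry with the new maximum.
      }
      return Old; // atomicrmw returns the previous value
    }
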
@@ -770,8 +791,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::FPOW, MVT::f64, Expand);
   setOperationAction(ISD::FPOW, MVT::f32, Expand);

-  setOperationAction(ISD::FMA, MVT::f64, Expand);
-  setOperationAction(ISD::FMA, MVT::f32, Expand);
+  if (!Subtarget->hasVFP4()) {
+    setOperationAction(ISD::FMA, MVT::f64, Expand);
+    setOperationAction(ISD::FMA, MVT::f32, Expand);
+  }

   // Various VFP goodness
   if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
@@ -794,12 +817,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::MUL);
-
-  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
-    setTargetDAGCombine(ISD::AND);
-    setTargetDAGCombine(ISD::OR);
-    setTargetDAGCombine(ISD::XOR);
-  }
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);

   if (Subtarget->hasV6Ops())
     setTargetDAGCombine(ISD::SRL);
@@ -813,9 +833,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setSchedulingPreference(Sched::Hybrid);

   //// temporary - rewrite interface to use type
-  maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
-  maxStoresPerMemset = 16;
+  maxStoresPerMemset = 8;
   maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;

   // On ARM arguments smaller than 4 bytes are extended, so all arguments
   // are at least 4 bytes aligned.
@@ -823,6 +846,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)

   benefitFromCodePlacementOpt = true;

+  // Prefer likely predicted branches to selects on out-of-order cores.
+  predictableSelectIsExpensive = Subtarget->isLikeA9();
+
   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }

@@ -848,7 +874,7 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{
   // the cost is 1 for both f32 and f64.
   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
-    RRC = ARM::DPRRegisterClass;
+    RRC = &ARM::DPRRegClass;
     // When NEON is used for SP, only half of the register file is available
     // because operations that define both SP and DP results will be constrained
     // to the VFP2 class (D0-D15). We currently model this constraint prior to
@@ -858,15 +884,15 @@ ARMTargetLowering::findRepresentativeClass(EVT VT) const{
     break;
   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   case MVT::v4f32: case MVT::v2f64:
-    RRC = ARM::DPRRegisterClass;
+    RRC = &ARM::DPRRegClass;
     Cost = 2;
     break;
   case MVT::v4i64:
-    RRC = ARM::DPRRegisterClass;
+    RRC = &ARM::DPRRegClass;
     Cost = 4;
     break;
   case MVT::v8i64:
-    RRC = ARM::DPRRegisterClass;
+    RRC = &ARM::DPRRegClass;
     Cost = 8;
     break;
   }
@@ -890,6 +916,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
   case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMN:           return "ARMISD::CMN";
   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
@@ -897,9 +924,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";

   case ARMISD::CMOV:          return "ARMISD::CMOV";
-  case ARMISD::CAND:          return "ARMISD::CAND";
-  case ARMISD::COR:           return "ARMISD::COR";
-  case ARMISD::CXOR:          return "ARMISD::CXOR";

   case ARMISD::RBIT:          return "ARMISD::RBIT";

@@ -983,6 +1007,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
+  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
@@ -1020,23 +1046,24 @@ EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {

 /// getRegClassFor - Return the register class that should be used for the
 /// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
   // load / store 4 to 8 consecutive D registers.
   if (Subtarget->hasNEON()) {
     if (VT == MVT::v4i64)
-      return ARM::QQPRRegisterClass;
-    else if (VT == MVT::v8i64)
-      return ARM::QQQQPRRegisterClass;
+      return &ARM::QQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::QQQQPRRegClass;
   }
   return TargetLowering::getRegClassFor(VT);
 }

 // Create a fast isel object.
 FastISel *
-ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
-  return ARM::createFastISel(funcInfo);
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                   const TargetLibraryInfo *libInfo) const {
+  return ARM::createFastISel(funcInfo, libInfo);
 }

 /// getMaximalGlobalOffset - Returns the maximal possible offset which can
@@ -1165,6 +1192,8 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
   case CallingConv::ARM_APCS:
     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+  case CallingConv::GHC:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
   }
 }

@@ -1285,14 +1314,20 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
 /// nodes.
 SDValue
-ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
-                             CallingConv::ID CallConv, bool isVarArg,
-                             bool doesNotRet, bool &isTailCall,
-                             const SmallVectorImpl<ISD::OutputArg> &Outs,
-                             const SmallVectorImpl<SDValue> &OutVals,
-                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                             DebugLoc dl, SelectionDAG &DAG,
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool doesNotRet                       = CLI.DoesNotReturn;
+  bool isVarArg                         = CLI.IsVarArg;
+
   MachineFunction &MF = DAG.getMachineFunction();
   bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool IsSibCall      = false;
@@ -1414,21 +1449,22 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
           CCInfo.clearFirstByValReg();
         }

-        unsigned LocMemOffset = VA.getLocMemOffset();
-        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
-        SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
-                                  StkPtrOff);
-        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
-        SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
-        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
-                                           MVT::i32);
-        MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
-                                            Flags.getByValAlign(),
-                                            /*isVolatile=*/false,
-                                            /*AlwaysInline=*/false,
-                                            MachinePointerInfo(0),
-                                            MachinePointerInfo(0)));
-
+        if (Flags.getByValSize() - 4*offset > 0) {
+          unsigned LocMemOffset = VA.getLocMemOffset();
+          SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+          SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                                    StkPtrOff);
+          SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+          SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+          SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+                                             MVT::i32);
+          SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
+
+          SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
+          MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+                                            Ops, array_lengthof(Ops)));
+        }
       } else if (!IsSibCall) {
         assert(VA.isMemLoc());
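
Instead of expanding split byval arguments with DAG.getMemcpy, the call lowering now emits an ARMISD::COPY_STRUCT_BYVAL pseudo, which can later expand to unrolled stores or to a copy loop (the NumLoopByVals statistic added at the top counts the loop case). A rough standalone sketch of the word-copy semantics; the function name and tail handling are mine, not the pseudo's actual expansion:

    // Sketch (mine) of what a looped byval copy boils down to: copy in
    // 4-byte words (ldr/str), then finish any remaining bytes (ldrb/strb).
    #include <cstdint>
    #include <cstring>

    void copy_struct_byval(void *dst, const void *src, unsigned size) {
      unsigned i = 0;
      for (; i + 4 <= size; i += 4) {     // word-copy loop
        uint32_t w;
        std::memcpy(&w, (const char *)src + i, 4);
        std::memcpy((char *)dst + i, &w, 4);
      }
      for (; i < size; ++i)               // byte tail
        ((char *)dst)[i] = ((const char *)src)[i];
    }
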
@@ -1580,19 +1616,19 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // FIXME: handle tail calls differently.
   unsigned CallOpc;
+  bool HasMinSizeAttr = MF.getFunction()->getFnAttributes().
+    hasAttribute(Attributes::MinSize);
   if (Subtarget->isThumb()) {
     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
-    else if (doesNotRet && isDirect && !isARMFunc &&
-             Subtarget->hasRAS() && !Subtarget->isThumb1Only())
-      // "mov lr, pc; b _foo" to avoid confusing the RSP
-      CallOpc = ARMISD::CALL_NOLINK;
     else
       CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
   } else {
-    if (!isDirect && !Subtarget->hasV5TOps()) {
+    if (!isDirect && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
-    } else if (doesNotRet && isDirect && Subtarget->hasRAS())
+    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+             // Emit regular call when code size is the priority
+             !HasMinSizeAttr)
       // "mov lr, pc; b _foo" to avoid confusing the RSP
       CallOpc = ARMISD::CALL_NOLINK;
     else
@@ -1642,22 +1678,31 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 /// and then confiscate the rest of the parameter registers to insure
 /// this.
 void
-llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const {
+ARMTargetLowering::HandleByVal(
+    CCState *State, unsigned &size, unsigned Align) const {
   unsigned reg = State->AllocateReg(GPRArgRegs, 4);
   assert((State->getCallOrPrologue() == Prologue ||
           State->getCallOrPrologue() == Call) &&
          "unhandled ParmContext");
   if ((!State->isFirstByValRegValid()) &&
       (ARM::R0 <= reg) && (reg <= ARM::R3)) {
-    State->setFirstByValReg(reg);
-    // At a call site, a byval parameter that is split between
-    // registers and memory needs its size truncated here.  In a
-    // function prologue, such byval parameters are reassembled in
-    // memory, and are not truncated.
-    if (State->getCallOrPrologue() == Call) {
-      unsigned excess = 4 * (ARM::R4 - reg);
-      assert(size >= excess && "expected larger existing stack allocation");
-      size -= excess;
+    if (Subtarget->isAAPCS_ABI() && Align > 4) {
+      unsigned AlignInRegs = Align / 4;
+      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
+      for (unsigned i = 0; i < Waste; ++i)
+        reg = State->AllocateReg(GPRArgRegs, 4);
+    }
+    if (reg != 0) {
+      State->setFirstByValReg(reg);
+      // At a call site, a byval parameter that is split between
+      // registers and memory needs its size truncated here.  In a
+      // function prologue, such byval parameters are reassembled in
+      // memory, and are not truncated.
+      if (State->getCallOrPrologue() == Call) {
+        unsigned excess = 4 * (ARM::R4 - reg);
+        assert(size >= excess && "expected larger existing stack allocation");
+        size -= excess;
+      }
     }
   }
   // Confiscate any remaining parameter registers to preclude their
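
The AAPCS handling added to HandleByVal skips core registers so that an over-aligned byval starts on an even register boundary. A standalone model of that arithmetic, assuming registers are numbered 0..3 for r0..r3 (the helper is mine):

    // Sketch (mine) of the register-skipping computation: an 8-byte
    // aligned byval must begin in an even register, so an odd starting
    // register is wasted, mirroring (ARM::R4 - reg) % AlignInRegs above.
    #include <cstdio>

    unsigned firstByValReg(unsigned nextFreeReg /*0..3 => r0..r3*/,
                           unsigned align) {
      if (align > 4) {
        unsigned alignInRegs = align / 4;            // e.g. 2 for align == 8
        unsigned waste = (4 - nextFreeReg) % alignInRegs;
        nextFreeReg += waste;                        // skip the odd register
      }
      return nextFreeReg < 4 ? nextFreeReg : ~0u;    // ~0u: all regs consumed
    }

    int main() {
      std::printf("%u\n", firstByValReg(1, 8));      // prints 2: r1 is skipped
    }
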
@@ -1790,6 +1835,14 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     }
   }

+  // If Caller's vararg or byval argument has been split between registers and
+  // stack, do not perform tail call, since part of the argument is in caller's
+  // local frame.
+  const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
+                                      getInfo<ARMFunctionInfo>();
+  if (AFI_Caller->getVarArgsRegSaveSize())
+    return false;
+
   // If the callee takes no arguments then go on to check the results of the
   // call.
   if (!Outs.empty()) {
@@ -1844,6 +1897,17 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   return true;
 }

+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+                                                    isVarArg));
+}
+
 SDValue
 ARMTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
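
The new CanLowerReturn hook lets the target veto return values the calling convention cannot pass back in registers; when it returns false, the middle end demotes the return to a hidden sret pointer. A source-level illustration of the kind of return value this affects (example mine):

    // Illustration only: under AAPCS a 32-byte aggregate does not fit in
    // r0-r3, so the function is effectively compiled as
    //   void make_big(Big *sret);
    struct Big { int v[8]; };

    Big make_big() {
      Big b = {{1, 2, 3, 4, 5, 6, 7, 8}};
      return b;                  // constructed into *sret
    }
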
@@ -1935,63 +1999,72 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   return result;
 }

-bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const {
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   if (N->getNumValues() != 1)
     return false;
   if (!N->hasNUsesOfValue(1, 0))
     return false;

-  unsigned NumCopies = 0;
-  SDNode* Copies[2] = { 0, 0 };
-  SDNode *Use = *N->use_begin();
-  if (Use->getOpcode() == ISD::CopyToReg) {
-    Copies[NumCopies++] = Use;
-  } else if (Use->getOpcode() == ARMISD::VMOVRRD) {
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
+    SDNode *VMov = Copy;
     // f64 returned in a pair of GPRs.
-    for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end();
+    SmallPtrSet<SDNode*, 2> Copies;
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
          UI != UE; ++UI) {
       if (UI->getOpcode() != ISD::CopyToReg)
         return false;
-      Copies[UI.getUse().getResNo()] = *UI;
-      ++NumCopies;
+      Copies.insert(*UI);
     }
-  } else if (Use->getOpcode() == ISD::BITCAST) {
+    if (Copies.size() > 2)
+      return false;
+
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      SDValue UseChain = UI->getOperand(0);
+      if (Copies.count(UseChain.getNode()))
+        // Second CopyToReg
+        Copy = *UI;
+      else
+        // First CopyToReg
+        TCChain = UseChain;
+    }
+  } else if (Copy->getOpcode() == ISD::BITCAST) {
     // f32 returned in a single GPR.
-    if (!Use->hasNUsesOfValue(1, 0))
+    if (!Copy->hasOneUse())
       return false;
-    Use = *Use->use_begin();
-    if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0))
+    Copy = *Copy->use_begin();
+    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
       return false;
-    Copies[NumCopies++] = Use;
+    Chain = Copy->getOperand(0);
   } else {
     return false;
   }

-  if (NumCopies != 1 && NumCopies != 2)
-    return false;
-
   bool HasRet = false;
-  for (unsigned i = 0; i < NumCopies; ++i) {
-    SDNode *Copy = Copies[i];
-    for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
-         UI != UE; ++UI) {
-      if (UI->getOpcode() == ISD::CopyToReg) {
-        SDNode *Use = *UI;
-        if (Use == Copies[0] || ((NumCopies == 2) && (Use == Copies[1])))
-          continue;
-        return false;
-      }
-      if (UI->getOpcode() != ARMISD::RET_FLAG)
-        return false;
-      HasRet = true;
-    }
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != ARMISD::RET_FLAG)
+      return false;
+    HasRet = true;
   }

-  return HasRet;
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
 }

 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!EnableARMTailCalls)
+  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
     return false;

   if (!CI->isTailCall())
@@ -2085,12 +2158,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
   Args.push_back(Entry);
   // FIXME: is there useful debug info available here?
-  std::pair<SDValue, SDValue> CallResult =
-    LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()),
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                (Type *) Type::getInt32Ty(*DAG.getContext()),
                 false, false, false, false,
                 0, CallingConv::C, /*isTailCall=*/false,
                 /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
                 DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
 }

@@ -2098,7 +2172,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
 // "local exec" model.
 SDValue
 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
-                                        SelectionDAG &DAG) const {
+                                        SelectionDAG &DAG,
+                                        TLSModel::Model model) const {
   const GlobalValue *GV = GA->getGlobal();
   DebugLoc dl = GA->getDebugLoc();
   SDValue Offset;
@@ -2107,7 +2182,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
   // Get the Thread Pointer
   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

-  if (GV->isDeclaration()) {
+  if (model == TLSModel::InitialExec) {
     MachineFunction &MF = DAG.getMachineFunction();
     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
@@ -2132,6 +2207,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                          false, false, false, 0);
   } else {
     // local exec model
+    assert(model == TLSModel::LocalExec);
     ARMConstantPoolValue *CPV =
       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
@@ -2152,12 +2228,18 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetELF() &&
          "TLS not implemented for non-ELF targets");
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  // If the relocation model is PIC, use the "General Dynamic" TLS Model,
-  // otherwise use the "Local Exec" TLS Model
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
-    return LowerToTLSGeneralDynamicModel(GA, DAG);
-  else
-    return LowerToTLSExecModels(GA, DAG);
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  switch (model) {
+    case TLSModel::GeneralDynamic:
+    case TLSModel::LocalDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModels(GA, DAG, model);
+  }
+  llvm_unreachable("bogus TLS model");
 }

 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
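
LowerGlobalTLSAddress now dispatches on the per-global TLS model instead of inferring initial-exec from GV->isDeclaration(). For reference, a source-level illustration of when each model typically applies; the variable names are mine:

    // Illustration only. Built into a shared object (-fPIC), access to
    // external_tls generally uses the general-dynamic model (a
    // __tls_get_addr call), and static_tls may relax to local-dynamic.
    // Linked into the main executable, they become initial-exec and
    // local-exec (thread-pointer plus constant TPOFF) respectively.
    extern thread_local int external_tls;   // defined in another module
    static thread_local int static_tls;     // local to this module

    int sum_tls() { return external_tls + static_tls; }
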
@@ -2447,9 +2529,9 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,

   const TargetRegisterClass *RC;
   if (AFI->isThumb1OnlyFunction())
-    RC = ARM::tGPRRegisterClass;
+    RC = &ARM::tGPRRegClass;
   else
-    RC = ARM::GPRRegisterClass;
+    RC = &ARM::GPRRegClass;

   // Transform the arguments stored in physical registers into virtual ones.
   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
@@ -2503,7 +2585,10 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
 void
 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                         DebugLoc dl, SDValue &Chain,
-                                        unsigned ArgOffset) const {
+                                        const Value *OrigArg,
+                                        unsigned OffsetFromOrigArg,
+                                        unsigned ArgOffset,
+                                        bool ForceMutable) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2530,18 +2615,18 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                              getPointerTy());

     SmallVector<SDValue, 4> MemOps;
-    for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
+    for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
       const TargetRegisterClass *RC;
       if (AFI->isThumb1OnlyFunction())
-        RC = ARM::tGPRRegisterClass;
+        RC = &ARM::tGPRRegClass;
       else
-        RC = ARM::GPRRegisterClass;
+        RC = &ARM::GPRRegClass;

       unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
       SDValue Store =
         DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                     MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+                     MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
                      false, false, 0);
       MemOps.push_back(Store);
       FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
@@ -2552,7 +2637,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                         &MemOps[0], MemOps.size());
   } else
     // This will point to the next argument passed via stack.
-    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
+    AFI->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(4, ArgOffset, !ForceMutable));
 }

@@ -2575,14 +2661,16 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
   CCInfo.AnalyzeFormalArguments(Ins,
                                 CCAssignFnForNode(CallConv, /* Return*/ false,
                                                   isVarArg));
-
+
   SmallVector<SDValue, 16> ArgValues;
   int lastInsIndex = -1;
-  SDValue ArgValue;
+  SDValue ArgValue;
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-
+    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
     // Arguments stored in registers.
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
@@ -2617,14 +2705,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,

         const TargetRegisterClass *RC;
         if (RegVT == MVT::f32)
-          RC = ARM::SPRRegisterClass;
+          RC = &ARM::SPRRegClass;
         else if (RegVT == MVT::f64)
-          RC = ARM::DPRRegisterClass;
+          RC = &ARM::DPRRegClass;
         else if (RegVT == MVT::v2f64)
-          RC = ARM::QPRRegisterClass;
+          RC = &ARM::QPRRegClass;
         else if (RegVT == MVT::i32)
-          RC = (AFI->isThumb1OnlyFunction() ?
-                ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
+          RC = AFI->isThumb1OnlyFunction() ?
+            (const TargetRegisterClass*)&ARM::tGPRRegClass :
+            (const TargetRegisterClass*)&ARM::GPRRegClass;
         else
           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

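
VarArgStyleRegisters spills the remaining GPR arguments contiguously in front of the stack-passed arguments; the change above tags each spill slot with the IR argument it came from (MachinePointerInfo(OrigArg, ...)) rather than with the varargs frame index. The contiguous layout is what lets va_arg walk registers and stack with a single pointer, as in this standalone sketch (mine):

    // Sketch (mine): under AAPCS, r0-r3 are dumped next to the caller's
    // stack-argument area in the prologue, so va_arg simply advances one
    // pointer through the register spill slots and then onto the stack.
    #include <cstdarg>

    int sum_ints(int n, ...) {   // n arrives in r0; anonymous args follow
      va_list ap;
      va_start(ap, n);           // points at the register-save area first
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += va_arg(ap, int);    // r1-r3 slots, then stack arguments
      va_end(ap);
      return s;
    }
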
@@ -2675,14 +2764,20 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
           // Since they could be overwritten by lowering of arguments in case of
           // a tail call.
           if (Flags.isByVal()) {
-            unsigned VARegSize, VARegSaveSize;
-            computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
-            VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0);
-            unsigned Bytes = Flags.getByValSize() - VARegSize;
-            if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
-            int FI = MFI->CreateFixedObject(Bytes,
-                                            VA.getLocMemOffset(), false);
-            InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+            ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+            if (!AFI->getVarArgsFrameIndex()) {
+              VarArgStyleRegisters(CCInfo, DAG,
+                                   dl, Chain, CurOrigArg,
+                                   Ins[VA.getValNo()].PartOffset,
+                                   VA.getLocMemOffset(),
+                                   true /*force mutable frames*/);
+              int VAFrameIndex = AFI->getVarArgsFrameIndex();
+              InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy()));
+            } else {
+              int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+                                              VA.getLocMemOffset(), false);
+              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+            }
           } else {
             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                             VA.getLocMemOffset(), true);
@@ -2700,7 +2795,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,

   // varargs
   if (isVarArg)
-    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset());
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
+                         CCInfo.getNextStackOffset());

   return Chain;
 }
@@ -3469,6 +3565,114 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
 }

+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated.  The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]            each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand.  We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0    k1    k2    k3    ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+                                    BitCounts, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+  }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand.  The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input    = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+///            [k0 k1 k2 k3 ]
+///       N1 =+[k1 k0 k3 k2 ]
+///            [k0 k2 k1 k3 ]
+///       N2 =+[k1 k3 k0 k2 ]
+///            [k0 k2 k1 k3 ]
+/// Extended =+[k1 k3 k0 k2 ]
+///            [k0 k2       ]
+/// Extracted=+[k1 k3       ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+                                    DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+  }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+          VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  if (VT.getVectorElementType() == MVT::i32)
+    return lowerCTPOP32BitElements(N, DAG);
+  else
+    return lowerCTPOP16BitElements(N, DAG);
+}
+
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
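
The vcnt/vrev16/vadd/vuzp pipeline above is pairwise summation of per-byte popcounts. A scalar model of the v4i16 path, checked against a plain popcount (the modeling code is mine; __builtin_popcount assumes GCC or Clang):

    // Scalar model (mine) of the v4i16 CTPOP lowering: vcnt gives
    // per-byte counts, vrev16+vadd sums adjacent byte pairs (lane i and
    // lane i^1), and vuzp gathers the even lanes.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t in[4] = {0xFFFF, 0x0001, 0x0F0F, 0x1234};
      uint8_t b[8], sum[8];
      for (int i = 0; i < 8; ++i) {           // vcnt.8: per-byte counts
        uint8_t byte = (in[i / 2] >> (8 * (i % 2))) & 0xFF;
        b[i] = (uint8_t)__builtin_popcount(byte);
      }
      for (int i = 0; i < 8; ++i)             // vrev16.8 + vadd.i8
        sum[i] = b[i] + b[i ^ 1];             // pairwise sums, duplicated
      for (int i = 0; i < 4; ++i)             // vuzp.8: keep even lanes
        assert(sum[2 * i] == __builtin_popcount(in[i]));
    }
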
@@ -3673,27 +3877,6 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   return Result;
 }

-SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
-                                           const ARMSubtarget *ST) const {
-  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
-    return SDValue();
-
-  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
-  assert(Op.getValueType() == MVT::f32 &&
-         "ConstantFP custom lowering should only occur for f32.");
-
-  APFloat FPVal = CFP->getValueAPF();
-  int ImmVal = ARM_AM::getFP32Imm(FPVal);
-  if (ImmVal == -1)
-    return SDValue();
-
-  DebugLoc DL = Op.getDebugLoc();
-  SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
-  SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
-                     DAG.getConstant(0, MVT::i32));
-}
-
 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
 /// valid vector constant for a NEON instruction with a "modified immediate"
 /// operand (e.g., VMOV).  If so, return the encoded value.
@@ -3830,6 +4013,88 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
   return DAG.getTargetConstant(EncodedVal, MVT::i32);
 }

+SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) const {
+  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
+    return SDValue();
+
+  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+  assert(Op.getValueType() == MVT::f32 &&
+         "ConstantFP custom lowering should only occur for f32.");
+
+  // Try splatting with a VMOV.f32...
+  APFloat FPVal = CFP->getValueAPF();
+  int ImmVal = ARM_AM::getFP32Imm(FPVal);
+  if (ImmVal != -1) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
+                                      NewVal);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // If that fails, try a VMOV.i32
+  EVT VMovVT;
+  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
+  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
+                                     VMOVModImm);
+  if (NewVal != SDValue()) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
+                                      NewVal);
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // Finally, try a VMVN.i32
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
+                             VMVNModImm);
+  if (NewVal != SDValue()) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+// check if an VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+
 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                        bool &ReverseVEXT, unsigned &Imm) {
   unsigned NumElts = VT.getVectorNumElements();
@@ -4096,10 +4361,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   }

   // Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
   unsigned NumElts = VT.getVectorNumElements();
   bool isOnlyLowElement = true;
   bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
   bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
   SDValue Value;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
@@ -4110,13 +4386,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;

-    if (!Value.getNode())
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
       Value = V;
-    else if (V != Value)
-      usesOnlyOneValue = false;
+    }
   }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;

-  if (!Value.getNode())
+  if (ValueCounts.size() == 0)
     return DAG.getUNDEF(VT);

   if (isOnlyLowElement)
@@ -4126,9 +4410,51 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,

   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
   // i32 and try again.
-  if (usesOnlyOneValue && EltSize <= 32) {
-    if (!isConstant)
-      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+  if (hasDominantValue && EltSize <= 32) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are VDUPing a value that comes directly from a vector, that will
+      // cause an unnecessary move to and from a GPR, where instead we could
+      // just use VDUPLANE.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // We need to create a new undef vector to use for the VDUPLANE if the
+        // size of the vector from which we get the value is different than the
+        // size of the vector that we need to create. We will insert the element
+        // such that the register coalescer will remove unnecessary copies.
+        if (VT != Value->getOperand(0).getValueType()) {
+          ConstantSDNode *constIndex;
+          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+          assert(constIndex && "The index is not a constant!");
+          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+                             VT.getVectorNumElements();
+          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+                        Value, DAG.getConstant(index, MVT::i32)),
+                           DAG.getConstant(index, MVT::i32));
+        } else {
+          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                           Value->getOperand(0), Value->getOperand(1));
+        }
+      }
+      else
+        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
     if (VT.getVectorElementType().isFloatingPoint()) {
       SmallVector<SDValue, 8> Ops;
       for (unsigned i = 0; i < NumElts; ++i)
@@ -4140,9 +4466,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       if (Val.getNode())
         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
     }
-    SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
-    if (Val.getNode())
-      return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    if (usesOnlyOneValue) {
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (isConstant && Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    }
   }

   // If all elements are constants and the case above didn't get hit, fall back
@@ -4208,6 +4536,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,

     // Record this extraction against the appropriate vector if possible...
     SDValue SourceVec = V.getOperand(0);
+    // If the element number isn't a constant, we can't effectively
+    // analyze what's going on.
+    if (!isa<ConstantSDNode>(V.getOperand(1)))
+      return SDValue();
     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
     bool FoundSource = false;
     for (unsigned j = 0; j < SourceVecs.size(); ++j) {
@@ -4521,6 +4853,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
     if (isVREVMask(ShuffleMask, VT, 16))
       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

+    if (V2->getOpcode() == ISD::UNDEF &&
+        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
     // Check for Neon shuffles that modify both input vectors in place.
     // If both results are used, i.e., if there are two shuffles with the same
     // source operands and with masks corresponding to both results of one of
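
The BUILD_VECTOR hunks above implement the "dominant value" heuristic: splat the value that fills more than half the lanes with one VDUP, then patch the differing lanes with lane inserts. A standalone model of the decision (mine):

    // Sketch (mine) of the dominant-value heuristic using plain ints in
    // place of SDValues: count occurrences, splat the dominant value,
    // then overwrite the lanes that differ.
    #include <cstdio>
    #include <map>

    int main() {
      int lanes[4] = {7, 7, 7, 9};             // 7 dominates (3 of 4 lanes)
      std::map<int, unsigned> counts;
      int dominant = lanes[0];
      bool hasDominant = false;
      for (int v : lanes)
        if (++counts[v] > 4u / 2) { hasDominant = true; dominant = v; }

      if (hasDominant) {
        int out[4] = {dominant, dominant, dominant, dominant}; // vdup
        for (int i = 0; i < 4; ++i)
          if (lanes[i] != dominant) out[i] = lanes[i];         // lane insert
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      }
    }
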
@@ -4720,16 +5058,76 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
   return false;
 }

-/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
-/// load, or BUILD_VECTOR with extended elements, return the unextended value.
-static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                            const EVT &OrigTy,
+                                            const EVT &ExtTy,
+                                            unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
+  EVT NewVT;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Orig Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+    NewVT = MVT::v2i32;
+    break;
+  case MVT::v4i8:
+    NewVT = MVT::v4i16;
+    break;
+  }
+  return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  SDValue NonExtendingLoad =
+    DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+                LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                LD->isNonTemporal(), LD->isInvariant(),
+                LD->getAlignment());
+  unsigned ExtOp = 0;
+  switch (LD->getExtensionType()) {
+  default: llvm_unreachable("Unexpected LoadExtType");
+  case ISD::EXTLOAD:
+  case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
+  case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
+  }
+  MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
+  MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
+  return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
+                                      MemType, ExtType, ExtOp);
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
-    return N->getOperand(0);
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
-    return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
-                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
-                       LD->isNonTemporal(), LD->isInvariant(),
-                       LD->getAlignment());
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
   // have been legalized as a BITCAST from v4i32.
   if (N->getOpcode() == ISD::BITCAST) {
@@ -4750,7 +5148,9 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
   for (unsigned i = 0; i != NumElts; ++i) {
     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
     const APInt &CInt = C->getAPIntValue();
-    Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT));
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
   }
   return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
                      MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
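
VMULL reads two 64-bit D registers and writes a 128-bit Q register, which is why SkipExtensionForVMULL must re-extend sub-64-bit vectors instead of simply dropping the extension node. A scalar model of the widening multiply itself (mine):

    // Scalar model (mine) of what vmull.s16 computes: each pair of 16-bit
    // lanes is multiplied into a 32-bit lane, so the products never
    // overflow the element type.
    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t a[4] = {-3, 100, -32768, 7};
      int16_t b[4] = {2, -50, 2, 3};
      int32_t r[4];
      for (int i = 0; i < 4; ++i)
        r[i] = (int32_t)a[i] * (int32_t)b[i];   // widen, then multiply
      assert(r[2] == -65536);                   // would overflow in 16 bits
    }
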
@@ -4782,7 +5182,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
   EVT VT = Op.getValueType();
-  assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
   SDNode *N0 = Op.getOperand(0).getNode();
   SDNode *N1 = Op.getOperand(1).getNode();
   unsigned NewOpc = 0;
@@ -4825,9 +5226,9 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Legalize to a VMULL instruction.
   DebugLoc DL = Op.getDebugLoc();
   SDValue Op0;
-  SDValue Op1 = SkipExtension(N1, DAG);
+  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
   if (!isMLA) {
-    Op0 = SkipExtension(N0, DAG);
+    Op0 = SkipExtensionForVMULL(N0, DAG);
     assert(Op0.getValueType().is64BitVector() &&
            Op1.getValueType().is64BitVector() &&
            "unexpected types for extended operands to VMULL");
@@ -4842,8 +5243,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   //   vaddl q0, d4, d5
   //   vmovl q1, d6
   //   vmul  q0, q0, q1
-  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
-  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
   EVT Op1VT = Op1.getValueType();
   return DAG.getNode(N0->getOpcode(), DL, VT,
                      DAG.getNode(NewOpc, DL, VT,
@@ -5129,6 +5530,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SRL_PARTS:
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
@@ -5189,6 +5591,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
     return;
+  case ISD::ATOMIC_LOAD_MIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_MAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+    return;
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -5211,14 +5625,14 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   bool isThumb2 = Subtarget->isThumb2();

   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-  unsigned scratch =
-    MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass
-                                       : ARM::GPRRegisterClass);
+  unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass);

   if (isThumb2) {
-    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(newval, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
   }

   unsigned ldrOpc, strOpc;
@@ -5321,8 +5735,8 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,

   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   if (isThumb2) {
-    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
   }

   unsigned ldrOpc, strOpc;
@@ -5353,8 +5767,9 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);

-  const TargetRegisterClass *TRC =
-    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned scratch = MRI.createVirtualRegister(TRC);
   unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);

@@ -5428,8 +5843,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,

   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   if (isThumb2) {
-    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
   }

   unsigned ldrOpc, strOpc, extendOpc;
@@ -5463,8 +5878,9 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);

-  const TargetRegisterClass *TRC =
-    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned scratch = MRI.createVirtualRegister(TRC);
   unsigned scratch2 = MRI.createVirtualRegister(TRC);

@@ -5477,7 +5893,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   //   ldrex dest, ptr
   //   (sign extend dest, if required)
   //   cmp dest, incr
-  //   cmov.cond scratch2, dest, incr
+  //   cmov.cond scratch2, incr, dest
   //   strex scratch, scratch2, ptr
   //   cmp scratch, #0
   //   bne- loopMBB
@@ -5490,7 +5906,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,

   // Sign extend the value, if necessary.
   if (signExtend && extendOpc) {
-    oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
+    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
     AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
                      .addReg(dest)
                      .addImm(0));
@@ -5500,7 +5916,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
                  .addReg(oldval).addReg(incr));
   BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
-                 .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+                 .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);

   MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
   if (strOpc == ARM::t2STREX)
@@ -5526,7 +5942,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
 MachineBasicBlock *
 ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                                       unsigned Op1, unsigned Op2,
-                                      bool NeedsCarry, bool IsCmpxchg) const {
+                                      bool NeedsCarry, bool IsCmpxchg,
+                                      bool IsMinMax, ARMCC::CondCodes CC) const {
   // This also handles ATOMIC_SWAP, indicated by Op1==0.
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -5545,9 +5962,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   if (isThumb2) {
-    MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass);
-    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
   }

   unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
@@ -5555,16 +5972,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *contBB = 0, *cont2BB = 0;
-  if (IsCmpxchg) {
+  if (IsCmpxchg || IsMinMax)
     contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  if (IsCmpxchg)
     cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
-  }
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
   MF->insert(It, loopMBB);
-  if (IsCmpxchg) {
-    MF->insert(It, contBB);
-    MF->insert(It, cont2BB);
-  }
+  if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
+  if (IsCmpxchg) MF->insert(It, cont2BB);
   MF->insert(It, exitMBB);

   // Transfer the remainder of BB and its successor edges to exitMBB.
@@ -5573,8 +5989,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);

-  const TargetRegisterClass *TRC =
-    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned storesuccess = MRI.createVirtualRegister(TRC);

   // thisMBB:
@@ -5599,12 +6016,32 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   // for ldrexd must be different.
   BB = loopMBB;
   // Load
+  unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+  unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+  unsigned GPRPair2;
+  if (IsMinMax) {
+    // We need an extra double register for doing min/max.
+ unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(undef) + .addReg(vallo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2) + .addReg(r1) + .addReg(valhi) + .addImm(ARM::gsub_1); + } + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) - .addReg(ARM::R2, RegState::Define) - .addReg(ARM::R3, RegState::Define).addReg(ptr)); + .addReg(GPRPair0, RegState::Define).addReg(ptr)); // Copy r2/r3 into dest. (This copy will normally be coalesced.) - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); if (IsCmpxchg) { // Add early exit @@ -5623,24 +6060,67 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Copy to physregs for strexd unsigned setlo = MI->getOperand(5).getReg(); unsigned sethi = MI->getOperand(6).getReg(); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi); + unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(undef) + .addReg(setlo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(sethi) + .addImm(ARM::gsub_1); } else if (Op1) { // Perform binary operation - AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0) + unsigned tmpRegLo = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo) .addReg(destlo).addReg(vallo)) .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry)); - AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1) - .addReg(desthi).addReg(valhi)).addReg(0); + unsigned tmpRegHi = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi) + .addReg(desthi).addReg(valhi)) + .addReg(IsMinMax ? 
ARM::CPSR : 0, getDefRegState(IsMinMax)); + + unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(UndefPair) + .addReg(tmpRegLo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(tmpRegHi) + .addImm(ARM::gsub_1); } else { // Copy to physregs for strexd - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi); + unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(UndefPair) + .addReg(vallo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(valhi) + .addImm(ARM::gsub_1); + } + unsigned GPRPairStore = GPRPair1; + if (IsMinMax) { + // Compare and branch to exit block. + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR); + BB->addSuccessor(exitMBB); + BB->addSuccessor(contBB); + BB = contBB; + GPRPairStore = GPRPair2; } // Store AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) - .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr)); + .addReg(GPRPairStore).addReg(ptr)); // Cmp+jump AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(storesuccess).addImm(0)); @@ -5681,8 +6161,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); - const TargetRegisterClass *TRC = - isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = isThumb ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = @@ -5786,15 +6267,17 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); - const TargetRegisterClass *TRC = - Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + const TargetRegisterClass *TRC = Subtarget->isThumb() ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. 
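Stepping back to the 64-bit atomic expansion above: with IsMinMax set, the pair comparison decides whether the strexd runs at all, branching straight to exitMBB when the loaded value already satisfies the ordering. As a rough functional model of what the ATOMMIN6432 pseudo computes (standalone C++, not the MI-level code; atomic_min64 is an invented name, and compare_exchange_weak stands in for the ldrexd/strexd retry), consider the following; the SjLj dispatch changes resume below.

  #include <atomic>
  #include <cstdint>

  // Model of the expanded loop: reload on strexd failure, early exit to the
  // exit block when the current value is already <= the operand, and the
  // result is the previous value, as atomicrmw requires.
  int64_t atomic_min64(std::atomic<int64_t> &loc, int64_t val) {
    int64_t old = loc.load(std::memory_order_relaxed);  // ldrexd dest pair
    while (old > val &&                                 // cmp/sbcs + b<cc> exit
           !loc.compare_exchange_weak(old, val))        // strexd, retry on fail
      ;
    return old;
  }

  int main() {
    std::atomic<int64_t> v(10);
    return (atomic_min64(v, 3) == 10 && v.load() == 3) ? 0 : 1;
  }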
DenseMap > CallSiteNumToLPad; unsigned MaxCSNum = 0; MachineModuleInfo &MMI = MF->getMMI(); - for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; + ++BB) { if (!BB->isLandingPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing @@ -5866,12 +6349,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); - if (AFI->isThumb1OnlyFunction()) - BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup)); - else if (!Subtarget->hasVFP2()) - BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp)); - else - BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); + MachineInstrBuilder MIB; + MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); + + const ARMBaseInstrInfo *AII = static_cast(TII); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + + // Add a register mask with no preserved registers. This results in all + // registers being marked as clobbered. + MIB.addRegMask(RI.getNoPreservedMask()); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { @@ -5943,9 +6429,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty); + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getTargetData()->getTypeAllocSize(C->getType()); + Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6032,9 +6518,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty); + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getTargetData()->getTypeAllocSize(C->getType()); + Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6081,18 +6567,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { } // Add the jump table entries as successors to the MBB. - MachineBasicBlock *PrevMBB = 0; + SmallPtrSet SeenMBBs; for (std::vector::iterator I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { MachineBasicBlock *CurMBB = *I; - if (PrevMBB != CurMBB) + if (SeenMBBs.insert(CurMBB)) DispContBB->addSuccessor(CurMBB); - PrevMBB = CurMBB; } // N.B. the order the invoke BBs are processed in doesn't matter here. 
- const ARMBaseInstrInfo *AII = static_cast(TII); - const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF); SmallVector MBBLPads; for (SmallPtrSet::iterator @@ -6134,14 +6617,12 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { for (unsigned i = 0; SavedRegs[i] != 0; ++i) { unsigned Reg = SavedRegs[i]; if (Subtarget->isThumb2() && - !ARM::tGPRRegisterClass->contains(Reg) && - !ARM::hGPRRegisterClass->contains(Reg)) + !ARM::tGPRRegClass.contains(Reg) && + !ARM::hGPRRegClass.contains(Reg)) continue; - else if (Subtarget->isThumb1Only() && - !ARM::tGPRRegisterClass->contains(Reg)) + if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) continue; - else if (!Subtarget->isThumb() && - !ARM::GPRRegisterClass->contains(Reg)) + if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) continue; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); @@ -6172,6 +6653,306 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { llvm_unreachable("Expecting a BB with two successors!"); } +MachineBasicBlock *ARMTargetLowering:: +EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { + // This pseudo instruction has 3 operands: dst, src, size + // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). + // Otherwise, we will generate unrolled scalar copies. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned src = MI->getOperand(1).getReg(); + unsigned SizeVal = MI->getOperand(2).getImm(); + unsigned Align = MI->getOperand(3).getImm(); + DebugLoc dl = MI->getDebugLoc(); + + bool isThumb2 = Subtarget->isThumb2(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned ldrOpc, strOpc, UnitSize = 0; + + const TargetRegisterClass *TRC = isThumb2 ? + (const TargetRegisterClass*)&ARM::tGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass; + const TargetRegisterClass *TRC_Vec = 0; + + if (Align & 1) { + ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; + strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; + UnitSize = 1; + } else if (Align & 2) { + ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST; + strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST; + UnitSize = 2; + } else { + // Check whether we can use NEON instructions. + if (!MF->getFunction()->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat) && + Subtarget->hasNEON()) { + if ((Align % 16 == 0) && SizeVal >= 16) { + ldrOpc = ARM::VLD1q32wb_fixed; + strOpc = ARM::VST1q32wb_fixed; + UnitSize = 16; + TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass; + } + else if ((Align % 8 == 0) && SizeVal >= 8) { + ldrOpc = ARM::VLD1d32wb_fixed; + strOpc = ARM::VST1d32wb_fixed; + UnitSize = 8; + TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass; + } + } + // Can't use NEON instructions. + if (UnitSize == 0) { + ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; + strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM; + UnitSize = 4; + } + } + + unsigned BytesLeft = SizeVal % UnitSize; + unsigned LoopSize = SizeVal - BytesLeft; + + if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { + // Use LDR and STR to copy. 
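The opcode and unit-size selection above is the heart of EmitStructByval: the low bits of the alignment pick the widest safe unit, preferring 16- or 8-byte NEON units when the subtarget allows, and the remainder is peeled off as a byte epilogue. A standalone sketch of just that arithmetic (pickUnitSize is a hypothetical helper mirroring the logic; hasNEON stands in for the subtarget and NoImplicitFloat checks); the unrolled copy itself continues below.

  #include <cstdio>

  unsigned pickUnitSize(unsigned Align, unsigned SizeVal, bool hasNEON) {
    if (Align & 1) return 1;                                    // LDRB/STRB
    if (Align & 2) return 2;                                    // LDRH/STRH
    if (hasNEON && Align % 16 == 0 && SizeVal >= 16) return 16; // VLD1/VST1 q
    if (hasNEON && Align % 8 == 0 && SizeVal >= 8) return 8;    // VLD1/VST1 d
    return 4;                                                   // LDR/STR
  }

  int main() {
    unsigned SizeVal = 70;
    unsigned UnitSize = pickUnitSize(/*Align=*/16, SizeVal, /*hasNEON=*/true);
    unsigned BytesLeft = SizeVal % UnitSize;  // copied by the byte epilogue
    unsigned LoopSize = SizeVal - BytesLeft;  // copied by the main body
    std::printf("unit=%u loop=%u left=%u\n", UnitSize, LoopSize, BytesLeft);
    return 0;
  }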
+ // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) + // [destOut] = STR_POST(scratch, destIn, UnitSize) + unsigned srcIn = src; + unsigned destIn = dest; + for (unsigned i = 0; i < LoopSize; i+=UnitSize) { + unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + if (UnitSize >= 8) { + AddDefaultPred(BuildMI(*BB, MI, dl, + TII->get(ldrOpc), scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0)); + + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) + .addReg(destIn).addImm(0).addReg(scratch)); + } else if (isThumb2) { + AddDefaultPred(BuildMI(*BB, MI, dl, + TII->get(ldrOpc), scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize)); + + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addImm(UnitSize)); + } else { + AddDefaultPred(BuildMI(*BB, MI, dl, + TII->get(ldrOpc), scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0) + .addImm(UnitSize)); + + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(UnitSize)); + } + srcIn = srcOut; + destIn = destOut; + } + + // Handle the leftover bytes with LDRB and STRB. + // [scratch, srcOut] = LDRB_POST(srcIn, 1) + // [destOut] = STRB_POST(scratch, destIn, 1) + ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; + strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; + for (unsigned i = 0; i < BytesLeft; i++) { + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + if (isThumb2) { + AddDefaultPred(BuildMI(*BB, MI, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); + + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(1)); + } else { + AddDefaultPred(BuildMI(*BB, MI, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn) + .addReg(0).addImm(1)); + + AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(1)); + } + srcIn = srcOut; + destIn = destOut; + } + MI->eraseFromParent(); // The instruction is gone now. + return BB; + } + + // Expand the pseudo op to a loop. + // thisMBB: + // ... + // movw varEnd, # --> with thumb2 + // movt varEnd, # + // ldrcp varEnd, idx --> without thumb2 + // fallthrough --> loopMBB + // loopMBB: + // PHI varPhi, varEnd, varLoop + // PHI srcPhi, src, srcLoop + // PHI destPhi, dst, destLoop + // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) + // [destLoop] = STR_POST(scratch, destPhi, UnitSize) + // subs varLoop, varPhi, #UnitSize + // bne loopMBB + // fallthrough --> exitMBB + // exitMBB: + // epilogue to handle left-over bytes + // [scratch, srcOut] = LDRB_POST(srcLoop, 1) + // [destOut] = STRB_POST(scratch, destLoop, 1) + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Load an immediate to varEnd. 
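The comment block above describes the loop emitted when the copy is too large to unroll; varEnd is materialized next. Modulo machine details (PHIs, CPSR, post-increment writeback), the generated control flow reduces to this C++ model (illustrative only; byvalCopyModel is an invented name):

  #include <cstdint>
  #include <cstring>

  void byvalCopyModel(uint8_t *dst, const uint8_t *src, unsigned LoopSize,
                      unsigned BytesLeft, unsigned UnitSize) {
    unsigned var = LoopSize;            // varEnd, decremented each iteration
    while (var != 0) {                  // loopMBB: subs varLoop + bne
      std::memcpy(dst, src, UnitSize);  // LDR_POST + STR_POST pair
      src += UnitSize;                  // srcLoop writeback
      dst += UnitSize;                  // destLoop writeback
      var -= UnitSize;
    }
    for (unsigned i = 0; i < BytesLeft; ++i)  // exitMBB: LDRB/STRB_POST
      *dst++ = *src++;
  }

  int main() {
    uint8_t src[70], dst[70];
    for (unsigned i = 0; i < 70; ++i) src[i] = (uint8_t)i;
    byvalCopyModel(dst, src, /*LoopSize=*/64, /*BytesLeft=*/6, /*UnitSize=*/16);
    return std::memcmp(dst, src, 70) == 0 ? 0 : 1;
  }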
+ unsigned varEnd = MRI.createVirtualRegister(TRC); + if (isThumb2) { + unsigned VReg1 = varEnd; + if ((LoopSize & 0xFFFF0000) != 0) + VReg1 = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1) + .addImm(LoopSize & 0xFFFF)); + + if ((LoopSize & 0xFFFF0000) != 0) + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) + .addReg(VReg1) + .addImm(LoopSize >> 16)); + } else { + MachineConstantPool *ConstantPool = MF->getConstantPool(); + Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); + const Constant *C = ConstantInt::get(Int32Ty, LoopSize); + + // MachineConstantPool wants an explicit alignment. + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = getDataLayout()->getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp)) + .addReg(varEnd, RegState::Define) + .addConstantPoolIndex(Idx) + .addImm(0)); + } + BB->addSuccessor(loopMBB); + + // Generate the loop body: + // varPhi = PHI(varLoop, varEnd) + // srcPhi = PHI(srcLoop, src) + // destPhi = PHI(destLoop, dst) + MachineBasicBlock *entryBB = BB; + BB = loopMBB; + unsigned varLoop = MRI.createVirtualRegister(TRC); + unsigned varPhi = MRI.createVirtualRegister(TRC); + unsigned srcLoop = MRI.createVirtualRegister(TRC); + unsigned srcPhi = MRI.createVirtualRegister(TRC); + unsigned destLoop = MRI.createVirtualRegister(TRC); + unsigned destPhi = MRI.createVirtualRegister(TRC); + + BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) + .addReg(varLoop).addMBB(loopMBB) + .addReg(varEnd).addMBB(entryBB); + BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) + .addReg(srcLoop).addMBB(loopMBB) + .addReg(src).addMBB(entryBB); + BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) + .addReg(destLoop).addMBB(loopMBB) + .addReg(dest).addMBB(entryBB); + + // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) + // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) + unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); + if (UnitSize >= 8) { + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) + .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0)); + + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) + .addReg(destPhi).addImm(0).addReg(scratch)); + } else if (isThumb2) { + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) + .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize)); + + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) + .addReg(scratch).addReg(destPhi) + .addImm(UnitSize)); + } else { + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) + .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0) + .addImm(UnitSize)); + + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) + .addReg(scratch).addReg(destPhi) + .addReg(0).addImm(UnitSize)); + } + + // Decrement loop variable by UnitSize. + MachineInstrBuilder MIB = BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); + AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); + MIB->getOperand(5).setReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + // loopMBB can loop back to loopMBB or fall through to exitMBB. + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // Add epilogue to handle BytesLeft. 
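In the Thumb2 path just shown, varEnd is materialized with a t2MOVi16/t2MOVTi16 pair, each carrying a 16-bit immediate, and movt is skipped when the high half is zero; the ARM path loads from the constant pool instead. The split itself is plain arithmetic (hedged sketch; materialize is an invented name):

  #include <cassert>
  #include <cstdint>

  uint32_t materialize(uint32_t LoopSize) {
    uint32_t reg = LoopSize & 0xFFFF;                 // movw: low half, top zeroed
    if (LoopSize & 0xFFFF0000)                        // movt only when needed
      reg = (reg & 0xFFFF) | (LoopSize & 0xFFFF0000); // movt: replace high half
    return reg;
  }

  int main() {
    assert(materialize(0x12345678) == 0x12345678);
    assert(materialize(70) == 70);  // small LoopSize: movw alone suffices
    return 0;
  }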
+ BB = exitMBB; + MachineInstr *StartOfExit = exitMBB->begin(); + ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; + strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; + + // [scratch, srcOut] = LDRB_POST(srcLoop, 1) + // [destOut] = STRB_POST(scratch, destLoop, 1) + unsigned srcIn = srcLoop; + unsigned destIn = destLoop; + for (unsigned i = 0; i < BytesLeft; i++) { + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + if (isThumb2) { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addImm(1)); + } else { + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, + TII->get(ldrOpc),scratch) + .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); + + AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) + .addReg(scratch).addReg(destIn) + .addReg(0).addImm(1)); + } + srcIn = srcOut; + destIn = destOut; + } + + MI->eraseFromParent(); // The instruction is gone now. + return BB; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -6337,6 +7118,26 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ false, /*IsCmpxchg*/true); + case ARM::ATOMMIN6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::LE); + case ARM::ATOMMAX6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::GE); + case ARM::ATOMUMIN6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::LS); + case ARM::ATOMUMAX6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::HS); case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -6475,10 +7276,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewMovDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); - unsigned NewRsbDstReg = MRI.createVirtualRegister( - isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); + unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? + (const TargetRegisterClass*)&ARM::rGPRRegClass : + (const TargetRegisterClass*)&ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -6492,12 +7292,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); - // insert a movs at the end of BB - BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2MOVr : ARM::MOVr), - NewMovDstReg) - .addReg(ABSSrcReg, RegState::Kill) - .addImm((unsigned)ARMCC::AL).addReg(0) - .addReg(ARM::CPSR, RegState::Define); + // insert a cmp at the end of BB + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(ABSSrcReg).addImm(0)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, @@ -6509,7 +7307,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) - .addReg(NewMovDstReg, RegState::Kill) + .addReg(ABSSrcReg, RegState::Kill) .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); // insert PHI in SinkBB, @@ -6517,7 +7315,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) - .addReg(NewMovDstReg).addMBB(BB); + .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI->eraseFromParent(); @@ -6525,6 +7323,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // return last added BB return SinkBB; } + case ARM::COPY_STRUCT_BYVAL_I32: + ++NumLoopByVals; + return EmitStructByval(MI, BB); } } @@ -6604,62 +7405,137 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // ARM Optimization Hooks //===----------------------------------------------------------------------===// +// Helper function that checks if N is a null or all ones constant. +static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { + ConstantSDNode *C = dyn_cast(N); + if (!C) + return false; + return AllOnes ? C->isAllOnesValue() : C->isNullValue(); +} + +// Return true if N is conditionally 0 or all ones. +// Detects these expressions where cc is an i1 value: +// +// (select cc 0, y) [AllOnes=0] +// (select cc y, 0) [AllOnes=0] +// (zext cc) [AllOnes=0] +// (sext cc) [AllOnes=0/1] +// (select cc -1, y) [AllOnes=1] +// (select cc y, -1) [AllOnes=1] +// +// Invert is set when N is the null/all ones constant when CC is false. +// OtherOp is set to the alternative value of N. +static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, + SDValue &CC, bool &Invert, + SDValue &OtherOp, + SelectionDAG &DAG) { + switch (N->getOpcode()) { + default: return false; + case ISD::SELECT: { + CC = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + if (isZeroOrAllOnes(N1, AllOnes)) { + Invert = false; + OtherOp = N2; + return true; + } + if (isZeroOrAllOnes(N2, AllOnes)) { + Invert = true; + OtherOp = N1; + return true; + } + return false; + } + case ISD::ZERO_EXTEND: + // (zext cc) can never be the all ones value. + if (AllOnes) + return false; + // Fall through. + case ISD::SIGN_EXTEND: { + EVT VT = N->getValueType(0); + CC = N->getOperand(0); + if (CC.getValueType() != MVT::i1) + return false; + Invert = !AllOnes; + if (AllOnes) + // When looking for an AllOnes constant, N is an sext, and the 'other' + // value is 0. + OtherOp = DAG.getConstant(0, VT); + else if (N->getOpcode() == ISD::ZERO_EXTEND) + // When looking for a 0 constant, N can be zext or sext. 
+      OtherOp = DAG.getConstant(1, VT);
+    else
+      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+    return true;
+  }
+  }
+}
+
+// Combine a constant select operand into its use:
+//
+// (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+// (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+// (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+// (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+// (add (zext cc), x) -> (select cc (add x, 1), x)
+// (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
 static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   bool AllOnes = false) {
   SelectionDAG &DAG = DCI.DAG;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = N->getValueType(0);
-  unsigned Opc = N->getOpcode();
-  bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
-  SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
-  SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2);
-  ISD::CondCode CC = ISD::SETCC_INVALID;
-
-  if (isSlctCC) {
-    CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get();
-  } else {
-    SDValue CCOp = Slct.getOperand(0);
-    if (CCOp.getOpcode() == ISD::SETCC)
-      CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get();
-  }
-
-  bool DoXform = false;
-  bool InvCC = false;
-  assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) &&
-          "Bad input!");
-
-  if (LHS.getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(LHS)->isNullValue()) {
-    DoXform = true;
-  } else if (CC != ISD::SETCC_INVALID &&
-             RHS.getOpcode() == ISD::Constant &&
-             cast<ConstantSDNode>(RHS)->isNullValue()) {
-    std::swap(LHS, RHS);
-    SDValue Op0 = Slct.getOperand(0);
-    EVT OpVT = isSlctCC ? Op0.getValueType() :
-                          Op0.getOperand(0).getValueType();
-    bool isInt = OpVT.isInteger();
-    CC = ISD::getSetCCInverse(CC, isInt);
-
-    if (!TLI.isCondCodeLegal(CC, OpVT))
-      return SDValue();         // Inverse operator isn't legal.
-
-    DoXform = true;
-    InvCC = true;
-  }
-
-  if (DoXform) {
-    SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS);
-    if (isSlctCC)
-      return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result,
-                             Slct.getOperand(0), Slct.getOperand(1), CC);
-    SDValue CCOp = Slct.getOperand(0);
-    if (InvCC)
-      CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(),
-                          CCOp.getOperand(0), CCOp.getOperand(1), CC);
-    return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
-                       CCOp, OtherOp, Result);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now known to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+                                 OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+ if (SwapSelectOps) + std::swap(TrueVal, FalseVal); + + return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT, + CCOp, TrueVal, FalseVal); +} + +// Attempt combineSelectAndUse on each operand of a commutative operator N. +static +SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + if (N0.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes); + if (Result.getNode()) + return Result; + } + if (N1.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes); + if (Result.getNode()) + return Result; } return SDValue(); } @@ -6753,6 +7629,154 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); } +static SDValue findMUL_LOHI(SDValue V) { + if (V->getOpcode() == ISD::UMUL_LOHI || + V->getOpcode() == ISD::SMUL_LOHI) + return V; + return SDValue(); +} + +static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + if (Subtarget->isThumb1Only()) return SDValue(); + + // Only perform the checks after legalize when the pattern is available. + if (DCI.isBeforeLegalize()) return SDValue(); + + // Look for multiply add opportunities. + // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where + // each add nodes consumes a value from ISD::UMUL_LOHI and there is + // a glue link from the first add to the second add. + // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by + // a S/UMLAL instruction. + // loAdd UMUL_LOHI + // \ / :lo \ :hi + // \ / \ [no multiline comment] + // ADDC | hiAdd + // \ :glue / / + // \ / / + // ADDE + // + assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); + SDValue AddcOp0 = AddcNode->getOperand(0); + SDValue AddcOp1 = AddcNode->getOperand(1); + + // Check if the two operands are from the same mul_lohi node. + if (AddcOp0.getNode() == AddcOp1.getNode()) + return SDValue(); + + assert(AddcNode->getNumValues() == 2 && + AddcNode->getValueType(0) == MVT::i32 && + AddcNode->getValueType(1) == MVT::Glue && + "Expect ADDC with two result values: i32, glue"); + + // Check that the ADDC adds the low result of the S/UMUL_LOHI. + if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && + AddcOp0->getOpcode() != ISD::SMUL_LOHI && + AddcOp1->getOpcode() != ISD::UMUL_LOHI && + AddcOp1->getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + // Look for the glued ADDE. + SDNode* AddeNode = AddcNode->getGluedUser(); + if (AddeNode == NULL) + return SDValue(); + + // Make sure it is really an ADDE. + if (AddeNode->getOpcode() != ISD::ADDE) + return SDValue(); + + assert(AddeNode->getNumOperands() == 3 && + AddeNode->getOperand(2).getValueType() == MVT::Glue && + "ADDE node has the wrong inputs"); + + // Check for the triangle shape. + SDValue AddeOp0 = AddeNode->getOperand(0); + SDValue AddeOp1 = AddeNode->getOperand(1); + + // Make sure that the ADDE operands are not coming from the same node. + if (AddeOp0.getNode() == AddeOp1.getNode()) + return SDValue(); + + // Find the MUL_LOHI node walking up ADDE's operands. + bool IsLeftOperandMUL = false; + SDValue MULOp = findMUL_LOHI(AddeOp0); + if (MULOp == SDValue()) + MULOp = findMUL_LOHI(AddeOp1); + else + IsLeftOperandMUL = true; + if (MULOp == SDValue()) + return SDValue(); + + // Figure out the right opcode. 
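The shape being matched here is a 64-bit multiply-accumulate, exactly what one UMLAL or SMLAL performs: UMUL_LOHI yields both product halves, ADDC adds the low words, and the glued ADDE folds the resulting carry into the high words. A scalar model of the unsigned case (plain C++; the names are illustrative, not the DAG code); the opcode selection resumes below.

  #include <cstdint>

  // {hi,lo} += a * b, computed in 32-bit halves with an explicit carry.
  void umlal(uint32_t &hi, uint32_t &lo, uint32_t a, uint32_t b) {
    uint64_t prod = (uint64_t)a * b;                  // UMUL_LOHI
    uint64_t sumLo = (uint64_t)lo + (uint32_t)prod;   // ADDC: lo + prod.lo
    lo = (uint32_t)sumLo;
    hi += (uint32_t)(prod >> 32) + (uint32_t)(sumLo >> 32); // ADDE: hi + carry
  }

  int main() {
    uint32_t hi = 1, lo = 0xFFFFFFFFu;
    uint64_t acc = ((uint64_t)hi << 32) | lo;
    umlal(hi, lo, 0xDEADBEEFu, 0x12345678u);
    acc += (uint64_t)0xDEADBEEFu * 0x12345678u;
    return ((((uint64_t)hi << 32) | lo) == acc) ? 0 : 1;
  }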
+ unsigned Opc = MULOp->getOpcode(); + unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; + + // Figure out the high and low input values to the MLAL node. + SDValue* HiMul = &MULOp; + SDValue* HiAdd = NULL; + SDValue* LoMul = NULL; + SDValue* LowAdd = NULL; + + if (IsLeftOperandMUL) + HiAdd = &AddeOp1; + else + HiAdd = &AddeOp0; + + + if (AddcOp0->getOpcode() == Opc) { + LoMul = &AddcOp0; + LowAdd = &AddcOp1; + } + if (AddcOp1->getOpcode() == Opc) { + LoMul = &AddcOp1; + LowAdd = &AddcOp0; + } + + if (LoMul == NULL) + return SDValue(); + + if (LoMul->getNode() != HiMul->getNode()) + return SDValue(); + + // Create the merged node. + SelectionDAG &DAG = DCI.DAG; + + // Build operand list. + SmallVector Ops; + Ops.push_back(LoMul->getOperand(0)); + Ops.push_back(LoMul->getOperand(1)); + Ops.push_back(*LowAdd); + Ops.push_back(*HiAdd); + + SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(), + DAG.getVTList(MVT::i32, MVT::i32), + &Ops[0], Ops.size()); + + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(MLALNode.getNode(), 1); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + SDValue LoMLALResult(MLALNode.getNode(), 0); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + +/// PerformADDCCombine - Target-specific dag combine transform from +/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. +static SDValue PerformADDCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + return AddCombineTo64bitMLAL(N, DCI, Subtarget); + +} + /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted @@ -6767,7 +7791,7 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) - if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { + if (N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); if (Result.getNode()) return Result; } @@ -6799,7 +7823,7 @@ static SDValue PerformSUBCombine(SDNode *N, SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) - if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { + if (N1.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N1, N0, DCI); if (Result.getNode()) return Result; } @@ -6927,49 +7951,6 @@ static SDValue PerformMULCombine(SDNode *N, return SDValue(); } -static bool isCMOVWithZeroOrAllOnesLHS(SDValue N, bool AllOnes) { - if (N.getOpcode() != ARMISD::CMOV || !N.getNode()->hasOneUse()) - return false; - - SDValue FalseVal = N.getOperand(0); - ConstantSDNode *C = dyn_cast(FalseVal); - if (!C) - return false; - if (AllOnes) - return C->isAllOnesValue(); - return C->isNullValue(); -} - -/// formConditionalOp - Combine an operation with a conditional move operand -/// to form a conditional op. e.g. 
(or x, (cmov 0, y, cond)) => (or.cond x, y) -/// (and x, (cmov -1, y, cond)) => (and.cond, x, y) -static SDValue formConditionalOp(SDNode *N, SelectionDAG &DAG, - bool Commutable) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - bool isAND = N->getOpcode() == ISD::AND; - bool isCand = isCMOVWithZeroOrAllOnesLHS(N1, isAND); - if (!isCand && Commutable) { - isCand = isCMOVWithZeroOrAllOnesLHS(N0, isAND); - if (isCand) - std::swap(N0, N1); - } - if (!isCand) - return SDValue(); - - unsigned Opc = 0; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected node"); - case ISD::AND: Opc = ARMISD::CAND; break; - case ISD::OR: Opc = ARMISD::COR; break; - case ISD::XOR: Opc = ARMISD::CXOR; break; - } - return DAG.getNode(Opc, N->getDebugLoc(), N->getValueType(0), N0, - N1.getOperand(1), N1.getOperand(2), N1.getOperand(3), - N1.getOperand(4)); -} - static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -7004,10 +7985,10 @@ static SDValue PerformANDCombine(SDNode *N, } if (!Subtarget->isThumb1Only()) { - // (and x, (cmov -1, y, cond)) => (and.cond x, y) - SDValue CAND = formConditionalOp(N, DAG, true); - if (CAND.getNode()) - return CAND; + // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, true, DCI); + if (Result.getNode()) + return Result; } return SDValue(); @@ -7047,14 +8028,17 @@ static SDValue PerformORCombine(SDNode *N, } if (!Subtarget->isThumb1Only()) { - // (or x, (cmov 0, y, cond)) => (or.cond x, y) - SDValue COR = formConditionalOp(N, DAG, true); - if (COR.getNode()) - return COR; + // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, false, DCI); + if (Result.getNode()) + return Result; } + // The code below optimizes (or (and X, Y), Z). + // The AND operand needs to have a single user to make these optimizations + // profitable. SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); SDValue N1 = N->getOperand(1); @@ -7211,10 +8195,10 @@ static SDValue PerformXORCombine(SDNode *N, return SDValue(); if (!Subtarget->isThumb1Only()) { - // (xor x, (cmov 0, y, cond)) => (xor.cond x, y) - SDValue CXOR = formConditionalOp(N, DAG, true); - if (CXOR.getNode()) - return CXOR; + // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, false, DCI); + if (Result.getNode()) + return Result; } return SDValue(); @@ -7307,15 +8291,99 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - // Bitcast an i64 store extracted from a vector to f64. - // Otherwise, the i64 value will be legalized to a pair of i32 values. StoreSDNode *St = cast(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. 
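Before the truncating-store rewrite that follows, note that the select folds replacing formConditionalOp above are pure i32 identities, checkable without any DAG machinery (standalone spot check over a few values):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (bool cc : {false, true})
      for (int32_t x : {-7, 0, 42})
        for (int32_t c : {-3, 5}) {
          // (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
          assert((cc ? 0 : c) + x == (cc ? x : x + c));
          // (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
          assert(((cc ? -1 : c) & x) == (cc ? x : (x & c)));
          // (add (zext cc), x) -> (select cc, (add x, 1), x)
          assert((int32_t)cc + x == (cc ? x + 1 : x));
          // (add (sext cc), x) -> (select cc, (add x, -1), x)
          assert(-(int32_t)cc + x == (cc ? x - 1 : x));
        }
    return 0;
  }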
SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile()) + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + DebugLoc DL = St->getDebugLoc(); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], + Chains.size()); + } + + if (!ISD::isNormalStore(St)) return SDValue(); + // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and + // ARM stores of arguments in the same cache line. 
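Concretely, for a v4i32-to-v4i16 truncating store the shuffle above gathers every low half into the bottom lanes so that one wide store can replace four extract-and-truncate stores; the VMOVDRR split is handled next. A scalar model of the packing (assumed v4i32 source and little-endian layout, as on the targets this combine serves):

  #include <cstdint>
  #include <cstring>

  int main() {
    uint32_t val[4] = {0x11118888u, 0x2222AAAAu, 0x3333CCCCu, 0x4444EEEEu};
    uint16_t packed[4];
    for (int i = 0; i < 4; ++i)
      packed[i] = (uint16_t)val[i];  // ShuffleVec[i] = i * SizeRatio (= 2)
    uint64_t wide;                   // one i64 store instead of four i16 ops
    std::memcpy(&wide, packed, sizeof wide);
    return ((uint16_t)wide == 0x8888u) ? 0 : 1;  // little-endian: lane 0 lowest
  }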
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
-      StVal.getNode()->hasOneUse() && !St->isVolatile()) {
+      StVal.getNode()->hasOneUse()) {
     SelectionDAG &DAG = DCI.DAG;
     DebugLoc DL = St->getDebugLoc();
     SDValue BasePtr = St->getBasePtr();
@@ -7336,6 +8404,8 @@ static SDValue PerformSTORECombine(SDNode *N,
       StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return SDValue();

+  // Bitcast an i64 store extracted from a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
   SelectionDAG &DAG = DCI.DAG;
   DebugLoc dl = StVal.getDebugLoc();
   SDValue IntVec = StVal.getOperand(0);
@@ -8258,8 +9328,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   if (Res.getNode()) {
     APInt KnownZero, KnownOne;
-    APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits());
-    DAG.ComputeMaskedBits(SDValue(N,0), Mask, KnownZero, KnownOne);
+    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
     // Capture demanded bits information that would be otherwise lost.
     if (KnownZero == 0xfffffffe)
       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
@@ -8279,6 +9348,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
+  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
@@ -8339,18 +9409,36 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
 }

-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
-  if (!Subtarget->allowsUnalignedMem())
-    return false;
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
+  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

   switch (VT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i8:
   case MVT::i16:
-  case MVT::i32:
-    return true;
-    // FIXME: VLD1 etc with standard alignment is legal.
+  case MVT::i32: {
+    // Unaligned access can use (for example) LDRB, LDRH, LDR.
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = Subtarget->hasV7Ops();
+      return true;
+    }
+    return false;
+  }
+  case MVT::f64:
+  case MVT::v2f64: {
+    // For any little-endian targets with NEON, we can support unaligned ld/st
+    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+    // A big-endian target may also explicitly support unaligned accesses.
+    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
   }
 }

@@ -8369,26 +9457,51 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,

   // See if we can use NEON instructions for this...
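The getOptimalMemOpType changes continuing below consult the same hook. At the source level, the portable spelling of the unaligned accesses that allowsUnalignedMemoryAccesses legalizes is a memcpy-based load, which compilers reduce to a single LDR or VLD1 precisely when the target reports the access as legal and fast (sketch; loadUnaligned32 is an invented helper, little-endian assumed in the check):

  #include <cstdint>
  #include <cstring>

  uint32_t loadUnaligned32(const unsigned char *p) {
    uint32_t v;
    std::memcpy(&v, p, sizeof v);  // one ldr when SCTLR.A-style checks allow it
    return v;
  }

  int main() {
    unsigned char buf[8] = {0, 0xEF, 0xBE, 0xAD, 0xDE, 0, 0, 0};
    return loadUnaligned32(buf + 1) == 0xDEADBEEFu ? 0 : 1;  // offset 1
  }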
if (IsZeroVal && - !F->hasFnAttr(Attribute::NoImplicitFloat) && - Subtarget->hasNEON()) { - if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) { - return MVT::v4i32; - } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) { - return MVT::v2i32; + Subtarget->hasNEON() && + !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { + bool Fast; + if (Size >= 16 && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) { + return MVT::v2f64; + } else if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) { + return MVT::f64; } } // Lowering to i32/i16 if the size permits. - if (Size >= 4) { + if (Size >= 4) return MVT::i32; - } else if (Size >= 2) { + else if (Size >= 2) return MVT::i16; - } // Let the target-independent logic figure it out. return MVT::Other; } +bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + if (Val.getOpcode() != ISD::LOAD) + return false; + + EVT VT1 = Val.getValueType(); + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. + return true; + } + + return false; +} + static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; @@ -8585,19 +9698,28 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, /// a register against the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + // Thumb2 and ARM modes can use cmn for negative immediates. if (!Subtarget->isThumb()) - return ARM_AM::getSOImmVal(Imm) != -1; + return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(Imm) != -1; + return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1; + // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } -/// isLegalAddImmediate - Return true if the specified immediate is legal -/// add immediate, that is the target has add instructions which can add -/// a register with the immediate without having to materialize the +/// isLegalAddImmediate - Return true if the specified immediate is a legal add +/// *or sub* immediate, that is the target has add or sub instructions which can +/// add a register with the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { - return ARM_AM::getSOImmVal(Imm) != -1; + // Same encoding for add/sub, just flip the sign. + int64_t AbsImm = llvm::abs64(Imm); + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(AbsImm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(AbsImm) != -1; + // Thumb1 only has 8-bit unsigned immediate. 
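The Thumb1 clamp to [0, 255] follows; for ARM and Thumb2, both immediate-legality hooks lean on one duality: comparing against -imm sets the same flags as cmn against imm, and adding -imm is subtracting imm, so only the absolute value must fit the rotated-immediate encoding. The identities themselves (quick standalone check):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int64_t imm : {1LL, 255LL, 4096LL})
      for (int64_t r : {-5LL, 0LL, 1000LL}) {
        assert(r - (-imm) == r + imm);  // cmp r, #-imm  <=>  cmn r, #imm
        assert(r + (-imm) == r - imm);  // add r, #-imm  <=>  sub r, #imm
      }
    return 0;
  }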
+ return AbsImm >= 0 && AbsImm <= 255; } static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, @@ -8775,22 +9897,20 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, } void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. - DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); if (KnownZero == 0 && KnownOne == 0) return; APInt KnownZeroRHS, KnownOneRHS; - DAG.ComputeMaskedBits(Op.getOperand(1), Mask, - KnownZeroRHS, KnownOneRHS, Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); KnownZero &= KnownZeroRHS; KnownOne &= KnownOneRHS; return; @@ -8903,39 +10023,38 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, switch (Constraint[0]) { case 'l': // Low regs or general regs. if (Subtarget->isThumb()) - return RCPair(0U, ARM::tGPRRegisterClass); - else - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::tGPRRegClass); + return RCPair(0U, &ARM::GPRRegClass); case 'h': // High regs or no regs. if (Subtarget->isThumb()) - return RCPair(0U, ARM::hGPRRegisterClass); + return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': - return RCPair(0U, ARM::GPRRegisterClass); + return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPRRegisterClass); + return RCPair(0U, &ARM::DPRRegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPRRegisterClass); + return RCPair(0U, &ARM::QPRRegClass); break; case 'x': if (VT == MVT::f32) - return RCPair(0U, ARM::SPR_8RegisterClass); + return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) - return RCPair(0U, ARM::DPR_8RegisterClass); + return RCPair(0U, &ARM::DPR_8RegClass); if (VT.getSizeInBits() == 128) - return RCPair(0U, ARM::QPR_8RegisterClass); + return RCPair(0U, &ARM::QPR_8RegClass); break; case 't': if (VT == MVT::f32) - return RCPair(0U, ARM::SPRRegisterClass); + return RCPair(0U, &ARM::SPRRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } @@ -9141,6 +10260,24 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const { + if (VT.getSizeInBits() > 32) + return false; + + int32_t ImmVal = Imm.getSExtValue(); + if (!Subtarget->isThumb()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getSOImmVal(ImmVal) != -1) || + (ARM_AM::getSOImmVal(~ImmVal) != -1); + } else if (Subtarget->isThumb2()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getT2SOImmVal(ImmVal) != -1) || + (ARM_AM::getT2SOImmVal(~ImmVal) != -1); + } else /*Thumb1*/ { + return (ImmVal >= 0 && ImmVal < 256); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. 
The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
@@ -9157,7 +10294,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vld4lane: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
@@ -9182,7 +10319,7 @@
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
     }
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
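The conservative memVT sizing is just allocation size over eight, expressed in i64 elements. Worked example (assumed shapes: a vld4 returning four v4i32 vectors):

  #include <cstdint>

  int main() {
    uint64_t allocBytes = 4 * 16;       // four v4i32 results, 16 bytes each
    uint64_t NumElts = allocBytes / 8;  // getTypeAllocSize(I.getType()) / 8
    return (NumElts == 8) ? 0 : 1;      // memVT = v8i64 covers all 64 bytes
  }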