X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=159d4df00bb42b9936fb61661894d4fa8c0f61ff;hb=8163ca76f0b0d336c5436364ffb3b85be1162e7a;hp=c17e9ae381e9aa63c36fddf1c5f3aa7e65edb7f1;hpb=67514e90669ec9ffd954c1fcb6f8979bafcabe8a;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index c17e9ae381e..159d4df00bb 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -23,14 +23,9 @@ #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/GlobalValue.h" -#include "llvm/Instruction.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -40,14 +35,19 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instruction.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Type.h" using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -122,6 +122,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); @@ -514,6 +515,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); setOperationAction(ISD::FEXP, MVT::v4f32, Expand); setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); + setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); + setOperationAction(ISD::FRINT, MVT::v4f32, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); @@ -537,6 +543,17 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + + // NEON does not have single instruction CTPOP for vectors with element + // types wider than 8-bits. However, custom lowering can leverage the + // v8i8/v16i8 vcnt instruction. + setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -634,9 +651,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); - // These are expanded into libcalls. - if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) { - // v7M has a hardware divider + if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && + !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { + // These are expanded into libcalls if the cpu doesn't have HW divider. setOperationAction(ISD::SDIV, MVT::i32, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); } @@ -686,7 +703,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc. setInsertFencesForAtomic(true); @@ -796,12 +817,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); - - if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) { - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); - } + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); @@ -815,9 +833,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setSchedulingPreference(Sched::Hybrid); //// temporary - rewrite interface to use type - maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; - maxStoresPerMemset = 16; + maxStoresPerMemset = 8; maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores + maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. @@ -826,7 +847,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) benefitFromCodePlacementOpt = true; // Prefer likely predicted branches to selects on out-of-order cores. - predictableSelectIsExpensive = Subtarget->isCortexA9(); + predictableSelectIsExpensive = Subtarget->isLikeA9(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -1025,7 +1046,7 @@ EVT ARMTargetLowering::getSetCCResultType(EVT VT) const { /// getRegClassFor - Return the register class that should be used for the /// specified value type. -const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { +const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive D registers. @@ -1595,19 +1616,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; + bool HasMinSizeAttr = MF.getFunction()->getFnAttributes(). + hasAttribute(Attributes::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; - else if (doesNotRet && isDirect && !isARMFunc && - Subtarget->hasRAS() && !Subtarget->isThumb1Only()) - // "mov lr, pc; b _foo" to avoid confusing the RSP - CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; } else { - if (!isDirect && !Subtarget->hasV5TOps()) { + if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; - } else if (doesNotRet && isDirect && Subtarget->hasRAS()) + else if (doesNotRet && isDirect && Subtarget->hasRAS() && + // Emit regular call when code size is the priority + !HasMinSizeAttr) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -1657,22 +1678,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// and then confiscate the rest of the parameter registers to insure /// this. void -ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { +ARMTargetLowering::HandleByVal( + CCState *State, unsigned &size, unsigned Align) const { unsigned reg = State->AllocateReg(GPRArgRegs, 4); assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); if ((!State->isFirstByValRegValid()) && (ARM::R0 <= reg) && (reg <= ARM::R3)) { - State->setFirstByValReg(reg); - // At a call site, a byval parameter that is split between - // registers and memory needs its size truncated here. In a - // function prologue, such byval parameters are reassembled in - // memory, and are not truncated. - if (State->getCallOrPrologue() == Call) { - unsigned excess = 4 * (ARM::R4 - reg); - assert(size >= excess && "expected larger existing stack allocation"); - size -= excess; + if (Subtarget->isAAPCS_ABI() && Align > 4) { + unsigned AlignInRegs = Align / 4; + unsigned Waste = (ARM::R4 - reg) % AlignInRegs; + for (unsigned i = 0; i < Waste; ++i) + reg = State->AllocateReg(GPRArgRegs, 4); + } + if (reg != 0) { + State->setFirstByValReg(reg); + // At a call site, a byval parameter that is split between + // registers and memory needs its size truncated here. In a + // function prologue, such byval parameters are reassembled in + // memory, and are not truncated. + if (State->getCallOrPrologue() == Call) { + unsigned excess = 4 * (ARM::R4 - reg); + assert(size >= excess && "expected larger existing stack allocation"); + size -= excess; + } } } // Confiscate any remaining parameter registers to preclude their @@ -1805,6 +1835,14 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + // If Caller's vararg or byval argument has been split between registers and + // stack, do not perform tail call, since part of the argument is in caller's + // local frame. + const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). + getInfo(); + if (AFI_Caller->getVarArgsRegSaveSize()) + return false; + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -1859,6 +1897,17 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return true; } +bool +ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const { + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true, + isVarArg)); +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2536,7 +2585,10 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, DebugLoc dl, SDValue &Chain, - unsigned ArgOffset) const { + const Value *OrigArg, + unsigned OffsetFromOrigArg, + unsigned ArgOffset, + bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2563,7 +2615,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, getPointerTy()); SmallVector MemOps; - for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { + for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) { const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) RC = &ARM::tGPRRegClass; @@ -2574,7 +2626,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, @@ -2585,7 +2637,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); + AFI->setVarArgsFrameIndex( + MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); } SDValue @@ -2608,14 +2661,16 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); - + SmallVector ArgValues; int lastInsIndex = -1; - SDValue ArgValue; + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - + std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[VA.getValNo()].OrigArgIndex; // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); @@ -2709,14 +2764,20 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - unsigned VARegSize, VARegSaveSize; - computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0); - unsigned Bytes = Flags.getByValSize() - VARegSize; - if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. - int FI = MFI->CreateFixedObject(Bytes, - VA.getLocMemOffset(), false); - InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + ARMFunctionInfo *AFI = MF.getInfo(); + if (!AFI->getVarArgsFrameIndex()) { + VarArgStyleRegisters(CCInfo, DAG, + dl, Chain, CurOrigArg, + Ins[VA.getValNo()].PartOffset, + VA.getLocMemOffset(), + true /*force mutable frames*/); + int VAFrameIndex = AFI->getVarArgsFrameIndex(); + InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(Flags.getByValSize(), + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } } else { int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, VA.getLocMemOffset(), true); @@ -2734,7 +2795,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // varargs if (isVarArg) - VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0, + CCInfo.getNextStackOffset()); return Chain; } @@ -3503,6 +3565,114 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } +/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count +/// for each 16-bit element from operand, repeated. The basic idea is to +/// leverage vcnt to get the 8-bit counts, gather and add the results. +/// +/// Trace for v4i16: +/// input = [v0 v1 v2 v3 ] (vi 16-bit element) +/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) +/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) +/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] +/// [b0 b1 b2 b3 b4 b5 b6 b7] +/// +[b1 b0 b3 b2 b5 b4 b7 b6] +/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, +/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) +static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); + SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); + SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); + return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); +} + +/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the +/// bit-count for each 16-bit element from the operand. We need slightly +/// different sequencing for v4i16 and v8i16 to stay within NEON's available +/// 64/128-bit registers. +/// +/// Trace for v4i16: +/// input = [v0 v1 v2 v3 ] (vi 16-bit element) +/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) +/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] +/// v4i16:Extracted = [k0 k1 k2 k3 ] +static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue BitCounts = getCTPOP16BitCounts(N, DAG); + if (VT.is64BitVector()) { + SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, + DAG.getIntPtrConstant(0)); + } else { + SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, + BitCounts, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); + } +} + +/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the +/// bit-count for each 32-bit element from the operand. The idea here is +/// to split the vector into 16-bit elements, leverage the 16-bit count +/// routine, and then combine the results. +/// +/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): +/// input = [v0 v1 ] (vi: 32-bit elements) +/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) +/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) +/// vrev: N0 = [k1 k0 k3 k2 ] +/// [k0 k1 k2 k3 ] +/// N1 =+[k1 k0 k3 k2 ] +/// [k0 k2 k1 k3 ] +/// N2 =+[k1 k3 k0 k2 ] +/// [k0 k2 k1 k3 ] +/// Extended =+[k1 k3 k0 k2 ] +/// [k0 k2 ] +/// Extracted=+[k1 k3 ] +/// +static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + + SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); + SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); + SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); + SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); + SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); + + if (VT.is64BitVector()) { + SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, + DAG.getIntPtrConstant(0)); + } else { + SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); + } +} + +static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + + assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); + assert((VT == MVT::v2i32 || VT == MVT::v4i32 || + VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); + + if (VT.getVectorElementType() == MVT::i32) + return lowerCTPOP32BitElements(N, DAG); + else + return lowerCTPOP16BitElements(N, DAG); +} + static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); @@ -3894,6 +4064,36 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); } +// check if an VEXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonVEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; + + if (M[i] < 0) continue; // ignore UNDEF indices + if (ExpectedElt != static_cast(M[i])) + return false; + } + + return true; +} + static bool isVEXTMask(ArrayRef M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { @@ -4161,10 +4361,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } // Scan through the operands to see if only one value is used. + // + // As an optimisation, even if more than one value is used it may be more + // profitable to splat with one value then change some lanes. + // + // Heuristically we decide to do this if the vector has a "dominant" value, + // defined as splatted to more than half of the lanes. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; + bool hasDominantValue = false; bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap ValueCounts; SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -4175,13 +4386,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (!isa(V) && !isa(V)) isConstant = false; - if (!Value.getNode()) + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? (takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; Value = V; - else if (V != Value) - usesOnlyOneValue = false; + } } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; - if (!Value.getNode()) + if (ValueCounts.size() == 0) return DAG.getUNDEF(VT); if (isOnlyLowElement) @@ -4191,9 +4410,51 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Use VDUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. - if (usesOnlyOneValue && EltSize <= 32) { - if (!isConstant) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (hasDominantValue && EltSize <= 32) { + if (!isConstant) { + SDValue N; + + // If we are VDUPing a value that comes directly from a vector, that will + // cause an unnecessary move to and from a GPR, where instead we could + // just use VDUPLANE. + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // We need to create a new undef vector to use for the VDUPLANE if the + // size of the vector from which we get the value is different than the + // size of the vector that we need to create. We will insert the element + // such that the register coalescer will remove unnecessary copies. + if (VT != Value->getOperand(0).getValueType()) { + ConstantSDNode *constIndex; + constIndex = dyn_cast(Value->getOperand(1)); + assert(constIndex && "The index is not a constant!"); + unsigned index = constIndex->getAPIntValue().getLimitedValue() % + VT.getVectorNumElements(); + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), + Value, DAG.getConstant(index, MVT::i32)), + DAG.getConstant(index, MVT::i32)); + } else { + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + Value->getOperand(0), Value->getOperand(1)); + } + } + else + N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. + for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, MVT::i32)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3); + } + } + return N; + } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) @@ -4205,9 +4466,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); - if (Val.getNode()) - return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + if (usesOnlyOneValue) { + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (isConstant && Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } } // If all elements are constants and the case above didn't get hit, fall back @@ -4590,6 +4853,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + if (V2->getOpcode() == ISD::UNDEF && + isSingletonVEXTMask(ShuffleMask, VT, Imm)) { + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, + DAG.getConstant(Imm, MVT::i32)); + } + // Check for Neon shuffles that modify both input vectors in place. // If both results are used, i.e., if there are two shuffles with the same // source operands and with masks corresponding to both results of one of @@ -4789,16 +5058,76 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return false; } -/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending -/// load, or BUILD_VECTOR with extended elements, return the unextended value. -static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { +/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total +/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. +/// We insert the required extension here to get the vector to fill a D register. +static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, + const EVT &OrigTy, + const EVT &ExtTy, + unsigned ExtOpcode) { + // The vector originally had a size of OrigTy. It was then extended to ExtTy. + // We expect the ExtTy to be 128-bits total. If the OrigTy is less than + // 64-bits we need to insert a new extension so that it will be 64-bits. + assert(ExtTy.is128BitVector() && "Unexpected extension size"); + if (OrigTy.getSizeInBits() >= 64) + return N; + + // Must extend size to at least 64 bits to be used as an operand for VMULL. + MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy; + EVT NewVT; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Orig Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + NewVT = MVT::v2i32; + break; + case MVT::v4i8: + NewVT = MVT::v4i16; + break; + } + return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); +} + +/// SkipLoadExtensionForVMULL - return a load of the original vector size that +/// does not do any sign/zero extension. If the original vector is less +/// than 64 bits, an appropriate extension will be added after the load to +/// reach a total size of 64 bits. We have to add the extension separately +/// because ARM does not have a sign/zero extending load for vectors. +static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { + SDValue NonExtendingLoad = + DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), + LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->isInvariant(), + LD->getAlignment()); + unsigned ExtOp = 0; + switch (LD->getExtensionType()) { + default: llvm_unreachable("Unexpected LoadExtType"); + case ISD::EXTLOAD: + case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break; + case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break; + } + MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy; + MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy; + return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG, + MemType, ExtType, ExtOp); +} + +/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, +/// extending load, or BUILD_VECTOR with extended elements, return the +/// unextended value. The unextended vector should be 64 bits so that it can +/// be used as an operand to a VMULL instruction. If the original vector size +/// before extension is less than 64 bits we add a an extension to resize +/// the vector to 64 bits. +static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) - return N->getOperand(0); + return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, + N->getOperand(0)->getValueType(0), + N->getValueType(0), + N->getOpcode()); + if (LoadSDNode *LD = dyn_cast(N)) - return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), - LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), - LD->isNonTemporal(), LD->isInvariant(), - LD->getAlignment()); + return SkipLoadExtensionForVMULL(LD, DAG); + // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will // have been legalized as a BITCAST from v4i32. if (N->getOpcode() == ISD::BITCAST) { @@ -4853,7 +5182,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); - assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL"); + assert(VT.is128BitVector() && VT.isInteger() && + "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; @@ -4896,9 +5226,9 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Legalize to a VMULL instruction. DebugLoc DL = Op.getDebugLoc(); SDValue Op0; - SDValue Op1 = SkipExtension(N1, DAG); + SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { - Op0 = SkipExtension(N0, DAG); + Op0 = SkipExtensionForVMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); @@ -4913,8 +5243,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // vaddl q0, d4, d5 // vmovl q1, d6 // vmul q0, q0, q1 - SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG); - SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG); + SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); + SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, @@ -5200,6 +5530,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); @@ -5260,6 +5591,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_CMP_SWAP: ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG); return; + case ISD::ATOMIC_LOAD_MIN: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG); + return; + case ISD::ATOMIC_LOAD_UMIN: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG); + return; + case ISD::ATOMIC_LOAD_MAX: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG); + return; + case ISD::ATOMIC_LOAD_UMAX: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG); + return; } if (Res.getNode()) Results.push_back(Res); @@ -5550,7 +5893,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, // ldrex dest, ptr // (sign extend dest, if required) // cmp dest, incr - // cmov.cond scratch2, dest, incr + // cmov.cond scratch2, incr, dest // strex scratch, scratch2, ptr // cmp scratch, #0 // bne- loopMBB @@ -5573,7 +5916,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(oldval).addReg(incr)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2) - .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR); + .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR); MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr); if (strOpc == ARM::t2STREX) @@ -5599,7 +5942,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock * ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, unsigned Op1, unsigned Op2, - bool NeedsCarry, bool IsCmpxchg) const { + bool NeedsCarry, bool IsCmpxchg, + bool IsMinMax, ARMCC::CondCodes CC) const { // This also handles ATOMIC_SWAP, indicated by Op1==0. const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -5628,16 +5972,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *contBB = 0, *cont2BB = 0; - if (IsCmpxchg) { + if (IsCmpxchg || IsMinMax) contBB = MF->CreateMachineBasicBlock(LLVM_BB); + if (IsCmpxchg) cont2BB = MF->CreateMachineBasicBlock(LLVM_BB); - } MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); - if (IsCmpxchg) { - MF->insert(It, contBB); - MF->insert(It, cont2BB); - } + if (IsCmpxchg || IsMinMax) MF->insert(It, contBB); + if (IsCmpxchg) MF->insert(It, cont2BB); MF->insert(It, exitMBB); // Transfer the remainder of BB and its successor edges to exitMBB. @@ -5673,12 +6016,32 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // for ldrexd must be different. BB = loopMBB; // Load + unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned GPRPair2; + if (IsMinMax) { + //We need an extra double register for doing min/max. + unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(undef) + .addReg(vallo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2) + .addReg(r1) + .addReg(valhi) + .addImm(ARM::gsub_1); + } + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) - .addReg(ARM::R2, RegState::Define) - .addReg(ARM::R3, RegState::Define).addReg(ptr)); + .addReg(GPRPair0, RegState::Define).addReg(ptr)); // Copy r2/r3 into dest. (This copy will normally be coalesced.) - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); if (IsCmpxchg) { // Add early exit @@ -5697,24 +6060,67 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // Copy to physregs for strexd unsigned setlo = MI->getOperand(5).getReg(); unsigned sethi = MI->getOperand(6).getReg(); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi); + unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(undef) + .addReg(setlo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(sethi) + .addImm(ARM::gsub_1); } else if (Op1) { // Perform binary operation - AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0) + unsigned tmpRegLo = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo) .addReg(destlo).addReg(vallo)) .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry)); - AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1) - .addReg(desthi).addReg(valhi)).addReg(0); + unsigned tmpRegHi = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi) + .addReg(desthi).addReg(valhi)) + .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax)); + + unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(UndefPair) + .addReg(tmpRegLo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(tmpRegHi) + .addImm(ARM::gsub_1); } else { // Copy to physregs for strexd - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi); + unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(UndefPair) + .addReg(vallo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) + .addReg(r1) + .addReg(valhi) + .addImm(ARM::gsub_1); + } + unsigned GPRPairStore = GPRPair1; + if (IsMinMax) { + // Compare and branch to exit block. + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR); + BB->addSuccessor(exitMBB); + BB->addSuccessor(contBB); + BB = contBB; + GPRPairStore = GPRPair2; } // Store AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) - .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr)); + .addReg(GPRPairStore).addReg(ptr)); // Cmp+jump AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(storesuccess).addImm(0)); @@ -5943,12 +6349,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); - if (AFI->isThumb1OnlyFunction()) - BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup)); - else if (!Subtarget->hasVFP2()) - BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp)); - else - BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); + MachineInstrBuilder MIB; + MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); + + const ARMBaseInstrInfo *AII = static_cast(TII); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + + // Add a register mask with no preserved registers. This results in all + // registers being marked as clobbered. + MIB.addRegMask(RI.getNoPreservedMask()); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { @@ -6020,9 +6429,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty); + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getTargetData()->getTypeAllocSize(C->getType()); + Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6109,9 +6518,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty); + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getTargetData()->getTypeAllocSize(C->getType()); + Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6167,8 +6576,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { } // N.B. the order the invoke BBs are processed in doesn't matter here. - const ARMBaseInstrInfo *AII = static_cast(TII); - const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF); SmallVector MBBLPads; for (SmallPtrSet::iterator @@ -6282,7 +6689,8 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { UnitSize = 2; } else { // Check whether we can use NEON instructions. - if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) && + if (!MF->getFunction()->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) { ldrOpc = ARM::VLD1q32wb_fixed; @@ -6367,7 +6775,8 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { } else { AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ldrOpc),scratch) - .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); + .addReg(srcOut, RegState::Define).addReg(srcIn) + .addReg(0).addImm(1)); AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) .addReg(scratch).addReg(destIn) @@ -6430,9 +6839,9 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty); + unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getTargetData()->getTypeAllocSize(C->getType()); + Align = getDataLayout()->getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp)) @@ -6709,6 +7118,26 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ false, /*IsCmpxchg*/true); + case ARM::ATOMMIN6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::LE); + case ARM::ATOMMAX6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::GE); + case ARM::ATOMUMIN6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::LS); + case ARM::ATOMUMAX6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true, /*IsCmpxchg*/false, + /*IsMinMax*/ true, ARMCC::HS); case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -8980,20 +9409,36 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } -bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { - if (!Subtarget->allowsUnalignedMem()) - return false; +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { + // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus + bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i8: case MVT::i16: - case MVT::i32: - return true; + case MVT::i32: { + // Unaligned access can use (for example) LRDB, LRDH, LDR + if (AllowsUnaligned) { + if (Fast) + *Fast = Subtarget->hasV7Ops(); + return true; + } + return false; + } case MVT::f64: - return Subtarget->hasNEON(); - // FIXME: VLD1 etc with standard alignment is legal. + case MVT::v2f64: { + // For any little-endian targets with neon, we can support unaligned ld/st + // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. + // A big-endian target may also explictly support unaligned accesses + if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Fast) + *Fast = true; + return true; + } + return false; + } } } @@ -9012,26 +9457,51 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, // See if we can use NEON instructions for this... if (IsZeroVal && - !F->hasFnAttr(Attribute::NoImplicitFloat) && - Subtarget->hasNEON()) { - if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) { - return MVT::v4i32; - } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) { - return MVT::v2i32; + Subtarget->hasNEON() && + !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { + bool Fast; + if (Size >= 16 && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) { + return MVT::v2f64; + } else if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) { + return MVT::f64; } } // Lowering to i32/i16 if the size permits. - if (Size >= 4) { + if (Size >= 4) return MVT::i32; - } else if (Size >= 2) { + else if (Size >= 2) return MVT::i16; - } // Let the target-independent logic figure it out. return MVT::Other; } +bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + if (Val.getOpcode() != ISD::LOAD) + return false; + + EVT VT1 = Val.getValueType(); + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. + return true; + } + + return false; +} + static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; @@ -9790,6 +10260,24 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const { + if (VT.getSizeInBits() > 32) + return false; + + int32_t ImmVal = Imm.getSExtValue(); + if (!Subtarget->isThumb()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getSOImmVal(ImmVal) != -1) || + (ARM_AM::getSOImmVal(~ImmVal) != -1); + } else if (Subtarget->isThumb2()) { + return (ImmVal >= 0 && ImmVal < 65536) || + (ARM_AM::getT2SOImmVal(ImmVal) != -1) || + (ARM_AM::getT2SOImmVal(~ImmVal) != -1); + } else /*Thumb1*/ { + return (ImmVal >= 0 && ImmVal < 256); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -9806,7 +10294,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vld4lane: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -9831,7 +10319,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0);