X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64ISelLowering.cpp;h=4ecfbe9e228091335dc720281446e4668be1a672;hp=576a14d6314d715df42d9906e303c1ac43482786;hb=19e5c2eef38b9a79fa835d52ca51d5ef7abb0f61;hpb=1f7a90d7936a9a6278365ea8b0521c7ff17432c3 diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 576a14d6314..4ecfbe9e228 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -39,23 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -namespace { -enum AlignMode { - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - // Place holder until extr generation is tested fully. static cl::opt EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, @@ -75,6 +59,9 @@ cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -209,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? - setExceptionPointerRegister(AArch64::X0); - setExceptionSelectorRegister(AArch64::X1); - // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -233,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -251,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -314,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -402,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory @@ -455,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. @@ -478,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -486,6 +493,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); @@ -495,6 +504,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -510,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(2); - RequireStrictAlign = (Align == StrictAlign); - setHasExtractBitsInsn(true); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: @@ -644,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); @@ -678,6 +691,18 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + // [SU][MIN|MAX] are available for all NEON types apart from i64. + if (!VT.isFloatingPoint() && + VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). + if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, + ISD::FMINNUM, ISD::FMAXNUM}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -697,7 +722,8 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -721,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast(Op->getOperand(1)); + ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; @@ -766,10 +792,39 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } } -MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { +MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, + EVT) const { return MVT::i64; } +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. + *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -777,9 +832,8 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - return nullptr; + switch ((AArch64ISD::NodeType)Opcode) { + case AArch64ISD::FIRST_NUMBER: break; case AArch64ISD::CALL: return "AArch64ISD::CALL"; case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; @@ -800,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; - case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; @@ -864,6 +919,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; @@ -899,6 +955,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; } + return nullptr; } MachineBasicBlock * @@ -920,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; + MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); @@ -1130,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && - cast(RHS.getOperand(0))->getZExtValue() == 0 && + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags @@ -1145,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && - cast(RHS)->getZExtValue() == 0 && + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one @@ -1156,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. This allows to express arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. We can also implement some "or" expressions, because "(or A B)" +/// is equivalent to "not (and (not A) (not B))" and we can implement some +/// negation operations: +/// We can negate the results of a single comparison by inverting the flags +/// used when the predicate fails and inverting the flags tested in the next +/// instruction; We can also negate the results of the whole previous +/// conditional compare sequence by inverting the flags tested in the next +/// instruction. However there is no way to negate the result of a partial +/// sequence. +/// +/// Therefore on encountering an "or" expression we can negate the subtree on +/// one side and have to be able to push the negate to the leafs of the subtree +/// on the other side (see also the comments in code). As complete example: +/// "or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// is transformed to +/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D)))) +/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" +/// and implemented as: +/// cmp C +/// ccmp D, inv(CD), CC +/// ccmp A, CA, inv(CD) +/// ccmp B, CB, inv(CA) +/// check for CB flags +/// A counterexample is "or (and A B) (and C D)" which cannot be implemented +/// by conditional compare sequences. +/// @{ + +/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +/// CanPushNegate is set to true if we can push a negate operation through +/// the tree in a was that we are left with AND operations and negate operations +/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to +/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be +/// brought into such a form. +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, + unsigned Depth = 0) { + if (!Val.hasOneUse()) + return false; + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + CanPushNegate = true; + return true; + } + // Protect against stack overflow. + if (Depth > 15) + return false; + if (Opcode == ISD::AND || Opcode == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + return false; + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + return false; + // We cannot push a negate through an AND operation (it would become an OR), + // we can however change a (not (or x y)) to (and (not x) (not y)) if we can + // push the negate through the x/y subtrees. + CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + return true; + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// Tries to transform the given i1 producing node @p Val to a series compare +/// and conditional compare operations. @returns an NZCV flags producing node +/// and sets @p OutCC to the flags that should be tested or returns SDValue() if +/// transformation was not possible. +/// On recursive invocations @p PushNegate may be set to true to have negation +/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate +/// for the comparisons in the current subtree; @p Depth limits the search +/// depth to avoid stack overflow. +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool PushNegate = false, + SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, + unsigned Depth = 0) { + // We're at a tree leaf, produce a conditional comparison operation. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (PushNegate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surpisingly some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inversed. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Predicate = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp. + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) + return SDValue(); + + assert((Opcode == ISD::OR || !PushNegate) + && "Can only push negate through OR operation"); + + // Check if both sides can be transformed. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) + return SDValue(); + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) + return SDValue(); + + // Do we need to negate our operands? + bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leafs of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case, otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to and AND then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { - SDValue Cmp; - AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1218,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) - // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure - // both the LHS and RHS are truely zero extended and to make sure the - // transformation is profitable. + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { - if ((cast(RHS)->getZExtValue() >> 16 == 0) && - isa(LHS)) { - if (cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa(LHS) && + cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } + } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } } } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1380,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -1415,7 +1693,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); - // The the values aren't constants, this isn't the pattern we're looking for. + // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; @@ -1560,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1570,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. + if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -1617,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -1701,7 +1988,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); @@ -1768,8 +2056,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() != ISD::BUILD_VECTOR) return false; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - SDNode *Elt = N->getOperand(i).getNode(); + for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); unsigned HalfSize = EltSize / 2; @@ -1920,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2021,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2081,7 +2390,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), + /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) @@ -2103,7 +2413,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. - EVT PtrTy = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; @@ -2111,12 +2421,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // case. It should also work for fundamental types too. unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; } - + if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); @@ -2178,7 +2488,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) @@ -2202,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, 0); + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -2257,6 +2568,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SmallVector MemOps; @@ -2271,17 +2583,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (GPRSaveSize != 0) { GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, DL, getPointerTy())); + FIN = + DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); @@ -2299,18 +2612,19 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (FPRSaveSize != 0) { FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, DL, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, + DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); @@ -2413,7 +2727,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); - const Triple TT(getTargetMachine().getTargetTriple()); + const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; @@ -2440,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) return false; } @@ -2606,7 +2920,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + EVT ActualVT = getValueType(DAG.getDataLayout(), + CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; @@ -2666,10 +2981,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, true), DL); - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, + getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; SmallVector MemOpChains; + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; @@ -2735,14 +3052,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); + DstAddr = DAG.getFrameIndex(FI, PtrVT); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be @@ -2751,8 +3069,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2786,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); InFlag = Chain.getValue(1); } @@ -2801,25 +3120,24 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const GlobalValue *GV = G->getGlobal(); bool InternalLinkage = GV->hasInternalLinkage(); if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -2845,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are known live // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &RegToPass : RegsToPass) + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; @@ -2872,8 +3190,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. - if (IsTailCall) + if (IsTailCall) { + MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); @@ -2951,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -2967,7 +3300,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); @@ -2993,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); @@ -3059,7 +3393,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); - MVT PtrVT = getPointerTy(); + MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = @@ -3070,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3114,7 +3449,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -3143,13 +3478,17 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; } SDValue TPOff; - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); @@ -3260,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa(RHS) && - cast(RHS)->isOne() && + if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && @@ -3375,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, - DAG.getIntPtrConstant(0, DL)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; @@ -3393,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; - VecVT = MVT::v4i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { @@ -3409,7 +3741,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, EltVT = MVT::i64; VecVT = MVT::v2i64; - // We want to materialize a mask with the the high bit set, but the AdvSIMD + // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; @@ -3554,31 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return true; - - ConstantFPSDNode *CCmp = dyn_cast(Cmp); - ConstantFPSDNode *CResult = dyn_cast(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, @@ -3596,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } - // Handle integers first. + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); @@ -3619,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { + if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3629,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { + if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3698,46 +4007,6 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // FIXME: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(MinMaxLHS, MinMaxRHS); - } - - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally @@ -3815,7 +4084,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && @@ -3841,7 +4110,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { @@ -3882,7 +4151,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { @@ -3908,8 +4177,8 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, DAG.getMachineFunction().getInfo(); SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); @@ -3921,6 +4190,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); @@ -3929,8 +4199,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SmallVector MemOps; // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), false, false, 8)); @@ -3939,12 +4208,12 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, if (GPRSize > 0) { SDValue GRTop, GRTopAddr; - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, DL, getPointerTy())); + GRTopAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, DL, getPointerTy())); + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); + GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, + DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); @@ -3954,28 +4223,28 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, DL, getPointerTy())); + VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(16, DL, PtrVT)); - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, DL, getPointerTy())); + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); + VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, + DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, DL, getPointerTy())); + SDValue GROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, DL, getPointerTy())); + SDValue VROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, @@ -4016,21 +4285,22 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), + false, false, false, 0); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, DL, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, DL, getPointerTy())); + VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Align - 1, DL, PtrVT)); + VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the @@ -4045,8 +4315,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, DL, getPointerTy())); + SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); @@ -4086,14 +4356,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", AArch64::SP) .Default(0); if (Reg) return Reg; - report_fatal_error("Invalid register name global variable"); + report_fatal_error(Twine("Invalid register name \"" + + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, @@ -4107,7 +4378,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, DL, getPointerTy()); + SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); @@ -4129,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4176,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); @@ -4260,7 +4552,7 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType -AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { +AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: @@ -4311,8 +4603,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4348,10 +4639,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { + int RegNo; + bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); + if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. @@ -4384,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast(Op); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) @@ -4785,7 +5074,7 @@ static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. + // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. @@ -5675,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op, return Op; SmallVector Ops; - for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { - SDValue Lane = Op.getOperand(I); - if (Lane.getOpcode() == ISD::Constant) { + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast(Lane)) { APInt LowBits(EltTy.getSizeInBits(), - cast(Lane)->getZExtValue()); + CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); @@ -6019,8 +6307,7 @@ FailedModImm: // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } @@ -6039,7 +6326,10 @@ FailedModImm: // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, @@ -6145,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) @@ -6235,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) - Cnt = -Cnt; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } @@ -6283,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { + if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), @@ -6457,6 +6727,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { + auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: @@ -6472,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6498,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6516,7 +6787,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; @@ -6529,7 +6800,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; @@ -6600,7 +6871,8 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return true; const TargetOptions &Options = getTargetMachine().Options; - EVT VT = getValueType(User->getOperand(0)->getType()); + const DataLayout &DL = I->getModule()->getDataLayout(); + EVT VT = getValueType(DL, User->getOperand(0)->getType()); if (isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && @@ -6665,6 +6937,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); + auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()); Type *IdxTy = *GTI; // This extension will end up with a shift because of the scaling factor. @@ -6672,7 +6945,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -6716,6 +6989,160 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, return NumBits == 32 || NumBits == 64; } +/// \brief Lower an interleaved load into a ldN intrinsic. +/// +/// E.g. Lower an interleaved load (Factor = 2): +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements +/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements +/// +/// Into: +/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) +/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 +/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 +bool AArch64TargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef Shuffles, + ArrayRef Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + const DataLayout &DL = LI->getModule()->getDataLayout(); + + VectorType *VecTy = Shuffles[0]->getType(); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) + return false; + + // A pointer vector can not be the return type of the ldN intrinsics. Need to + // load integer vectors first and then convert to pointer vectors. + Type *EltTy = VecTy->getVectorElementType(); + if (EltTy->isPointerTy()) + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + + Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, PtrTy}; + static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + Function *LdNFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + + IRBuilder<> Builder(LI); + Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); + + CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); + + // Replace uses of each shufflevector with the corresponding vector loaded + // by ldN. + for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SVI = Shuffles[i]; + unsigned Index = Indices[i]; + + Value *SubVec = Builder.CreateExtractValue(LdN, Index); + + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); + + SVI->replaceAllUsesWith(SubVec); + } + + return true; +} + +/// \brief Get a mask consisting of sequential integers starting from \p Start. +/// +/// I.e. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, + unsigned NumElts) { + SmallVector Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(Start + i)); + + return ConstantVector::get(Mask); +} + +/// \brief Lower an interleaved store into a stN intrinsic. +/// +/// E.g. Lower an interleaved store (Factor = 3): +/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> +/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> +/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> +/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) +/// +/// Note that the new shufflevectors will be removed and we'll only generate one +/// st3 instruction in CodeGen. +bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) + return false; + + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // StN intrinsics don't support pointer vectors as arguments. Convert pointer + // vectors to integer vectors. + if (EltTy->isPointerTy()) { + Type *IntTy = DL.getIntPtrType(EltTy); + unsigned NumOpElts = + dyn_cast(Op0->getType())->getVectorNumElements(); + + // Convert to the corresponding integer vector. + Type *IntVecTy = VectorType::get(IntTy, NumOpElts); + Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); + Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); + + SubVecTy = VectorType::get(IntTy, NumSubElts); + } + + Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); + Type *Tys[2] = {SubVecTy, PtrTy}; + static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + Function *StNFunc = + Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + + SmallVector Ops; + + // Split the shufflevector operands into sub vectors for the new stN call. + for (unsigned i = 0; i < Factor; i++) + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + + Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); + Builder.CreateCall(StNFunc, Ops); + return true; +} + static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && @@ -6768,8 +7195,9 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { +bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset @@ -6789,7 +7217,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; @@ -6819,8 +7247,9 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; } -int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { +int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- @@ -6828,7 +7257,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(AM, Ty)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; @@ -7091,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); @@ -7105,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. @@ -7128,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. +static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -7468,21 +8024,26 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. +// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold +// similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { - // We can handle most types of duplicate, but the lane ones have an extra - // operand saying *which* lane, so we need to know. - bool IsDUPLANE; switch (N.getOpcode()) { case AArch64ISD::DUP: - IsDUPLANE = false; - break; case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: - IsDUPLANE = true; + case AArch64ISD::MOVI: + case AArch64ISD::MOVIshift: + case AArch64ISD::MOVIedit: + case AArch64ISD::MOVImsl: + case AArch64ISD::MVNIshift: + case AArch64ISD::MVNImsl: break; default: + // FMOV could be supported, but isn't very useful, as it would only occur + // if you passed a bitcast' floating point immediate to an eligible long + // integer op (addl, smull, ...). return SDValue(); } @@ -7492,17 +8053,11 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDLoc dl(N); - SDValue NewDUP; - if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), dl, NewDUPVT, N.getOperand(0), - N.getOperand(1)); - else - NewDUP = DAG.getNode(AArch64ISD::DUP, dl, NewDUPVT, N.getOperand(0)); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, NewDUP, + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, + DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), DAG.getConstant(NumElems, dl, MVT::i64)); } @@ -7828,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); - break; case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: @@ -7842,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: - return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: - return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: @@ -8005,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. + // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); @@ -8026,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { return NewST1; } -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8037,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N, if (S->isVolatile()) return SDValue(); + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); - if (IsMinSize) + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); @@ -8068,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); @@ -8190,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. +static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. + // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef Mask = cast(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8615,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N, if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); - if (isa(LHS) && cast(LHS)->isNullValue()) + if (isNullConstant(LHS)) std::swap(LHS, RHS); - if (!isa(RHS) || !cast(RHS)->isNullValue()) + if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || @@ -8638,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N, return SDValue(); } +// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test +// as well as whether the test should be inverted. This code is required to +// catch these cases (as opposed to standard dag combines) because +// AArch64ISD::TBZ is matched during legalization. +static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, + SelectionDAG &DAG) { + + if (!Op->hasOneUse()) + return Op; + + // We don't handle undef/constant-fold cases below, as they should have + // already been taken care of (e.g. and of 0, test of undefined shifted bits, + // etc.) + + // (tbz (trunc x), b) -> (tbz x, b) + // This case is just here to enable more of the below cases to be caught. + if (Op->getOpcode() == ISD::TRUNCATE && + Bit < Op->getValueType(0).getSizeInBits()) { + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + + if (Op->getNumOperands() != 2) + return Op; + + auto *C = dyn_cast(Op->getOperand(1)); + if (!C) + return Op; + + switch (Op->getOpcode()) { + default: + return Op; + + // (tbz (and x, m), b) -> (tbz x, b) + case ISD::AND: + if ((C->getZExtValue() >> Bit) & 1) + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + return Op; + + // (tbz (shl x, c), b) -> (tbz x, b-c) + case ISD::SHL: + if (C->getZExtValue() <= Bit && + (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit - C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x + case ISD::SRA: + Bit = Bit + C->getZExtValue(); + if (Bit >= Op->getValueType(0).getSizeInBits()) + Bit = Op->getValueType(0).getSizeInBits() - 1; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + + // (tbz (srl x, c), b) -> (tbz x, b+c) + case ISD::SRL: + if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit + C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (xor x, -1), b) -> (tbnz x, b) + case ISD::XOR: + if ((C->getZExtValue() >> Bit) & 1) + Invert = !Invert; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } +} + +// Optimize test single bit zero/non-zero and branch. +static SDValue performTBZCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + unsigned Bit = cast(N->getOperand(2))->getZExtValue(); + bool Invert = false; + SDValue TestSrc = N->getOperand(1); + SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); + + if (TestSrc == NewTestSrc) + return SDValue(); + + unsigned NewOpc = N->getOpcode(); + if (Invert) { + if (NewOpc == AArch64ISD::TBZ) + NewOpc = AArch64ISD::TBNZ; + else { + assert(NewOpc == AArch64ISD::TBNZ); + NewOpc = AArch64ISD::TBZ; + } + } + + SDLoc DL(N); + return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, + DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as @@ -8732,6 +9682,14 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } +/// Get rid of unnecessary NVCASTs (that don't change the type). +static SDValue performNVCASTCombine(SDNode *N) { + if (N->getValueType(0) == N->getOperand(0).getValueType()) + return N->getOperand(0); + + return SDValue(); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8748,6 +9706,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8760,20 +9723,35 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::TBNZ: + case AArch64ISD::TBZ: + return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); + case AArch64ISD::NVCAST: + return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -8940,6 +9918,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -8948,6 +9940,24 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -8960,10 +9970,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } -bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. - return NumUsers > 2; + return 3; } TargetLoweringBase::LegalizeTypeAction @@ -8989,20 +9999,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. -bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128; + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { return true; } @@ -9041,6 +10052,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -9059,7 +10077,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Stxr, Lo, Hi, Addr); + return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = @@ -9067,13 +10085,80 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); - return Builder.CreateCall2( - Stxr, Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr); + return Builder.CreateCall(Stxr, + {Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr}); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +}