X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64ISelLowering.cpp;h=430938e2f430b56c09f6ebd4d17e9bcbb0a54988;hb=1f70cb9c146df188f665e3496a31b34182a00252;hp=44d3b34cccc94f943a45e5c9007a4b8488ea3054;hpb=c2482dfc399ed4bfff562dfe56c6ac1a221aa0e5;p=oota-llvm.git

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 44d3b34cccc..430938e2f43 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -38,10 +38,12 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 
+namespace {
 enum AlignMode {
   StrictAlign,
   NoStrictAlign
 };
+}
 
 static cl::opt<AlignMode>
 Align(cl::desc("Load/store alignment support"),
@@ -106,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +116,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
     addQRTypeForNEON(MVT::v8i16);
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
   }
 
   // Compute derived properties from the register classes
@@ -278,6 +282,94 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+  // f16 is storage-only, so we promote operations to f32 if we know this is
+  // valid, and ignore them otherwise. The operations not mentioned here will
+  // fail to select, but this is not a major problem as no source language
+  // should be emitting native f16 operations yet.
+  setOperationAction(ISD::FADD, MVT::f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::f16, Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
   // AArch64 has implementations of a lot of rounding-like FP operations.
   static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
   for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
@@ -484,13 +576,13 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
 }
 
 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
 
     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
 
@@ -1025,6 +1117,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     EVT VT = RHS.getValueType();
     uint64_t C = RHSC->getZExtValue();
@@ -1079,9 +1173,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       }
     }
   }
-
-  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
-  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+  // For the i8 operand, the largest immediate is 255, so this can be easily
+  // encoded in the compare instruction. For the i16 operand, however, the
+  // largest immediate cannot be encoded in the compare.
+  // Therefore, use a sign-extending load and cmn to avoid materializing the -1
+  // constant. For example,
+  // movz w1, #65535
+  // ldrh w0, [x0, #0]
+  // cmp w0, w1
+  // >
+  // ldrsh w0, [x0, #0]
+  // cmn w0, #1
+  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+  // both the LHS and RHS are truly zero extended and to make sure the
+  // transformation is profitable.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+        isa<LoadSDNode>(LHS)) {
+      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+          LHS.getNode()->hasNUsesOfValue(1, 0)) {
+        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+          SDValue SExt =
+              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                          DAG.getValueType(MVT::i16));
+          Cmp = emitComparison(SExt,
+                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                               CC, dl, DAG);
+          AArch64CC = changeIntCCToAArch64CC(CC);
+          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+          return Cmp;
+        }
+      }
+    }
+  }
+  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC = changeIntCCToAArch64CC(CC);
   AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
   return Cmp;
 }
@@ -1405,7 +1535,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
 
   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
     SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
   }
 
@@ -1674,8 +1807,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // At this point, Ins[].VT may already be promoted to i32. To correctly
   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1779,7 +1912,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     } else { // VA.isRegLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
 
       uint32_t BEAlign = 0;
       if (ArgSize < 8 && !Subtarget->isLittleEndian())
@@ -1946,8 +2079,8 @@ SDValue AArch64TargetLowering::LowerCallResult(
                                     : RetCC_AArch64_AAPCS;
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2016,6 +2149,19 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     return false;
   }
 
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    if (GV->hasExternalWeakLinkage())
+      return false;
+  }
+
   // Now we search for cases where we can use a tail call without changing the
   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
   // concept.
@@ -2033,8 +2179,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // FIXME: for now we take the most conservative of these in both cases:
   // disallow all variadic memory operands.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2046,13 +2192,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -2077,8 +2223,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     return true;
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
 
@@ -2175,8 +2321,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   if (IsVarArg) {
     // Handle fixed and variable vector arguments differently.
@@ -2321,7 +2467,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getLocVT().getSizeInBits();
+                                        : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
         if (OpSize < 8)
@@ -2355,8 +2501,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
           DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
       SDValue Cpy = DAG.getMemcpy(
           Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
-          /*isVolatile = */ false,
-          /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+          /*isVol = */ false,
+          /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
 
       MemOpChains.push_back(Cpy);
     } else {
@@ -2500,7 +2646,7 @@ bool AArch64TargetLowering::CanLowerReturn(
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC);
 }
 
@@ -2514,8 +2660,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC);
 
   // Copy the result values into the output registers.
@@ -4663,7 +4809,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
         VT.getVectorElementType() == MVT::f32)
       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
     // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
     // vrev <4 x i8> -> REV16
     assert(VT.getVectorElementType() == MVT::i8);
@@ -4783,7 +4930,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
 static unsigned getDUPLANEOp(EVT EltType) {
   if (EltType == MVT::i8)
     return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
     return AArch64ISD::DUPLANE16;
   if (EltType == MVT::i32 || EltType == MVT::f32)
     return AArch64ISD::DUPLANE32;
@@ -4913,7 +5060,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
 
     EVT ScalarVT = VT.getVectorElementType();
-    if (ScalarVT.getSizeInBits() < 32)
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
       ScalarVT = MVT::i32;
 
     return DAG.getNode(
@@ -5672,11 +5820,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
     return SDValue();
 
   // For V64 types, we perform insertion by expanding the value
@@ -5705,11 +5854,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
     return SDValue();
 
   // For V64 types, we perform extraction by expanding the value
@@ -7364,11 +7514,11 @@ static SDValue performExtendCombine(SDNode *N,
   // If the vector type isn't a simple VT, it's beyond the scope of what
   // we're worried about here. Let legalization do its thing and hope for
   // the best.
-  if (!ResVT.isSimple())
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
     return SDValue();
 
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
   // If the source VT is a 64-bit vector, we can play games and get the
   // better results we want.
   if (SrcVT.getSizeInBits() != 64)
     return SDValue();
@@ -7602,7 +7752,7 @@ static SDValue performPostLD1Combine(SDNode *N,
     Ops.push_back(Inc);
 
     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    SDVTList SDTys = DAG.getVTList(Tys);
     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT,
@@ -7732,7 +7882,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
       Tys[n] = VecTy;
     Tys[n++] = MVT::i64;  // Type of write back register
     Tys[n] = MVT::Other;  // Type of the chain
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
 
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7844,22 +7994,24 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
 static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   EVT ResVT = N->getValueType(0);
+  EVT SrcVT = N0.getOperand(0).getValueType();
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
 
-  if (!N->getOperand(1).getValueType().isVector())
+  // If NumMaskElts == 0, the comparison is larger than the select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
+  if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
 
   if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
     return SDValue();
 
-  SDLoc DL(N0);
-
-  EVT SrcVT = N0.getOperand(0).getValueType();
-  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
-                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 
   // First perform a vector comparison, where lane 0 is the one we're interested
   // in.
+  SDLoc DL(N0);
   SDValue LHS =
       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
   SDValue RHS =
@@ -7869,8 +8021,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
   // Now duplicate the comparison mask we want across all other lanes.
   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
-  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
-                     Mask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
 
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
@@ -8131,8 +8283,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                              AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
 
   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
   // intrinsic must return {i64, i64} and we have to recombine them into a
@@ -8167,8 +8318,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                    Value *Val, Value *Addr,
                                                    AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
 
   // Since the intrinsics must have legal type, the i128 intrinsics take two
   // parameters: "i64, i64". We must marshal Val into the appropriate form