diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 91dca5fd592..3623174b919 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -454,9 +454,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    // Custom handling for some vector types to avoid expensive expansions
+    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
     setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
 
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRL);
@@ -551,9 +558,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
   setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
-  // FIXME: Shouldn't need this, since no register is used, but the legalizer
-  // doesn't yet know how to not do that for SjLj.
-  setExceptionSelectorRegister(ARM::R0);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setExceptionPointerRegister(ARM::R0);
+  setExceptionSelectorRegister(ARM::R1);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
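
The hunk above wires small-vector integer division into the custom-lowering and DAG-combine paths added later in this patch. As a rough, hypothetical illustration of the kind of source that benefits (div_lanes is a made-up name, and vectorization to v4i16 is assumed):

// Hypothetical example: once a vectorizer forms a v4i16 SDIV, the
// Custom action routes it to LowerSDIV below instead of scalarizing
// into four integer divisions (ARM NEON has no vector integer divide).
void div_lanes(short out[4], const short a[4], const short b[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<short>(a[i] / b[i]);
}
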
@@ -856,6 +864,23 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
   }
 }
 
@@ -2813,12 +2838,46 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
-  SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
-  SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32);
-  SDValue FP0 = DAG.getConstantFP(0.0, SrcVT);
-  SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl);
+  bool F2IisFast = Subtarget->isCortexA9() ||
+    Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR;
+
+  // Bitcast operand 1 to i32.
+  if (SrcVT == MVT::f64)
+    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       &Tmp1, 1).getValue(1);
+  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+  // If float to int conversion isn't going to be super expensive, then simply
+  // or in the signbit.
+  if (F2IisFast) {
+    SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+    SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+    Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+    if (VT == MVT::f32) {
+      Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+      return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                         DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+    }
+
+    // f64: Or the high part with signbit and then combine two parts.
+    Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       &Tmp0, 1);
+    SDValue Lo = Tmp0.getValue(0);
+    SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+    Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+  }
+
+  // Remove the signbit of operand 0.
+  Tmp0 = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
+
+  // If operand 1 signbit is one, then negate operand 0.
+  SDValue ARMcc;
+  SDValue Cmp = getARMCmp(Tmp1, DAG.getConstant(0, MVT::i32),
+                          ISD::SETLT, ARMcc, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp);
+  return DAG.getNode(ARMISD::CNEG, dl, VT, Tmp0, Tmp0, ARMcc, CCR, Cmp);
 }
 
 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
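
For reference, the F2IisFast path in the hunk above is the classic copysign bit trick: clear the sign bit of the magnitude operand, isolate the sign bit of the sign operand, and OR the two. A minimal host-side sketch of the f32 case (copysign_sketch is an illustrative name, not patch code):

#include <cstdint>
#include <cstring>

// Scalar equivalent of the f32 fast path: the masks 0x7fffffff and
// 0x80000000 mirror Mask2 and Mask1 above, and memcpy plays the role
// of ISD::BITCAST.
float copysign_sketch(float x, float y) {
  uint32_t xi, yi;
  std::memcpy(&xi, &x, sizeof xi);
  std::memcpy(&yi, &y, sizeof yi);
  const uint32_t r = (xi & 0x7fffffffu) | (yi & 0x80000000u);
  float out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}
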
@@ -4231,6 +4290,181 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
 }
 
+static SDValue
+LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+  // Convert to float
+  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+  // Get reciprocal estimate.
+  // float4 recip = vrecpeq_f32(yf);
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+  // Because char has a smaller range than uchar, we can actually get away
+  // without any newton steps. This requires that we use a weird bias
+  // of 0xb000, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+  Y = DAG.getConstant(0xb000, MVT::i32);
+  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+  // Convert back to short.
+  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+  return X;
+}
+
+static SDValue
+LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+  SDValue N2;
+  // Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and one refinement step.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Because short has a smaller range than ushort, we can actually get away
+  // with only a single newton step. This requires that we use a weird bias
+  // of 89, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(89, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
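The VRECPE/VRECPS pair above is Newton-Raphson refinement of a reciprocal: VRECPS(a, b) evaluates 2 - a*b, so multiplying the current estimate by it roughly squares the relative error each step. A scalar sketch, under the assumption that a seed within a factor of two of 1/y converges (the ilogb-based seed is only a stand-in for VRECPE's hardware estimate):

#include <cmath>

// VRECPS semantics: 2 - a*b.
static float vrecps_sketch(float a, float b) { return 2.0f - a * b; }

// One refinement step toward 1/y, as in LowerSDIV_v4i16 above.
// Assumes y is positive, finite, and normal.
float refined_recip(float y) {
  float recip = std::ldexp(1.0f, -std::ilogb(y)); // crude VRECPE stand-in
  recip *= vrecps_sketch(y, recip);               // one newton step
  return recip;
}
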
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+                     N0);
+    return N0;
+  }
+
+  // v4i16 udiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Simply multiplying by the reciprocal estimate can leave us a few ulps
+  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+  // and that it will never cause us to return an answer too large).
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(2, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_u32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -4275,6 +4509,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::FLT_ROUNDS_:    return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:            return LowerMUL(Op, DAG);
+  case ISD::SDIV:           return LowerSDIV(Op, DAG);
+  case ISD::UDIV:           return LowerUDIV(Op, DAG);
   }
   return SDValue();
 }
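
To see what the "+ 2 ulps" adjustment in LowerUDIV does, here is a single-lane host-side sketch; the exact 1.0f/y stands in for the refined VRECPE/VRECPS estimate, so this only illustrates the bias-and-truncate step, not the estimate itself:

#include <cstdint>
#include <cstring>

// Compute x * (1/y) in float, bump the raw float bits by 2 (the ADD of
// a splatted 2 above), then truncate, mirroring FP_TO_SINT + TRUNCATE.
uint16_t udiv_bias_sketch(uint16_t x, uint16_t y) {
  float p = static_cast<float>(x) * (1.0f / static_cast<float>(y));
  uint32_t bits;
  std::memcpy(&bits, &p, sizeof bits);
  bits += 2; // nudge up by 2 ulps so truncation lands on the quotient
  std::memcpy(&p, &bits, sizeof p);
  return static_cast<uint16_t>(p);
}
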
@@ -4324,7 +4560,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
   case 1:
     ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
     break;
   case 2:
     ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
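
The one-line fix above corrects the store opcode used when expanding a byte-sized atomic compare-and-swap in Thumb2 mode: the exclusive store must be t2STREXB, not a second t2LDREXB. At the source level the operation being expanded is an ordinary compare-exchange, sketched here for illustration only (the backend emits the LDREXB/STREXB retry loop as machine basic blocks, not via std::atomic):

#include <atomic>

// Source-level shape of the pseudo EmitAtomicCmpSwap expands: load
// exclusive, compare with the expected value, store exclusive, retry
// if the exclusive store fails.
bool cas_byte(std::atomic<unsigned char> &v,
              unsigned char expected, unsigned char desired) {
  return v.compare_exchange_strong(expected, desired);
}
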
@@ -5209,6 +5445,138 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
                               DAG.getUNDEF(VT), NewMask.data());
 }
 
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store. Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: assert(0 && "unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: assert(0 && "unexpected opcode for Neon base update");
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
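
A minimal source pattern that CombineBaseUpdate targets (a hypothetical example; it requires a NEON-enabled ARM target and assumes n is a multiple of 8): the vld1 intrinsic is followed by a pointer bump of exactly the eight bytes accessed, so the ISD::ADD can fold into a post-incrementing VLD1_UPD:

#include <arm_neon.h>

// Each iteration loads 8 bytes (arm.neon.vld1) and then advances the
// base by NumBytes = 8, the exact constant the combine checks for.
int8x8_t sum8(const int8_t *p, int n) {
  int8x8_t acc = vdup_n_s8(0);
  for (int i = 0; i < n; i += 8) {
    acc = vadd_s8(acc, vld1_s8(p));
    p += 8; // becomes the write-back of a post-incrementing vld1
  }
  return acc;
}
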
@@ -5719,10 +6087,40 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
   case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::VLD2DUP:
+  case ARMISD::VLD3DUP:
+  case ARMISD::VLD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default: break;
+    }
+    break;
   }
   return SDValue();
 }
 
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                          EVT VT) const {
+  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
 bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
   if (!Subtarget->allowsUnalignedMem())
     return false;
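
The new isDesirableToTransformToIntegerOp hook tells the generic DAG combiner that an f32 value which is only loaded and stored may be moved through integer registers, avoiding round-trips into the VFP register file. A plain pattern that benefits (copy_f32 is an illustrative name, not patch code):

// Pure data movement: each f32 is loaded and immediately stored with no
// FP arithmetic in between, so integer ldr/str suffice.
void copy_f32(float *dst, const float *src, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[i];
}
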