X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=7df10980fd9d7930f894740094bfd463de37b289;hb=44c2d61b6703469a95fcd2d5397c5d09a67e75c1;hp=c2a81c984ff28692053967be711cd22f698b88c2;hpb=2a33cec66a5d1b755e2bb045bdac1690fdcff19e;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c2a81c984ff..7df10980fd9 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -66,7 +66,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, DebugLoc dl) { EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); + assert(VT.is256BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/128; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, @@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -105,7 +105,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, return Result; EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + assert(VT.is128BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); @@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -161,7 +161,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); - TD = getTargetData(); + TD = getDataLayout(); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -182,6 +182,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); + // Bypass i32 with i8 on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + addBypassSlowDiv(32, 8); + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); @@ -510,6 +514,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); } if (Subtarget->hasCmpxchg16b()) { @@ -643,7 +651,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f32 , Expand); setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f32 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 @@ -735,6 +745,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); @@ -824,12 +835,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { @@ -858,6 +869,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); @@ -870,27 +882,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - EVT VT = (MVT::SimpleValueType)i; + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) continue; // Do not attempt to custom lower non-128-bit vectors if (!VT.is128BitVector()) continue; - setOperationAction(ISD::BUILD_VECTOR, - VT.getSimpleVT().SimpleTy, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, - VT.getSimpleVT().SimpleTy, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, - VT.getSimpleVT().SimpleTy, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); @@ -907,23 +910,22 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-128-bit vectors if (!VT.is128BitVector()) continue; - setOperationAction(ISD::AND, SVT, Promote); - AddPromotedToType (ISD::AND, SVT, MVT::v2i64); - setOperationAction(ISD::OR, SVT, Promote); - AddPromotedToType (ISD::OR, SVT, MVT::v2i64); - setOperationAction(ISD::XOR, SVT, Promote); - AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); - setOperationAction(ISD::LOAD, SVT, Promote); - AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); - setOperationAction(ISD::SELECT, SVT, Promote); - AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v2i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v2i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -936,6 +938,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } if (Subtarget->hasSSE41()) { @@ -950,6 +957,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1010,9 +1020,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE42()) - setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); @@ -1030,25 +1037,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1073,7 +1079,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); - if (Subtarget->hasFMA()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Custom); setOperationAction(ISD::FMA, MVT::v4f64, Custom); setOperationAction(ISD::FMA, MVT::v4f32, Custom); @@ -1081,6 +1087,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Custom); setOperationAction(ISD::FMA, MVT::f64, Custom); } + if (Subtarget->hasAVX2()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); @@ -1134,45 +1141,44 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Custom lower several nodes for 256-bit types. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + MVT VT = (MVT::SimpleValueType)i; // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) - setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; - setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-256-bit vectors if (!VT.is256BitVector()) continue; - setOperationAction(ISD::AND, SVT, Promote); - AddPromotedToType (ISD::AND, SVT, MVT::v4i64); - setOperationAction(ISD::OR, SVT, Promote); - AddPromotedToType (ISD::OR, SVT, MVT::v4i64); - setOperationAction(ISD::XOR, SVT, Promote); - AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); - setOperationAction(ISD::LOAD, SVT, Promote); - AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); - setOperationAction(ISD::SELECT, SVT, Promote); - AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v4i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v4i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v4i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } @@ -1339,7 +1345,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); if (IsZeroVal && - !F->hasFnAttr(Attribute::NoImplicitFloat)) { + !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && @@ -1902,9 +1908,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; - else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) + else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; - else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) + else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; @@ -2007,7 +2013,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); - bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = Fn->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && @@ -2199,7 +2206,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, - MF.getFunction()->hasStructRetAttr(), + MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require @@ -2282,7 +2289,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: - if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { + if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); @@ -2483,7 +2490,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa(GV) && - cast(GV)->hasFnAttr(Attribute::NonLazyBind)) { + cast(GV)->getFnAttributes(). + hasAttribute(Attributes::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). @@ -2719,6 +2727,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, + Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, @@ -2730,6 +2739,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2853,7 +2869,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); + ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3425,11 +3441,11 @@ static bool isSHUFPMask(ArrayRef Mask, EVT VT, bool HasAVX, /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. static bool isMOVHLPSMask(ArrayRef Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems != 4) return false; @@ -3444,11 +3460,11 @@ static bool isMOVHLPSMask(ArrayRef Mask, EVT VT) { /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> static bool isMOVHLPS_v_undef_Mask(ArrayRef Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems != 4) return false; @@ -3461,7 +3477,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef Mask, EVT VT) { /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. static bool isMOVLPMask(ArrayRef Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -3483,10 +3499,12 @@ static bool isMOVLPMask(ArrayRef Mask, EVT VT) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. static bool isMOVLHPSMask(ArrayRef Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + unsigned NumElems = VT.getVectorNumElements(); - if ((NumElems != 2 && NumElems != 4) - || VT.getSizeInBits() > 128) + if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) @@ -3525,25 +3543,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } - static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; - static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; - const int *CompactionMask; - if (MatchEvenMask) - CompactionMask = CompactionMaskEven; - else if (MatchOddMask) - CompactionMask = CompactionMaskOdd; - else + if (!MatchEvenMask && !MatchOddMask) return SDValue(); SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), - UndefNode, CompactionMask); - SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), - UndefNode, CompactionMask); - static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); + SDValue Op0 = SVOp->getOperand(0); + SDValue Op1 = SVOp->getOperand(1); + + if (MatchEvenMask) { + // Shift the second operand right to 32 bits. + static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; + Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); + } else { + // Shift the first operand left to 32 bits. + static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; + Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); + } + static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand @@ -3703,7 +3722,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef Mask, EVT VT, bool HasAVX2) { static bool isMOVLMask(ArrayRef Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3725,7 +3744,7 @@ static bool isMOVLMask(ArrayRef Mask, EVT VT) { /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. static bool isVPERM2X128Mask(ArrayRef Mask, EVT VT, bool HasAVX) { - if (!HasAVX || VT.getSizeInBits() != 256) + if (!HasAVX || !VT.is256BitVector()) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3817,9 +3836,10 @@ static bool isVPERMILPMask(ArrayRef Mask, EVT VT, bool HasAVX) { /// element of vector 2 and the other elements to come from vector 1 in order. static bool isCommutedMOVLMask(ArrayRef Mask, EVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { - unsigned NumOps = VT.getVectorNumElements(); - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return false; + + unsigned NumOps = VT.getVectorNumElements(); if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; @@ -3885,9 +3905,11 @@ static bool isMOVSLDUPMask(ArrayRef Mask, EVT VT, /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. static bool isMOVDDUPYMask(ArrayRef Mask, EVT VT, bool HasAVX) { - unsigned NumElts = VT.getVectorNumElements(); + if (!HasAVX || !VT.is256BitVector()) + return false; - if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts != 4) return false; for (unsigned i = 0; i != NumElts/2; ++i) @@ -3903,7 +3925,7 @@ static bool isMOVDDUPYMask(ArrayRef Mask, EVT VT, bool HasAVX) { /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. static bool isMOVDDUPMask(ArrayRef Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; unsigned e = VT.getVectorNumElements() / 2; @@ -4148,7 +4170,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). static bool ShouldXformToMOVHLPS(ArrayRef Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; if (VT.getVectorNumElements() != 4) return false; @@ -4205,7 +4227,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) { /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ArrayRef Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) @@ -4747,7 +4769,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { // Although the logic below support any bitwidth size, there are no // shift instructions which handle more than 128-bit vectors. - if (SVOp->getValueType(0).getSizeInBits() > 128) + if (!SVOp->getValueType(0).is128BitVector()) return false; if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || @@ -4842,7 +4864,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, DebugLoc dl) { - assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); + assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); @@ -4993,6 +5015,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); @@ -5006,7 +5040,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. SDValue -X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { +X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasAVX()) return SDValue(); @@ -5075,7 +5109,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { } } - bool Is256 = VT.getSizeInBits() == 256; + bool Is256 = VT.is256BitVector(); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. On Sandybridge it is still better to load a constant vector @@ -5237,12 +5271,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); } - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5251,11 +5285,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } else { - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -5315,7 +5349,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SmallVector V; for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); @@ -5396,7 +5430,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 1 && VT.getSizeInBits() == 128) { + if (Values.size() > 1 && VT.is128BitVector()) { // Check for a build vector of consecutive loads. for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); @@ -5457,39 +5491,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place -// them in a MMX register. This is better than doing a stack convert. -static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); - EVT ResVT = Op.getValueType(); - - assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || - ResVT == MVT::v8i16 || ResVT == MVT::v16i8); - int Mask[2]; - SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); - SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); - InVec = Op.getOperand(1); - if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { - unsigned NumElts = ResVT.getVectorNumElements(); - VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); - VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, - InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); - } else { - InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); - SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); - Mask[0] = 0; Mask[1] = 2; - VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); - } - return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); -} - // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); EVT ResVT = Op.getValueType(); - assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); + assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -5498,18 +5506,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -SDValue -X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { - EVT ResVT = Op.getValueType(); - +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); - assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && - "Unsupported CONCAT_VECTORS for value type"); - - // We support concatenate two MMX registers and place them in a MMX register. - // This is better than doing a stack convert. - if (ResVT.is128BitVector()) - return LowerMMXCONCAT_VECTORS(Op, DAG); // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. @@ -5517,9 +5515,9 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue +LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -5589,9 +5587,9 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, - SelectionDAG &DAG) const { +static SDValue +LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -5848,8 +5846,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef MaskVals = SVOp->getMask(); - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. @@ -5873,7 +5869,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (V2IsUndef) + + // As PSHUFB will zero elements with negative indices, it's safe to ignore + // the 2nd operand if it's undefined or zero. + if (V2.getOpcode() == ISD::UNDEF || + ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5959,6 +5959,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } +// v32i8 shuffles - Translate to VPSHUFB if possible. +static +SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); + + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // VPSHUFB may be generated if + // (1) one of input vector is undefined or zeroinitializer. + // The mask value 0x80 puts 0 in the corresponding slot of the vector. + // And (2) the mask indexes don't cross the 128-bit lane. + if (VT != MVT::v32i8 || !Subtarget->hasAVX2() || + (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) + return SDValue(); + + if (V1IsAllZero && !V2IsAllZero) { + CommuteVectorShuffleMask(MaskVals, 32); + V1 = V2; + } + SmallVector pshufbMask; + for (unsigned i = 0; i != 32; i++) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || EltIdx >= 32) + EltIdx = 0x80; + else { + if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) + // Cross lane is not allowed. + return SDValue(); + EltIdx &= 0xf; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v32i8, &pshufbMask[0], 32)); +} + /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in @@ -6159,7 +6204,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { DebugLoc dl = SVOp->getDebugLoc(); EVT VT = SVOp->getValueType(0); - assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); + assert(VT.is128BitVector() && "Unsupported vector size"); std::pair Locs[4]; int Mask1[] = { -1, -1, -1, -1 }; @@ -6505,7 +6550,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); + bool OptForSize = MF.getFunction()->getFnAttributes(). + hasAttribute(Attributes::OptimizeForSize); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); @@ -6774,7 +6820,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } @@ -6785,9 +6831,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } + if (VT == MVT::v32i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. - if (NumElems == 4 && VT.getSizeInBits() == 128) + if (NumElems == 4 && VT.is128BitVector()) return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); // Handle general 256-bit shuffles @@ -6803,14 +6855,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); - if (Op.getOperand(0).getValueType().getSizeInBits() != 128) + if (!Op.getOperand(0).getValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6825,9 +6877,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6873,7 +6925,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. - if (VecVT.getSizeInBits() == 256) { + if (VecVT.is256BitVector()) { DebugLoc dl = Op.getNode()->getDebugLoc(); unsigned NumElems = VecVT.getVectorNumElements(); SDValue Idx = Op.getOperand(1); @@ -6888,7 +6940,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getConstant(IdxVal, MVT::i32)); } - assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); + assert(VecVT.is128BitVector() && "Unexpected vector length"); if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); @@ -6911,9 +6963,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Transform it so it match pextrw which produces a 32-bit result. EVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6964,7 +7016,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return SDValue(); if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && @@ -7020,7 +7072,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // If this is a 256-bit vector result, first extract the 128-bit vector, // insert the element into the extracted half and then place it back. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { if (!isa(N2)) return SDValue(); @@ -7056,15 +7108,14 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT OpVT = Op.getValueType(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. - if (OpVT.getSizeInBits() > 128) { + if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. EVT VT128 = EVT::getVectorVT(*Context, OpVT.getVectorElementType(), @@ -7081,7 +7132,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(OpVT.getSizeInBits() == 128 && "Expected an SSE type!"); + assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } @@ -7089,15 +7140,15 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. -SDValue -X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasAVX()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); SDValue Idx = Op.getNode()->getOperand(1); - if (Op.getNode()->getValueType(0).getSizeInBits() == 128 && - Vec.getNode()->getValueType(0).getSizeInBits() == 256 && + if (Op.getNode()->getValueType(0).is128BitVector() && + Vec.getNode()->getValueType(0).is256BitVector() && isa(Idx)) { unsigned IdxVal = cast(Idx)->getZExtValue(); return Extract128BitVector(Vec, IdxVal, DAG, dl); @@ -7109,16 +7160,16 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. -SDValue -X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasAVX()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).getSizeInBits() == 256 && - SubVec.getNode()->getValueType(0).getSizeInBits() == 128 && + if (Op.getNode()->getValueType(0).is256BitVector() && + SubVec.getNode()->getValueType(0).is128BitVector() && isa(Idx)) { unsigned IdxVal = cast(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); @@ -7253,9 +7304,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = getTargetMachine().getCodeModel(); const BlockAddress *BA = cast(Op)->getBlockAddress(); + int64_t Offset = cast(Op)->getOffset(); DebugLoc dl = Op.getDebugLoc(); - SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), - /*isTarget=*/true, OpFlags); + SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, + OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -7364,8 +7416,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue InFlag; DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), PtrVT), InFlag); + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -8082,26 +8134,49 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::LowerFABS(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, + In, DAG.getUNDEF(SVT))); +} + +SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - Constant *C; - if (EltVT == MVT::f64) { - C = ConstantVector::getSplat(2, - ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - } else { - C = ConstantVector::getSplat(4, - ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); + NumElts = VT.getVectorNumElements(); } - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantVector::getSplat(NumElts, C); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); + if (VT.isVector()) { + MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::AND, dl, ANDVT, + DAG.getNode(ISD::BITCAST, dl, ANDVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); + } return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8121,12 +8196,13 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { else C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); if (VT.isVector()) { - MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; + MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, DAG.getNode(ISD::BITCAST, dl, XORVT, @@ -8210,7 +8286,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } -SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -8221,6 +8297,98 @@ SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } +// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. +// +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (!Op->hasOneUse()) + return SDValue(); + + SDNode *N = Op.getNode(); + DebugLoc DL = N->getDebugLoc(); + + SmallVector Opnds; + DenseMap VecInMap; + EVT VT = MVT::Other; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVector::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa(Idx)) + return SDValue(); + + SDValue ExtractedFromVec = I->getOperand(0); + DenseMap::iterator M = VecInMap.find(ExtractedFromVec); + if (M == VecInMap.end()) { + VT = ExtractedFromVec.getValueType(); + // Quit if not 128/256-bit vector. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + // Quit if not the same type. + if (VecInMap.begin() != VecInMap.end() && + VT != VecInMap.begin()->first.getValueType()) + return SDValue(); + M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; + } + M->second |= 1U << cast(Idx)->getZExtValue(); + } + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Not extracted from 128-/256-bit vector."); + + unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + SmallVector VecIns; + + for (DenseMap::const_iterator + I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { + // Quit if not all elements are used. + if (I->second != FullMask) + return SDValue(); + VecIns.push_back(I->first); + } + + EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + + // Cast all vectors into TestVT for PTEST. + for (unsigned i = 0, e = VecIns.size(); i < e; ++i) + VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); + + // If more than one full vectors are evaluated, OR them first before PTEST. + for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is only + // 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + } + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, + VecIns.back(), VecIns.back()); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8254,7 +8422,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, unsigned Opcode = 0; unsigned NumOperands = 0; - switch (Op.getNode()->getOpcode()) { + + // Truncate operations may prevent the merge of the SETCC instruction + // and the arithmetic intruction before it. Attempt to truncate the operands + // of the arithmetic instruction and use a reduced bit-width instruction. + bool NeedTruncation = false; + SDValue ArithOp = Op; + if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { + SDValue Arith = Op->getOperand(0); + // Both the trunc and the arithmetic op need to have one user each. + if (Arith->hasOneUse()) + switch (Arith.getOpcode()) { + default: break; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + NeedTruncation = true; + ArithOp = Arith; + } + } + } + + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation + // which may be the result of a CAST. We use the variable 'Op', which is the + // non-casted variable when we check for possible users. + switch (ArithOp.getOpcode()) { case ISD::ADD: // Due to an isel shortcoming, be conservative if this add is likely to be // selected as part of a load-modify-store instruction. When the root node @@ -8274,7 +8468,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, goto default_case; if (ConstantSDNode *C = - dyn_cast(Op.getNode()->getOperand(1))) { + dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. if (C->getAPIntValue() == 1) { Opcode = X86ISD::INC; @@ -8310,7 +8504,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && - (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { NonFlagUse = true; break; } @@ -8331,14 +8525,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, goto default_case; // Otherwise use a regular EFLAGS-setting instruction. - switch (Op.getNode()->getOpcode()) { + switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); - case ISD::SUB: - Opcode = X86ISD::SUB; - break; - case ISD::OR: Opcode = X86ISD::OR; break; + case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; + case ISD::OR: { + if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); + if (EFLAGS.getNode()) + return EFLAGS; + } + Opcode = X86ISD::OR; + break; + } } NumOperands = 2; @@ -8356,19 +8556,40 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, break; } + // If we found that truncation is beneficial, perform the truncation and + // update 'Op'. + if (NeedTruncation) { + EVT VT = Op.getValueType(); + SDValue WideVal = Op->getOperand(0); + EVT WideVT = WideVal.getValueType(); + unsigned ConvertedOp = 0; + // Use a target machine opcode to prevent further DAGCombine + // optimizations that may separate the arithmetic operations + // from the setcc node. + switch (WideVal.getOpcode()) { + default: break; + case ISD::ADD: ConvertedOp = X86ISD::ADD; break; + case ISD::SUB: ConvertedOp = X86ISD::SUB; break; + case ISD::AND: ConvertedOp = X86ISD::AND; break; + case ISD::OR: ConvertedOp = X86ISD::OR; break; + case ISD::XOR: ConvertedOp = X86ISD::XOR; break; + } + + if (ConvertedOp) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { + SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); + SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); + Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); + } + } + } + if (Opcode == 0) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); - if (Opcode == X86ISD::CMP) { - SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0), - Op.getOperand(1)); - // We can't replace usage of SUB with CMP. - // The SUB node will be removed later because there is no use of it. - return SDValue(New.getNode(), 0); - } - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops; for (unsigned i = 0; i != NumOperands; ++i) @@ -8554,7 +8775,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && + assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); @@ -8591,10 +8812,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); if (isFP) { - unsigned SSECC = 8; +#ifndef NDEBUG EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; + assert(EltVT == MVT::f32 || EltVT == MVT::f64); +#endif + unsigned SSECC; bool Swap = false; // SSE Condition code mapping: @@ -8607,7 +8830,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // 6 - NLE // 7 - ORD switch (SetCCOpcode) { - default: break; + default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: @@ -8621,34 +8844,33 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; + case ISD::SETULE: Swap = true; // Fallthrough case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; // Fallthrough case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; } if (Swap) std::swap(Op0, Op1); // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { + unsigned CC0, CC1; + unsigned CombineOpc; if (SetCCOpcode == ISD::SETUEQ) { - SDValue UNORD, EQ; - UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(0, MVT::i8)); - return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } - if (SetCCOpcode == ISD::SETONE) { - SDValue ORD, NEQ; - ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(4, MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); + CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + } else { + assert(SetCCOpcode == ISD::SETONE); + CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } - llvm_unreachable("Illegal FP comparison"); + + SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC0, MVT::i8)); + SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC1, MVT::i8)); + return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, @@ -8656,17 +8878,17 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { } // Break 256-bit integer vector compare into smaller ones. - if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0; + unsigned Opc; bool Swap = false, Invert = false, FlipSigns = false; switch (SetCCOpcode) { - default: break; + default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; @@ -8683,10 +8905,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). - if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) - return SDValue(); - if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) - return SDValue(); + if (VT == MVT::v2i64) { + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) + return SDValue(); + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) + return SDValue(); + } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. @@ -9361,7 +9585,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); + uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. @@ -9381,7 +9605,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Sanity Check: Make sure using fp_offset makes sense. assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() - .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && + .getFunction()->getFnAttributes() + .hasAttribute(Attributes::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -9412,7 +9637,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); } -SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); SDValue Chain = Op.getOperand(0); @@ -9461,8 +9687,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue ShOps[4]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); + ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); // The return type has to be a 128-bit type with the same element @@ -9474,8 +9699,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -SDValue -X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { @@ -9505,8 +9729,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_sse2_ucomigt_sd: case Intrinsic::x86_sse2_ucomige_sd: case Intrinsic::x86_sse2_ucomineq_sd: { - unsigned Opc = 0; - ISD::CondCode CC = ISD::SETCC_INVALID; + unsigned Opc; + ISD::CondCode CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_comieq_ss: @@ -9580,55 +9804,102 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + // Arithmetic intrinsics. case Intrinsic::x86_sse2_pmulu_dq: case Intrinsic::x86_avx2_pmulu_dq: return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + + // SSE3/AVX horizontal add/sub intrinsics case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: case Intrinsic::x86_avx_hadd_pd_256: - return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hsub_ps: case Intrinsic::x86_sse3_hsub_pd: case Intrinsic::x86_avx_hsub_ps_256: case Intrinsic::x86_avx_hsub_pd_256: - return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_ssse3_phadd_w_128: case Intrinsic::x86_ssse3_phadd_d_128: case Intrinsic::x86_avx2_phadd_w: case Intrinsic::x86_avx2_phadd_d: - return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_ssse3_phsub_w_128: case Intrinsic::x86_ssse3_phsub_d_128: case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: - return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + case Intrinsic::x86_avx2_phsub_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse3_hadd_ps: + case Intrinsic::x86_sse3_hadd_pd: + case Intrinsic::x86_avx_hadd_ps_256: + case Intrinsic::x86_avx_hadd_pd_256: + Opcode = X86ISD::FHADD; + break; + case Intrinsic::x86_sse3_hsub_ps: + case Intrinsic::x86_sse3_hsub_pd: + case Intrinsic::x86_avx_hsub_ps_256: + case Intrinsic::x86_avx_hsub_pd_256: + Opcode = X86ISD::FHSUB; + break; + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + Opcode = X86ISD::HADD; + break; + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + Opcode = X86ISD::HSUB; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + } + + // AVX2 variable shift intrinsics case Intrinsic::x86_avx2_psllv_d: case Intrinsic::x86_avx2_psllv_q: case Intrinsic::x86_avx2_psllv_d_256: case Intrinsic::x86_avx2_psllv_q_256: - return DAG.getNode(ISD::SHL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_avx2_psrlv_d: case Intrinsic::x86_avx2_psrlv_q: case Intrinsic::x86_avx2_psrlv_d_256: case Intrinsic::x86_avx2_psrlv_q_256: - return DAG.getNode(ISD::SRL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - return DAG.getNode(ISD::SRA, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d_256: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + Opcode = ISD::SHL; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + Opcode = ISD::SRL; + break; + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + Opcode = ISD::SRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: case Intrinsic::x86_ssse3_psign_w_128: case Intrinsic::x86_ssse3_psign_d_128: @@ -9637,15 +9908,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_psign_d: return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: case Intrinsic::x86_avx_vperm2f128_pd_256: case Intrinsic::x86_avx_vperm2f128_si_256: case Intrinsic::x86_avx2_vperm2i128: return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, @@ -9675,7 +9949,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; - unsigned X86CC = 0; + unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: @@ -9726,100 +10000,93 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: - return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_avx2_psrl_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: - return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: - return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + case Intrinsic::x86_avx2_psra_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + Opcode = X86ISD::VSHL; + break; + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + Opcode = X86ISD::VSRL; + break; + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + Opcode = X86ISD::VSRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + } + + // SSE/AVX immediate shift intrinsics case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: - return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: - return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. - case Intrinsic::x86_mmx_pslli_w: - case Intrinsic::x86_mmx_pslli_d: - case Intrinsic::x86_mmx_pslli_q: - case Intrinsic::x86_mmx_psrli_w: - case Intrinsic::x86_mmx_psrli_d: - case Intrinsic::x86_mmx_psrli_q: - case Intrinsic::x86_mmx_psrai_w: - case Intrinsic::x86_mmx_psrai_d: { - SDValue ShAmt = Op.getOperand(2); - if (isa(ShAmt)) - return SDValue(); - - unsigned NewIntNo = 0; + case Intrinsic::x86_avx2_psrai_d: { + unsigned Opcode; switch (IntNo) { - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + Opcode = X86ISD::VSHLI; break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + Opcode = X86ISD::VSRLI; break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + Opcode = X86ISD::VSRAI; break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } - - // The vector shift intrinsics with scalars uses 32b shift amounts but - // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits - // to be zero. - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, - DAG.getConstant(0, MVT::i32)); -// FIXME this must be lowered to get rid of the invalid type. - - EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(NewIntNo, MVT::i32), - Op.getOperand(1), ShAmt); + return getTargetVShiftNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); } + case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -9884,6 +10151,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDValue(PCMP.getNode(), 1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; @@ -9897,11 +10165,78 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + Opc = X86ISD::FMADD; + break; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + Opc = X86ISD::FMSUB; + break; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + Opc = X86ISD::FNMADD; + break; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + Opc = X86ISD::FNMSUB; + break; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + Opc = X86ISD::FMADDSUB; + break; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + Opc = X86ISD::FMSUBADD; + break; + } + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } } } -SDValue -X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); switch (IntNo) { @@ -10001,8 +10336,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); } -SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } @@ -10015,6 +10349,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, DebugLoc dl = Op.getDebugLoc(); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -10023,8 +10358,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. - const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); - const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix @@ -10097,7 +10432,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.paramHasAttr(Idx, Attribute::InReg)) + if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg)) // FIXME: should only count parameters that are lowered to integers. InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; @@ -10126,7 +10461,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. - const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), @@ -10225,7 +10560,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10259,8 +10594,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10285,7 +10619,7 @@ SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumBits = VT.getSizeInBits(); DebugLoc dl = Op.getDebugLoc(); @@ -10310,7 +10644,7 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert(VT.getSizeInBits() == 256 && VT.isInteger() && + assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); @@ -10334,25 +10668,26 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType().getSizeInBits() == 256 && +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType().getSizeInBits() == 256 && +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); assert((VT == MVT::v2i64 || VT == MVT::v4i64) && @@ -10582,7 +10917,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } // Decompose 256-bit shifts into smaller 128-bit shifts. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -10623,7 +10958,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" @@ -10738,7 +11073,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); } // fall through case MVT::v4i32: @@ -10751,7 +11086,8 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, } -SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ +static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. @@ -10796,8 +11132,8 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); } -SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); @@ -10835,7 +11171,8 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, } -SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT T = Op.getValueType(); DebugLoc DL = Op.getDebugLoc(); unsigned Reg = 0; @@ -10866,8 +11203,8 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { return cpOut; } -SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); @@ -10885,8 +11222,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerBITCAST(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && @@ -10906,7 +11242,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, return SDValue(); } -SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); EVT T = Node->getValueType(0); @@ -10979,9 +11315,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); - case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -10989,8 +11325,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -11004,6 +11340,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -11014,7 +11351,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::VACOPY: return LowerVACOPY(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -11029,7 +11366,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); - case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, DAG); @@ -11039,7 +11376,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); - case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::ADDC: case ISD::ADDE: @@ -11071,9 +11408,9 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, Results.push_back(Swap.getValue(1)); } -void X86TargetLowering:: +static void ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, - SelectionDAG &DAG, unsigned NewOp) const { + SelectionDAG &DAG, unsigned NewOp) { DebugLoc dl = Node->getDebugLoc(); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); @@ -11132,6 +11469,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::FP_ROUND: { + SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = N->getOperand(0); @@ -11194,27 +11536,57 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ATOMIC_LOAD_ADD: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); - return; case ISD::ATOMIC_LOAD_AND: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); - return; case ISD::ATOMIC_LOAD_NAND: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); - return; case ISD::ATOMIC_LOAD_OR: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); - return; case ISD::ATOMIC_LOAD_SUB: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); - return; case ISD::ATOMIC_LOAD_XOR: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); - return; - case ISD::ATOMIC_SWAP: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); - return; - case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_SWAP: { + unsigned Opc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::ATOMIC_LOAD_ADD: + Opc = X86ISD::ATOMADD64_DAG; + break; + case ISD::ATOMIC_LOAD_AND: + Opc = X86ISD::ATOMAND64_DAG; + break; + case ISD::ATOMIC_LOAD_NAND: + Opc = X86ISD::ATOMNAND64_DAG; + break; + case ISD::ATOMIC_LOAD_OR: + Opc = X86ISD::ATOMOR64_DAG; + break; + case ISD::ATOMIC_LOAD_SUB: + Opc = X86ISD::ATOMSUB64_DAG; + break; + case ISD::ATOMIC_LOAD_XOR: + Opc = X86ISD::ATOMXOR64_DAG; + break; + case ISD::ATOMIC_LOAD_MAX: + Opc = X86ISD::ATOMMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_MIN: + Opc = X86ISD::ATOMMIN64_DAG; + break; + case ISD::ATOMIC_LOAD_UMAX: + Opc = X86ISD::ATOMUMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_UMIN: + Opc = X86ISD::ATOMUMIN64_DAG; + break; + case ISD::ATOMIC_SWAP: + Opc = X86ISD::ATOMSWAP64_DAG; + break; + } + ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); + return; + } + case ISD::ATOMIC_LOAD: ReplaceATOMIC_LOAD(N, Results, DAG); } } @@ -11273,6 +11645,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMAXC: return "X86ISD::FMAXC"; + case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; @@ -11291,7 +11665,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -11493,7 +11870,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, // FIXME: This collection of masks seems suspect. if (NumElts == 2) return true; - if (NumElts == 4 && VT.getSizeInBits() == 128) { + if (NumElts == 4 && VT.is128BitVector()) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || @@ -11507,385 +11884,591 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, //===----------------------------------------------------------------------===// // private utility function + +// Get CMPXCHG opcode for the specified data type. +static unsigned getCmpXChgOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::LCMPXCHG8; + case MVT::i16: return X86::LCMPXCHG16; + case MVT::i32: return X86::LCMPXCHG32; + case MVT::i64: return X86::LCMPXCHG64; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} + +// Get LOAD opcode for the specified data type. +static unsigned getLoadOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::MOV8rm; + case MVT::i16: return X86::MOV16rm; + case MVT::i32: return X86::MOV32rm; + case MVT::i64: return X86::MOV64rm; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction. +static unsigned getNonAtomicOpcode(unsigned Opc) { + switch (Opc) { + case X86::ATOMAND8: return X86::AND8rr; + case X86::ATOMAND16: return X86::AND16rr; + case X86::ATOMAND32: return X86::AND32rr; + case X86::ATOMAND64: return X86::AND64rr; + case X86::ATOMOR8: return X86::OR8rr; + case X86::ATOMOR16: return X86::OR16rr; + case X86::ATOMOR32: return X86::OR32rr; + case X86::ATOMOR64: return X86::OR64rr; + case X86::ATOMXOR8: return X86::XOR8rr; + case X86::ATOMXOR16: return X86::XOR16rr; + case X86::ATOMXOR32: return X86::XOR32rr; + case X86::ATOMXOR64: return X86::XOR64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction with +// extra opcode. +static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; + case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; + case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; + case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; + case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; + case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; + case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; + case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; + case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; + case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; + case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; + case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; + case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; + case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; + case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; + case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; + case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; + case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; + case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; + case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target. +static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { + switch (Opc) { + case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; + case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; + case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; + case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; + case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; + case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; + case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; + case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; + case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; + case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target with extra opcode. +static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, + unsigned &HiOpc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND6432: + ExtraOpc = X86::NOT32r; + HiOpc = X86::AND32rr; + return X86::AND32rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get pseudo CMOV opcode from the specified data type. +static unsigned getPseudoCMOVOpc(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::CMOV_GR8; + case MVT::i16: return X86::CMOV_GR16; + case MVT::i32: return X86::CMOV_GR32; + default: + break; + } + llvm_unreachable("Unknown CMOV opcode!"); +} + +// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. +// They will be translated into a spin-loop or compare-exchange loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD MI.addr +// loop: +// t1 = OP MI.val, EAX +// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] +// JNE loop +// sink: +// dst = EAX +// ... MachineBasicBlock * -X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpc, - unsigned immOpc, - unsigned LoadOpc, - unsigned CXchgOpc, - unsigned notOpc, - unsigned EAXreg, - const TargetRegisterClass *RC, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [bitinstr.addr] - // op t2 = t1, [bitinstr.val] - // not t3 = t2 (if Invert) - // mov EAX = t1 - // lcs dest = [bitinstr.addr], t3 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB +X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg, SrcReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + MVT::SimpleValueType VT = *RC->vt_begin(); + unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); + + unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); + unsigned LOADOpc = getLoadOpcode(VT); + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // EAX = LOAD [MI.addr] + // mainMBB: + // t1 = OP MI.val, EAX + // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - // Insert instructions into newMBB based on incoming instruction - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - DebugLoc dl = bInstr->getDebugLoc(); - MachineOperand& destOper = bInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = bInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &bInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + + thisMBB->addSuccessor(mainMBB); - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - if (Invert) { - MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(AccPhyReg); + + // Copy AccPhyReg as it is used more than once. + unsigned AccReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) + .addReg(AccPhyReg); + + unsigned t1 = MRI.createVirtualRegister(RC); + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op opcode!"); + case X86::ATOMAND8: + case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + case X86::ATOMOR8: + case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + case X86::ATOMXOR8: + case X86::ATOMXOR16: + case X86::ATOMXOR32: + case X86::ATOMXOR64: { + unsigned ARITHOpc = getNonAtomicOpcode(Opc); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) + .addReg(AccReg); + break; } - else - t3 = t2; + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: { + unsigned t2 = MRI.createVirtualRegister(RC); + unsigned NOTOpc; + unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); + BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) + .addReg(AccReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); + break; + } + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: { + unsigned CMPOpc; + unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); + + BuildMI(mainMBB, DL, TII->get(CMPOpc)) + .addReg(SrcReg) + .addReg(AccReg); + + if (Subtarget->hasCMov()) { + if (VT != MVT::i8) { + // Native support + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) + .addReg(SrcReg) + .addReg(AccReg); + } else { + // Promote i8 to i32 to use CMOV32 + const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); + unsigned SrcReg32 = MRI.createVirtualRegister(RC32); + unsigned AccReg32 = MRI.createVirtualRegister(RC32); + unsigned t2 = MRI.createVirtualRegister(RC32); + + unsigned Undef = MRI.createVirtualRegister(RC32); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) + .addReg(Undef) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) + .addReg(Undef) + .addReg(AccReg) + .addImm(X86::sub_8bit); + + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + .addReg(SrcReg32) + .addReg(AccReg32); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) + .addReg(t2, 0, X86::sub_8bit); + } + } else { + // Use pseudo select and lower them. + assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && + "Invalid atomic-load-op transformation!"); + unsigned SelOpc = getPseudoCMOVOpc(VT); + X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); + assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) + .addReg(SrcReg).addReg(AccReg) + .addImm(CC); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } + } + + // Copy AccPhyReg back from virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) + .addReg(AccReg); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); MIB.addReg(t1); + MIB.setMemRefs(MMOBegin, MMOEnd); - MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(EAXreg); + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + // sinkMBB: + sinkMBB->addLiveIn(AccPhyReg); - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstReg) + .addReg(AccPhyReg); + + MI->eraseFromParent(); + return sinkMBB; } -// private utility function: 64 bit atomics on 32 bit host. +// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic +// instructions. They will be translated into a spin-loop or compare-exchange +// loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD [MI.addr + 0] +// EDX = LOAD [MI.addr + 4] +// loop: +// EBX = OP MI.val.lo, EAX +// ECX = OP MI.val.hi, EDX +// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// JNE loop +// sink: +// dst = EDX:EAX +// ... MachineBasicBlock * -X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpcL, - unsigned regOpcH, - unsigned immOpcL, - unsigned immOpcH, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB (instructions are in pairs, except cmpxchg8b) - // ld t1,t2 = [bitinstr.addr] - // newMBB: - // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) - // op t5, t6 <- out1, out2, [bitinstr.val] - // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) - // neg t7, t8 < t5, t6 (if Invert) - // mov ECX, EBX <- t5, t6 - // mov EAX, EDX <- t1, t2 - // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] - // mov t3, t4 <- EAX, EDX - // bz newMBB - // result in out1, out2 - // fallthrough -->nextMBB - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const unsigned LoadOpc = X86::MOV32rm; - const unsigned NotOpc = X86::NOT32r; +X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); - /// First build the CFG - MachineFunction *F = MBB->getParent(); - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = bInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - // There are 8 "real" operands plus 9 implicit def/uses, ignored here. - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && - "unexpected number of operands"); - MachineOperand& dest1Oper = bInstr->getOperand(0); - MachineOperand& dest2Oper = bInstr->getOperand(1); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { - argOpers[i] = &bInstr->getOperand(i+2); - - // We use some of the operands multiple times, so conservatively just - // clear any kill flags that might be present. - if (argOpers[i]->isReg() && argOpers[i]->isUse()) - argOpers[i]->setIsKill(false); - } - - // x86 address has 5 operands: base, index, scale, displacement, and segment. - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); - // add 4 to displacement. - for (int i=0; i <= lastAddrIndx-2; ++i) - (*MIB).addOperand(*argOpers[i]); - MachineOperand newOp3 = *(argOpers[3]); - if (newOp3.isImm()) - newOp3.setImm(newOp3.getImm()+4); - else - newOp3.setOffset(newOp3.getOffset()+4); - (*MIB).addOperand(newOp3); - (*MIB).addOperand(*argOpers[lastAddrIndx]); - - // t3/4 are defined later, at the bottom of the loop - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - unsigned t4 = F->getRegInfo().createVirtualRegister(RC); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) - .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); - - // The subsequent operations should be using the destination registers of - // the PHI instructions. - t1 = dest1Oper.getReg(); - t2 = dest2Oper.getReg(); - - int valArgIndx = lastAddrIndx + 1; - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - unsigned t5 = F->getRegInfo().createVirtualRegister(RC); - unsigned t6 = F->getRegInfo().createVirtualRegister(RC); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); - if (regOpcL != X86::MOV32rr) - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); - assert(argOpers[valArgIndx + 1]->isReg() == - argOpers[valArgIndx]->isReg()); - assert(argOpers[valArgIndx + 1]->isImm() == - argOpers[valArgIndx]->isImm()); - if (argOpers[valArgIndx + 1]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); - if (regOpcH != X86::MOV32rr) - MIB.addReg(t2); - (*MIB).addOperand(*argOpers[valArgIndx + 1]); - - unsigned t7, t8; - if (Invert) { - t7 = F->getRegInfo().createVirtualRegister(RC); - t8 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); - } else { - t7 = t5; - t8 = t6; - } + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); - MIB.addReg(t2); + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); - MIB.addReg(t7); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); - MIB.addReg(t8); + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + "Unexpected number of operands"); - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op32 to have one memoperand"); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); - MIB.addReg(X86::EAX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); - MIB.addReg(X86::EDX); + unsigned DstLoReg, DstHiReg; + unsigned SrcLoReg, SrcHiReg; + unsigned MemOpndSlot; - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + unsigned CurOp = 0; - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; -} + DstLoReg = MI->getOperand(CurOp++).getReg(); + DstHiReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcLoReg = MI->getOperand(CurOp++).getReg(); + SrcHiReg = MI->getOperand(CurOp++).getReg(); -// private utility function -MachineBasicBlock * -X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, - MachineBasicBlock *MBB, - unsigned cmovOpc) const { - // For the atomic min/max operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [min/max.addr] - // mov t2 = [min/max.val] - // cmp t1, t2 - // cmov[cond] t2 = t1 - // mov EAX = t1 - // lcs dest = [bitinstr.addr], t2 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB + const TargetRegisterClass *RC = &X86::GR32RegClass; + const TargetRegisterClass *RC8 = &X86::GR8RegClass; + + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; + unsigned LOADOpc = X86::MOV32rm; + + // For the atomic load-arith operator, we generate // - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + // thisMBB: + // EAX = LOAD [MI.addr + 0] + // EDX = LOAD [MI.addr + 4] + // mainMBB: + // EBX = OP MI.vallo, EAX + // ECX = OP MI.valhi, EDX + // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(mInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to newMBB and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = mInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - MachineOperand& destOper = mInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = mInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &mInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - // We only support register and immediate values - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - - unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); + MachineInstrBuilder MIB; - MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); - MIB.addReg(t1); - MIB.addReg(t2); + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // Lo + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Hi + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) { + MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) + // Don't forget to transfer the target flag. + MachineOperand &MO = MIB->getOperand(MIB->getNumOperands()-1); + MO.setTargetFlags(MI->getOperand(MemOpndSlot + i).getTargetFlags()); + } else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + thisMBB->addSuccessor(mainMBB); + + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(X86::EAX); + mainMBB->addLiveIn(X86::EDX); + + // Copy EDX:EAX as they are used more than once. + unsigned LoReg = MRI.createVirtualRegister(RC); + unsigned HiReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); + + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); + case X86::ATOMAND6432: + case X86::ATOMOR6432: + case X86::ATOMXOR6432: + case X86::ATOMADD6432: + case X86::ATOMSUB6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg).addReg(HiReg); + break; + } + case X86::ATOMNAND6432: { + unsigned HiOpc, NOTOpc; + unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); + break; + } + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + unsigned cL = MRI.createVirtualRegister(RC8); + unsigned cH = MRI.createVirtualRegister(RC8); + unsigned cL32 = MRI.createVirtualRegister(RC); + unsigned cH32 = MRI.createVirtualRegister(RC); + unsigned cc = MRI.createVirtualRegister(RC); + // cl := cmp src_lo, lo + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), cL); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); + // ch := cmp src_hi, hi + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), cH); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); + // cc := if (src_hi == hi) ? cl : ch; + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) + .addReg(cH32).addReg(cL32); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) + .addReg(cH32).addReg(cL32) + .addImm(X86::COND_E); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) + .addReg(SrcHiReg).addReg(HiReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) + .addReg(SrcLoReg).addReg(LoReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) + .addReg(SrcHiReg).addReg(HiReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } + case X86::ATOMSWAP6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); + break; + } + } - // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); - MIB.addReg(t2); - MIB.addReg(t1); + // Copy EDX:EAX back from HiReg:LoReg + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); + // Copy ECX:EBX from t1H:t1L + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); + + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); - // Cmp and exchange if none has modified the memory location - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(mInstr->memoperands_begin(), - mInstr->memoperands_end()); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(X86::EAX); + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + // sinkMBB: + sinkMBB->addLiveIn(X86::EAX); + sinkMBB->addLiveIn(X86::EDX); - mInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstLoReg) + .addReg(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstHiReg) + .addReg(X86::EDX); + + MI->eraseFromParent(); + return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 @@ -12741,183 +13324,92 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: - return EmitPCMP(MI, BB, 3, false /* in-mem */); case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: - return EmitPCMP(MI, BB, 3, true /* in-mem */); case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: - return EmitPCMP(MI, BB, 5, false /* in mem */); case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - return EmitPCMP(MI, BB, 5, true /* in mem */); + case X86::VPCMPESTRM128MEM: { + unsigned NumArgs; + bool MemArg; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + NumArgs = 3; MemArg = false; break; + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + NumArgs = 3; MemArg = true; break; + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + NumArgs = 5; MemArg = false; break; + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + NumArgs = 5; MemArg = true; break; + } + return EmitPCMP(MI, BB, NumArgs, MemArg); + } // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB); // Atomic Lowering. - case X86::ATOMAND32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, - X86::AND32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMOR32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, - X86::OR32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMXOR32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, - X86::XOR32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMNAND32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, - X86::AND32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass, true); - case X86::ATOMMIN32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); - case X86::ATOMMAX32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); - case X86::ATOMUMIN32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); - case X86::ATOMUMAX32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); - + case X86::ATOMAND8: case X86::ATOMAND16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, - X86::AND16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); + case X86::ATOMAND32: + case X86::ATOMAND64: + // Fall through + case X86::ATOMOR8: case X86::ATOMOR16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, - X86::OR16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); + case X86::ATOMOR32: + case X86::ATOMOR64: + // Fall through case X86::ATOMXOR16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, - X86::XOR16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); - case X86::ATOMNAND16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, - X86::AND16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass, true); - case X86::ATOMMIN16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); - case X86::ATOMMAX16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); - case X86::ATOMUMIN16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); - case X86::ATOMUMAX16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); - - case X86::ATOMAND8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, - X86::AND8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); - case X86::ATOMOR8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, - X86::OR8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); case X86::ATOMXOR8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, - X86::XOR8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); - case X86::ATOMNAND8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, - X86::AND8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass, true); - // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. - // This group is for 64-bit host. - case X86::ATOMAND64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, - X86::AND64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); - case X86::ATOMOR64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, - X86::OR64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); + case X86::ATOMXOR32: case X86::ATOMXOR64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, - X86::XOR64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); + // Fall through + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: case X86::ATOMNAND64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, - X86::AND64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass, true); - case X86::ATOMMIN64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); + // Fall through + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: case X86::ATOMMAX64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); - case X86::ATOMUMIN64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); + // Fall through + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + // Fall through + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: case X86::ATOMUMAX64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); + // Fall through + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: + return EmitAtomicLoadArith(MI, BB); // This group does 64-bit operations on a 32-bit host. case X86::ATOMAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::AND32rr, X86::AND32rr, - X86::AND32ri, X86::AND32ri, - false); case X86::ATOMOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::OR32rr, X86::OR32rr, - X86::OR32ri, X86::OR32ri, - false); case X86::ATOMXOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::XOR32rr, X86::XOR32rr, - X86::XOR32ri, X86::XOR32ri, - false); case X86::ATOMNAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::AND32rr, X86::AND32rr, - X86::AND32ri, X86::AND32ri, - true); case X86::ATOMADD6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::ADD32rr, X86::ADC32rr, - X86::ADD32ri, X86::ADC32ri, - false); case X86::ATOMSUB6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::SUB32rr, X86::SBB32rr, - X86::SUB32ri, X86::SBB32ri, - false); + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: case X86::ATOMSWAP6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::MOV32rr, X86::MOV32rr, - X86::MOV32ri, X86::MOV32ri, - false); + return EmitAtomicLoadArith6432(MI, BB); + case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -13150,12 +13642,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && + if (Subtarget->hasAVX() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // Only handle 128 wide vector from here on. - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return SDValue(); // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, @@ -13169,12 +13661,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } -/// DCI, PerformTruncateCombine - Converts truncate operation to +/// PerformTruncateCombine - Converts truncate operation to /// a sequence of vector shuffle operations. /// It is possible when we truncate 256-bit vector to 128-bit vector - -SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, - DAGCombinerInfo &DCI) const { +static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { if (!DCI.isBeforeLegalizeOps()) return SDValue(); @@ -13215,8 +13707,9 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, // PSHUFD static const int ShufMask1[] = {0, 2, 0, 0}; - OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1); - OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1); + SDValue Undef = DAG.getUNDEF(VT); + OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1); // MOVLHPS static const int ShufMask2[] = {0, 1, 4, 5}; @@ -13274,10 +13767,9 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; - OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8), - ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8), - ShufMask1); + SDValue Undef = DAG.getUNDEF(MVT::v16i8); + OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); @@ -13366,7 +13858,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getTargetData()-> + unsigned NewAlign = TLI.getDataLayout()-> getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) @@ -13797,7 +14289,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // // where Op could be BRCOND or CMOV. // -static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) @@ -13833,24 +14325,55 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { if (SetCC.getOpcode() == ISD::ZERO_EXTEND) SetCC = SetCC.getOperand(0); - // Quit if not SETCC. - // FIXME: So far we only handle the boolean value generated from SETCC. If - // there is other ways to generate boolean values, we need handle them here - // as well. - if (SetCC.getOpcode() != X86ISD::SETCC) - return SDValue(); - - // Set the condition code or opposite one if necessary. - CC = X86::CondCode(SetCC.getConstantOperandVal(0)); - if (needOppositeCond) - CC = X86::GetOppositeBranchCondition(CC); + switch (SetCC.getOpcode()) { + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + // A special case for rdrand, where 0 is set if false cond is found. + SDValue Op = SetCC.getOperand(0); + if (Op.getOpcode() != X86ISD::RDRAND) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. + needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } - return SetCC.getOperand(1); + return SDValue(); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); // If the flag operand isn't dead, don't touch this CMOV. @@ -13875,8 +14398,10 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(Cond, CC); - if (Flags.getNode()) { + Flags = checkBoolTestSetCCCombine(Cond, CC); + if (Flags.getNode() && + // Extra check as FCMOV only supports a subset of X86 cond. + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), @@ -14305,7 +14830,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) { // Sometimes the operand may come from a insert_subvector building a 256-bit // allones vector - if (VT.getSizeInBits() == 256 && + if (VT.is256BitVector() && N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); @@ -14750,7 +15275,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. - if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2() && + if (VT.is256BitVector() && !Subtarget->hasAVX2() && StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && StoredVal.getNumOperands() == 2) { SDValue Value0 = StoredVal.getOperand(0); @@ -14861,7 +15386,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = F->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat); bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || @@ -15133,6 +15659,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and +/// X86ISD::FMAX nodes. +static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); + + // Only perform optimizations if UnsafeMath is used. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes + // into FMINC and FMAXC, which are Commutative operations. + unsigned NewOp = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; + case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + } + + return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); +} + + /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 @@ -15208,19 +15757,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, // concat the vectors to original VT unsigned NumElems = OpVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(OpVT); + SmallVector ShufMask1(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; - SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - &ShufMask1[0]); + SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); SmallVector ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; - SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - &ShufMask2[0]); + SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), VT.getVectorNumElements()/2); @@ -15238,8 +15787,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || + (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); @@ -15261,9 +15815,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode; if (!NegMul) - Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); } @@ -15361,7 +15916,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -15377,7 +15934,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); @@ -15399,7 +15956,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, @@ -15594,7 +16151,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); @@ -15614,15 +16171,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); + case X86ISD::FMIN: + case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); - case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: