diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index efe1de730a2..31b1ada9a85 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1675,6 +1675,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
@@ -1844,8 +1845,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        MachineFunction &MF) const {
   const Function *F = MF.getFunction();
   if ((!IsMemset || ZeroMemset) &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat)) {
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
@@ -2106,14 +2106,15 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   // Win32 requires us to put the sret argument to %eax as well.
   // We saved the argument into a virtual register in the entry block,
   // so now we copy the value out and into %rax/%eax.
-  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
-      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
-    unsigned Reg = FuncInfo->getSRetReturnReg();
-    assert(Reg &&
-           "SRetReturnReg should have been set in LowerFormalArguments().");
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+  //
+  // Checking Function.hasStructRetAttr() here is insufficient because the IR
+  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+  // false, then an sret argument may be implicitly inserted in the SelDAG. In
+  // either case FuncInfo->setSRetReturnReg() will have been called.
+  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+    assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
+           "No need for an sret register");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
 
     unsigned RetValReg
         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2400,8 +2401,7 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   }
 
   const Function *Fn = MF.getFunction();
-  bool NoImplicitFloatOps = Fn->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
          "SSE register cannot be used when SSE is disabled!");
   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
@@ -2569,8 +2569,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
 
     // Figure out if XMM registers are in use.
     assert(!(MF.getTarget().Options.UseSoftFloat &&
-             Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                              Attribute::NoImplicitFloat)) &&
+             Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
            "SSE register cannot be used when SSE is disabled!");
 
     // 64-bit calling conventions support varargs and register parameters, so we
@@ -3130,11 +3129,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
       OpFlags = X86II::MO_DARWIN_STUB;
-    } else if (Subtarget->isPICStyleRIPRel() &&
-               isa<Function>(GV) &&
-               cast<Function>(GV)->getAttributes().
-                 hasAttribute(AttributeSet::FunctionIndex,
-                              Attribute::NonLazyBind)) {
+    } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+               cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
       // If the function is marked as non-lazy, generate an indirect call
       // which loads from the GOT directly. This avoids runtime overhead
       // at the cost of eager binding (and one extra byte of encoding).
@@ -4140,7 +4136,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
     return false;
 
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  bool symetricMaskRequired =
+  bool symmetricMaskRequired =
     (VT.getSizeInBits() >= 256) && (EltSize == 32);
 
   // VSHUFPSY divides the resulting vector into 4 chunks.
@@ -4173,7 +4169,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
     // For VSHUFPSY, the mask of the second half must be the same as the
     // first but with the appropriate offsets. This works in the same way as
     // VPERMILPS works with masks.
-    if (!symetricMaskRequired || Idx < 0)
+    if (!symmetricMaskRequired || Idx < 0)
       continue;
     if (MaskVal[i] < 0) {
       MaskVal[i] = Idx - l;
@@ -4601,7 +4597,7 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   return (FstHalf | (SndHalf << 4));
 }
 
-// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+// Symmetric in-lane mask. Each lane has 4 elements (for imm8)
 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   if (EltSize < 32)
@@ -4650,7 +4646,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
   if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-  bool symetricMaskRequired = (EltSize == 32);
+  bool symmetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
 
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -4662,7 +4658,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
   for (unsigned i = 0; i != LaneSize; ++i) {
     if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
       return false;
-    if (symetricMaskRequired) {
+    if (symmetricMaskRequired) {
       if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
         ExpectedMaskVal[i] = Mask[i+l] - l;
         continue;
       }
@@ -5478,6 +5474,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
 
     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
       DecodePSHUFBMask(C, Mask);
+      if (Mask.empty())
+        return false;
       break;
     }
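A note on the PSHUFB hunk above: the decoder can come back empty-handed when the constant-pool entry is not a plain constant-data vector, and the new Mask.empty() guard makes getTargetShuffleMask report failure instead of treating that as a trivially-empty shuffle. Below is a standalone sketch of what decoding a constant PSHUFB control vector looks like; it is an illustrative stand-in for LLVM's DecodePSHUFBMask, with -2 playing the role of LLVM's "zeroed lane" sentinel.

#include <cstdint>
#include <vector>

// Decode 16 PSHUFB control bytes into shuffle-mask indices. Per the
// instruction's semantics, bit 7 of a control byte zeroes the destination
// byte (modeled here as -2); otherwise the low four bits select a source
// byte within the 16-byte register.
std::vector<int> decodePSHUFBMask(const uint8_t Control[16]) {
  std::vector<int> Mask;
  for (int i = 0; i != 16; ++i) {
    if (Control[i] & 0x80)
      Mask.push_back(-2);                // zeroed lane
    else
      Mask.push_back(Control[i] & 0x0F); // in-register source byte
  }
  return Mask;
}

A decoder structured this way only produces entries when every control byte is a known constant; if any byte is unknown it must bail out with no entries, which is exactly the "empty mask" state the added check catches.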
@@ -6222,8 +6220,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   // it may be detrimental to overall size. There needs to be a way to detect
   // that condition to know if this is truly a size win.
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool OptForSize = F->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
 
   // Handle broadcasting a single constant scalar from the constant pool
   // into a vector.
@@ -7371,13 +7368,25 @@ namespace {
 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
 ///
 /// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
+bool isShuffleEquivalentImpl(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+                             ArrayRef<const int *> Args) {
   if (Mask.size() != Args.size())
     return false;
+
+  // If the values are build vectors, we can look through them to find
+  // equivalent inputs that make the shuffles equivalent.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
   for (int i = 0, e = Mask.size(); i < e; ++i) {
     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
-    if (Mask[i] != -1 && Mask[i] != *Args[i])
-      return false;
+    if (Mask[i] != -1 && Mask[i] != *Args[i]) {
+      auto *MaskBV = Mask[i] < e ? BV1 : BV2;
+      auto *ArgsBV = *Args[i] < e ? BV1 : BV2;
+      if (!MaskBV || !ArgsBV ||
+          MaskBV->getOperand(Mask[i] % e) != ArgsBV->getOperand(*Args[i] % e))
+        return false;
+    }
   }
   return true;
 }
@@ -7394,8 +7403,9 @@ bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
 /// It returns true if the mask is exactly as wide as the argument list, and
 /// each element of the mask is either -1 (signifying undef) or the value given
 /// in the argument.
-static const VariadicFunction1<
-    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+static const VariadicFunction3<bool, SDValue, SDValue, ArrayRef<int>, int,
+                               isShuffleEquivalentImpl> isShuffleEquivalent =
+    {};
 
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
@@ -7546,6 +7556,38 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
   }
 }
 
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+                                                   SDValue V2,
+                                                   ArrayRef<int> Mask,
+                                                   SelectionDAG &DAG) {
+  // We build up the blend mask while checking whether a blend is a viable way
+  // to reduce the shuffle.
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+    if (BlendMask[Mask[i] % Size] == -1)
+      BlendMask[Mask[i] % Size] = Mask[i];
+    else if (BlendMask[Mask[i] % Size] != Mask[i])
+      return SDValue(); // Can't blend in the needed input!
+
+    PermuteMask[i] = Mask[i] % Size;
+  }
+
+  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
 /// unblended shuffles followed by an unshuffled blend.
 ///
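The decomposition in the new lowerVectorShuffleAsBlendAndPermute has one invariant worth spelling out: a blend never moves elements, so the intermediate can hold element k of V1 or element k of V2 in slot k, but not both. Here is a standalone model of that mask logic using std::vector in place of ArrayRef/SDValue (names illustrative, not LLVM's):

#include <cassert>
#include <cstdio>
#include <vector>

// A two-input shuffle mask decomposes into blend-then-permute iff no blend
// slot is claimed from both inputs. Returns false when impossible.
bool decomposeBlendAndPermute(const std::vector<int> &Mask,
                              std::vector<int> &BlendMask,
                              std::vector<int> &PermuteMask) {
  int Size = (int)Mask.size();
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue; // undef lane, no constraint
    assert(Mask[i] < Size * 2 && "shuffle input out of bounds");
    int Slot = Mask[i] % Size; // where the blend leaves this element
    if (BlendMask[Slot] == -1)
      BlendMask[Slot] = Mask[i];
    else if (BlendMask[Slot] != Mask[i])
      return false; // both inputs needed in the same slot: can't blend
    PermuteMask[i] = Slot;
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  // <1,4,7,2> blends to <4,1,2,7> and then permutes with <1,0,3,2>.
  std::printf("%d\n", decomposeBlendAndPermute({1, 4, 7, 2}, Blend, Permute));
  // <0,4,1,5> (an unpack) needs both inputs in slot 0: not blendable.
  std::printf("%d\n", decomposeBlendAndPermute({0, 4, 1, 5}, Blend, Permute));
}

Note that interleaving masks such as <0,4,1,5> fail this test, which is consistent with the diff keeping the dedicated UNPCKL/UNPCKH paths alongside the new strategy.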
@@ -7706,6 +7748,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
                                                      SDValue V1, SDValue V2) {
   SmallBitVector Zeroable(Mask.size(), false);
 
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1->getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2->getOperand(0);
+
   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
 
@@ -7717,10 +7764,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
       continue;
     }
 
-    // If this is an index into a build_vector node, dig out the input value and
-    // use it.
+    // If this is an index into a build_vector node (which has the same number
+    // of elements), dig out the input value and use it.
     SDValue V = M < Size ? V1 : V2;
-    if (V.getOpcode() != ISD::BUILD_VECTOR)
+    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
       continue;
 
     SDValue Input = V.getOperand(M % Size);
@@ -7768,8 +7815,9 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
     return SDValue(); // No non-zeroable elements!
 
   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
-  V = DAG.getNode(VT.isFloatingPoint() ? X86ISD::FAND : ISD::AND, DL, VT, V,
-                  VMask);
+  V = DAG.getNode(VT.isFloatingPoint()
+                  ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+                  DL, VT, V, VMask);
   return V;
 }
 
@@ -8207,6 +8255,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
         ExtVT, V1, V2);
   }
 
+  // This lowering only works for the low element with floating point vectors.
+  if (VT.isFloatingPoint() && V2Index != 0)
+    return SDValue();
+
   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   if (ExtVT != VT)
     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
@@ -8409,7 +8461,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isSingleInputShuffleMask(Mask)) {
     // Use low duplicate instructions for masks that match their pattern.
     if (Subtarget->hasSSE3())
-      if (isShuffleEquivalent(Mask, 0, 0))
+      if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
 
     // Straight shuffle of a single input vector. Simulate this by using the
@@ -8429,12 +8481,6 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
   // If we have a single input, insert that into V1 if we can do so cheaply.
   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
@@ -8451,7 +8497,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Try to use one of the special instruction patterns to handle two common
   // blend patterns if a zero-blend above didn't work.
-  if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
       // We can either use a special instruction to load over the low double or
       // to move just the low double.
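Two refinements land in computeZeroableShuffleElements above: it now peeks through bitcasts before testing for all-zeros build_vectors, and it refuses to index into a build_vector whose element count differs from the mask width (which the bitcast peeking makes possible, since a bitcast can change the element count and M % Size would then pick the wrong operand). A standalone model of the zeroable computation, with per-element "known zero" flags standing in for constant-zero build_vector operands (names are illustrative):

#include <cstdio>
#include <vector>

// Given a two-input shuffle mask and per-element zero flags for each input,
// compute which output lanes are guaranteed zero. Undef lanes (-1) may be
// treated as zero; inputs of a different width contribute nothing.
std::vector<bool> computeZeroable(const std::vector<int> &Mask,
                                  const std::vector<bool> &V1Zero,
                                  const std::vector<bool> &V2Zero) {
  int Size = (int)Mask.size();
  std::vector<bool> Zeroable(Size, false);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) {            // undef lane
      Zeroable[i] = true;
      continue;
    }
    const std::vector<bool> &Src = M < Size ? V1Zero : V2Zero;
    if ((int)Src.size() == Size && Src[M % Size]) // same element count only
      Zeroable[i] = true;
  }
  return Zeroable;
}

int main() {
  // V2 is an all-zeros input; lanes drawing from it are zeroable.
  std::vector<bool> Z = computeZeroable({0, 4, -1, 6},
                                        {false, false, false, false},
                                        {true, true, true, true});
  for (bool B : Z) std::printf("%d", (int)B); // prints 0111
  std::printf("\n");
}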
@@ -8465,6 +8511,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                  Subtarget, DAG))
       return Blend;
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                      DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -8526,17 +8578,17 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return Insertion;
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
-
   if (Subtarget->hasSSE41())
     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                   Subtarget, DAG))
       return Blend;
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
@@ -8554,6 +8606,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+  // This routine only handles 128-bit shufps.
+  assert(Mask.size() == 4 && "Unsupported mask size!");
+
+  // To lower with a single SHUFPS we need to have the low half and high half
+  // each requiring a single input.
+  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+    return false;
+  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+    return false;
+
+  return true;
+}
+
 /// \brief Lower a vector shuffle using the SHUFPS instruction.
 ///
 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
@@ -8671,9 +8741,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Use even/odd duplicate instructions for masks that match their pattern.
   if (Subtarget->hasSSE3()) {
-    if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
-    if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
+    if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
   }
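The new isSingleSHUFPSMask predicate encodes a hardware fact: SHUFPS fills its two low result lanes from one register and its two high lanes from one register, so a two-input v4 mask needs only one SHUFPS exactly when each half of the mask draws from a single input. A compilable restatement with a few example masks (assumed illustrative, not LLVM code):

#include <array>
#include <cstdio>

// Mirrors isSingleSHUFPSMask from the patch; -1 lanes act as wildcards.
static bool isSingleSHUFPSMask(const std::array<int, 4> &Mask) {
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  std::printf("%d\n", isSingleSHUFPSMask({0, 1, 4, 5}));  // 1: lo=V1, hi=V2
  std::printf("%d\n", isSingleSHUFPSMask({0, 4, 1, 5}));  // 0: low half mixes inputs
  std::printf("%d\n", isSingleSHUFPSMask({2, -1, 7, 6})); // 1: undef is a wildcard
}

This is why the v4f32 path below only tries the blend-and-permute decomposition when the mask is not already a single SHUFPS: a two-instruction decomposition would be a pessimization there.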
@@ -8690,12 +8760,6 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
   }
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
   // There are special ways we can lower some single-element blends. However, we
   // have custom ways we can lower more complex single-element blends below that
   // we defer to if both this and BLENDPS fail to match, so restrict this to
@@ -8714,8 +8778,19 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // Use INSERTPS if we can complete the shuffle efficiently.
     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
       return V;
+
+    if (!isSingleSHUFPSMask(Mask))
+      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+              DL, MVT::v4f32, V1, V2, Mask, DAG))
+        return BlendPerm;
   }
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
   // Otherwise fall back to a SHUFPS lowering strategy.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
 }
@@ -8758,9 +8833,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // so prevents folding a load into this instruction or making a copy.
     const int UnpackLoMask[] = {0, 0, 1, 1};
     const int UnpackHiMask[] = {2, 2, 3, 3};
-    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
       Mask = UnpackLoMask;
-    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+    else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
       Mask = UnpackHiMask;
 
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -8793,9 +8868,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
   // Try to use byte rotation instructions.
@@ -8872,9 +8947,9 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
     return Shift;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
+  if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
-  if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
+  if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
 
   // Try to use byte rotation instructions.
@@ -9509,9 +9584,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
-  if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
 
   // Try to use byte rotation instructions.
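The literal unpack masks threaded through these hunks ({0,4,1,5}, {0,8,1,9,2,10,3,11}, and so on) all follow one formula: UNPCKL interleaves the low halves of the two inputs element by element, UNPCKH the high halves. A small generator that reproduces them for the 128-bit cases (standalone sketch; the 256/512-bit variants repeat this per 128-bit lane, see the later hunks):

#include <cstdio>
#include <vector>

// Build the shuffle mask for a full-width unpack of NumElts elements.
std::vector<int> unpackMask(int NumElts, bool High) {
  std::vector<int> Mask;
  int Base = High ? NumElts / 2 : 0;
  for (int i = 0; i < NumElts / 2; ++i) {
    Mask.push_back(Base + i);           // element from V1
    Mask.push_back(NumElts + Base + i); // matching element from V2
  }
  return Mask;
}

int main() {
  for (int M : unpackMask(4, false)) std::printf("%d ", M); // 0 4 1 5
  std::printf("\n");
  for (int M : unpackMask(8, true)) std::printf("%d ", M);  // 4 12 5 13 6 14 7 15
  std::printf("\n");
}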
@@ -10060,7 +10135,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
   return true;
 }
 
-/// \brief Generic routine to split ector shuffle into half-sized shuffles.
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
 ///
 /// This routine just extracts two subvectors, shuffles them independently, and
 /// then concatenates them back together. This should work effectively with all
@@ -10081,14 +10156,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
   MVT ScalarVT = VT.getScalarType();
   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
 
-  SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(SplitNumElements));
-  SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(SplitNumElements));
+  // Rather than splitting build-vectors, just build two narrower build
+  // vectors. This helps shuffling with splats and zeros.
+  auto SplitVector = [&](SDValue V) {
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V->getOperand(0);
+
+    MVT OrigVT = V.getSimpleValueType();
+    int OrigNumElements = OrigVT.getVectorNumElements();
+    int OrigSplitNumElements = OrigNumElements / 2;
+    MVT OrigScalarVT = OrigVT.getScalarType();
+    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+    SDValue LoV, HiV;
+
+    auto *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV) {
+      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(0));
+      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(OrigSplitNumElements));
+    } else {
+
+      SmallVector<SDValue, 16> LoOps, HiOps;
+      for (int i = 0; i < OrigSplitNumElements; ++i) {
+        LoOps.push_back(BV->getOperand(i));
+        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+      }
+      LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+      HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+    }
+    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
+                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+  };
+
+  SDValue LoV1, HiV1, LoV2, HiV2;
+  std::tie(LoV1, HiV1) = SplitVector(V1);
+  std::tie(LoV2, HiV2) = SplitVector(V2);
 
   // Now create two 4-way blends of these half-width vectors.
   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
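The point of the SplitVector lambda above is that splitting a build_vector into two narrower build_vectors, rather than hiding it behind EXTRACT_SUBVECTOR nodes, keeps splats and zero halves visible to the half-width lowerings (broadcast matching, the zeroable analysis, and so on). A minimal standalone model of that choice, with plain scalar vectors standing in for build_vector nodes:

#include <cstdio>
#include <utility>
#include <vector>

using BuildVec = std::vector<double>; // stand-in for a build_vector's operands

// Split a build_vector into two narrower build_vectors directly.
std::pair<BuildVec, BuildVec> splitBuildVector(const BuildVec &V) {
  size_t Half = V.size() / 2;
  BuildVec Lo(V.begin(), V.begin() + Half);
  BuildVec Hi(V.begin() + Half, V.end());
  return {Lo, Hi};
}

int main() {
  BuildVec V = {1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0};
  auto Halves = splitBuildVector(V);
  // The low half is a splat and the high half is all zeros; each half-width
  // shuffle can now take a broadcast or all-zeros fast path instead of
  // shuffling an opaque subvector extract.
  std::printf("%zu %zu\n", Halves.first.size(), Halves.second.size());
}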
@@ -10284,15 +10388,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                 VT.getVectorNumElements() / 2);
   // Check for patterns which can be matched with a single insert of a 128-bit
   // subvector.
-  if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
-      isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
+      isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                               DAG.getIntPtrConstant(0));
     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                               Mask[2] < 4 ? V1 : V2,
                               DAG.getIntPtrConstant(0));
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   }
-  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                               DAG.getIntPtrConstant(0));
     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
@@ -10431,7 +10535,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Broadcast;
 
   // Use low duplicate instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
     return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
 
   if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
@@ -10455,9 +10559,9 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
 
   // If we have a single input to the zero element, insert that into V1 if we
@@ -10561,9 +10665,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
 }
 
@@ -10619,9 +10723,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
            "Repeated masks must be half the mask width!");
 
     // Use even/odd duplicate instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
-    if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+    if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
 
     if (isSingleInputShuffleMask(Mask))
       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
 
   // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
@@ -10725,9 +10829,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
   }
 
@@ -10791,13 +10895,13 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           // First 128-bit lane:
                           0, 16, 1, 17, 2, 18, 3, 19,
                           // Second 128-bit lane:
                           8, 24, 9, 25, 10, 26, 11, 27))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           // First 128-bit lane:
                           4, 20, 5, 21, 6, 22, 7, 23,
                           // Second 128-bit lane:
                           12, 28, 13, 29, 14, 30, 15, 31))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
@@ -10881,14 +10985,14 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
   // 256-bit lanes.
   if (isShuffleEquivalent(
-          Mask,
+          V1, V2, Mask,
           // First 128-bit lane:
           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
           // Second 128-bit lane:
           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
   if (isShuffleEquivalent(
-          Mask,
+          V1, V2, Mask,
           // First 128-bit lane:
           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
           // Second 128-bit lane:
           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
@@ -10993,9 +11097,9 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
 
   // FIXME: Implement direct support for this type!
@@ -11014,11 +11118,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           0, 16, 1, 17, 4, 20, 5, 21,
                           8, 24, 9, 25, 12, 28, 13, 29))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           2, 18, 3, 19, 6, 22, 7, 23,
                           10, 26, 11, 27, 14, 30, 15, 31))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
@@ -11040,9 +11144,9 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
 
   // FIXME: Implement direct support for this type!
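As the v32i8 comment above says, AVX unpacks are repeated 128-bit-lane unpacks, not interleaves across the whole register. That is why the v8f32 mask is {0,8,1,9,4,12,5,13} rather than {0,8,1,9,2,10,3,11}. A lane-aware generator that reproduces the masks quoted in these hunks (standalone sketch):

#include <cstdio>
#include <vector>

// Build the in-lane unpack mask for NumElts elements of EltBits bits each:
// the 128-bit-lane interleave pattern is repeated in every lane.
std::vector<int> laneUnpackMask(int NumElts, int EltBits, bool High) {
  int LaneElts = 128 / EltBits; // elements per 128-bit lane
  int Base = High ? LaneElts / 2 : 0;
  std::vector<int> Mask;
  for (int Lane = 0; Lane < NumElts; Lane += LaneElts)
    for (int i = 0; i < LaneElts / 2; ++i) {
      Mask.push_back(Lane + Base + i);           // from V1, same lane
      Mask.push_back(NumElts + Lane + Base + i); // from V2, same lane
    }
  return Mask;
}

int main() {
  // v8f32 UNPCKL: prints 0 8 1 9 4 12 5 13.
  for (int M : laneUnpackMask(8, 32, false))
    std::printf("%d ", M);
  std::printf("\n");
}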
@@ -11061,11 +11165,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           0, 16, 1, 17, 4, 20, 5, 21,
                           8, 24, 9, 25, 12, 28, 13, 29))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                           2, 18, 3, 19, 6, 22, 7, 23,
                           10, 26, 11, 27, 14, 30, 15, 31))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
 
@@ -11198,6 +11302,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
       return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
     }
 
+  // We actually see shuffles that are entirely re-arrangements of a set of
+  // zero inputs. This mostly happens while decomposing complex shuffles into
+  // simple ones. Directly lower these as a buildvector of zeros.
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  if (Zeroable.all())
+    return getZeroVector(VT, Subtarget, DAG, dl);
+
   // Try to collapse shuffles into using a vector type with fewer elements but
   // wider element types. We cap this to not form integers or floating point
   // elements wider than 64 bits, but it might be interesting to form i128
@@ -11312,7 +11423,7 @@ static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
   unsigned NumLanes = (NumElems - 1) / 8 + 1;
   unsigned NumElemsInLane = NumElems / NumLanes;
 
-  // Blend for v16i16 should be symetric for the both lanes.
+  // Blend for v16i16 should be symmetric for both lanes.
   for (unsigned i = 0; i < NumElemsInLane; ++i) {
     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
@@ -12194,7 +12305,7 @@ static SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1,
                           SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
 
-  // Canonizalize to v2f64.
+  // Canonicalize to v2f64.
   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
@@ -12213,7 +12324,7 @@ SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
   if (HasSSE2 && VT == MVT::v2f64)
     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
 
-  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+  // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
@@ -12527,8 +12638,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   bool HasFp256    = Subtarget->hasFp256();
   bool HasInt256   = Subtarget->hasInt256();
   MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptForSize =
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   // Check if we should use the experimental vector shuffle lowering. If so,
   // delegate completely to that code path.
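The renamed comment in isBlendMask points at a hardware constraint worth making concrete: VPBLENDW's 8-bit immediate is replicated across both 128-bit lanes of a 256-bit register, so a v16i16 mask is a legal blend only when each position picks the same source in both lanes. A simplified standalone model of just that symmetry check (LLVM's isBlendMask also handles other widths and computes the immediate):

#include <cstdio>
#include <vector>

bool isV16I16BlendMask(const std::vector<int> &Mask) {
  const int NumElts = 16, HalfElts = 8;
  for (int i = 0; i < HalfElts; ++i) {
    int Fst = Mask[i], Snd = Mask[i + HalfElts];
    // A blend leaves elements in place: lane j comes from V1[j] or V2[j].
    if (Fst >= 0 && Fst != i && Fst != i + NumElts)
      return false;
    if (Snd >= 0 && Snd != i + HalfElts && Snd != i + HalfElts + NumElts)
      return false;
    // The immediate is replicated, so both lanes must agree on the source.
    if (Fst >= 0 && Snd >= 0 && (Fst >= NumElts) != (Snd >= NumElts))
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Sym = {0, 17, 2, 19, 4, 21, 6, 23,
                          8, 25, 10, 27, 12, 29, 14, 31};
  std::vector<int> Asym = {0, 17, 2, 19, 4, 21, 6, 23,
                           8, 9, 10, 27, 12, 29, 14, 31};
  std::printf("%d %d\n", isV16I16BlendMask(Sym), isV16I16BlendMask(Asym)); // 1 0
}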
@@ -12756,8 +12867,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 DAG);
 
   unsigned MaskValue;
-  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
-                  &MaskValue))
+  if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
 
   if (isSHUFPMask(M, VT))
@@ -12835,7 +12945,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
       return NewOp;
   }
 
-  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+  if (VT == MVT::v16i16 && HasInt256) {
     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
     if (NewOp.getNode())
       return NewOp;
@@ -15043,11 +15153,11 @@ static bool hasNonFlagsUse(SDValue Op) {
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                     SelectionDAG &DAG) const {
-  if (Op.getValueType() == MVT::i1)
-    // KORTEST instruction should be selected
-    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
-                       DAG.getConstant(0, Op.getValueType()));
-
+  if (Op.getValueType() == MVT::i1) {
+    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+                       DAG.getConstant(0, MVT::i8));
+  }
   // CF and OF aren't always set the way we want. Determine which
   // of these we need.
   bool NeedCF = false;
@@ -15295,8 +15405,8 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   // if we're optimizing for size, however, as that'll allow better folding
   // of memory operations.
   if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
-      !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::MinSize) &&
+      !DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::MinSize) &&
      !Subtarget->isAtom()) {
     unsigned ExtendOp =
         isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -16291,6 +16401,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -16997,10 +17108,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
 
   if (ArgMode == 2) {
     // Sanity Check: Make sure using fp_offset makes sense.
     assert(!DAG.getTarget().Options.UseSoftFloat &&
-           !(DAG.getMachineFunction()
-             .getFunction()->getAttributes()
-             .hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::NoImplicitFloat)) &&
+           !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
+               Attribute::NoImplicitFloat)) &&
            Subtarget->hasSSE1());
   }
 
@@ -17950,15 +18059,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
 }
 
 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+  EVT VT = Op.getValueType();
+
   MFI->setFrameAddressIsTaken(true);
 
-  EVT VT = Op.getValueType();
+  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+    // is not possible to crawl up the stack without looking at the unwind codes
+    // simultaneously.
+    int FrameAddrIndex = FuncInfo->getFAIndex();
+    if (!FrameAddrIndex) {
+      // Set up a frame object for the return address.
+      unsigned SlotSize = RegInfo->getSlotSize();
+      FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+          SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
+      FuncInfo->setFAIndex(FrameAddrIndex);
+    }
+    return DAG.getFrameIndex(FrameAddrIndex, VT);
+  }
+
+  unsigned FrameReg =
+      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
   SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
-  unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
-      DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
          "Invalid Frame Register!");
@@ -20396,6 +20523,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
 bool
 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
@@ -22753,9 +22882,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
 
   // We're looking for blends between FADD and FSUB nodes. We insist on these
   // nodes being lined up in a specific expected pattern.
-  if (!(isShuffleEquivalent(Mask, 0, 3) ||
-        isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
-        isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+  if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
+        isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
+        isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
     return SDValue();
 
   // Only specific types are legal at this point, assert so we notice if and
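The LowerFRAMEADDR hunk above makes two moves for Windows-CFI targets: it refuses to walk frame pointers (which is unsafe without consulting the unwind codes), and it lazily creates a single fixed frame object whose index is cached in the per-function info, so repeated @llvm.frameaddress calls share one object. A standalone model of that memoization pattern; the struct and helper are illustrative, not LLVM's API:

#include <cstdio>

struct FunctionInfo {
  int FrameAddrIndex = 0; // 0 means "not created yet"
};

int getOrCreateFrameAddrIndex(FunctionInfo &FI, int &NextIndex) {
  if (!FI.FrameAddrIndex)
    FI.FrameAddrIndex = NextIndex++; // create the fixed object exactly once
  return FI.FrameAddrIndex;
}

int main() {
  FunctionInfo FI;
  int Next = 1;
  // Two lowerings within one function reuse the cached index.
  std::printf("%d %d\n", getOrCreateFrameAddrIndex(FI, Next),
              getOrCreateFrameAddrIndex(FI, Next)); // 1 1
}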
@@ -22985,6 +23114,25 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }
 
+/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::x86mmx ||
+      N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+      N->getOperand(0)->getValueType(0) != MVT::v2i32)
+    return SDValue();
+
+  SDValue V = N->getOperand(0);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (C && C->getZExtValue() == 0 &&
+      V.getOperand(0).getValueType() == MVT::i32)
+    return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+                       N->getValueType(0), V.getOperand(0));
+
+  return SDValue();
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// into a somewhat faster sequence. For i686, the best sequence is apparently
@@ -24587,7 +24735,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
       uint64_t Mask = MaskNode->getZExtValue();
       uint64_t Shift = ShiftNode->getZExtValue();
       if (isMask_64(Mask)) {
-        uint64_t MaskSize = CountPopulation_64(Mask);
+        uint64_t MaskSize = countPopulation(Mask);
         if (Shift + MaskSize <= VT.getSizeInBits())
           return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                              DAG.getConstant(Shift | (MaskSize << 8), VT));
@@ -24720,8 +24868,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptForSize =
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   // SHLD/SHRD instructions have lower register pressure, but on some
   // platforms they have higher latency than the equivalent
@@ -25168,8 +25316,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool NoImplicitFloatOps = F->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                      && Subtarget->hasSSE2();
   if ((VT.isVector() ||
@@ -25428,11 +25575,13 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
   // F[X]OR(0.0, x) -> x
-  // F[X]OR(x, 0.0) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
+  // F[X]OR(x, 0.0) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
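The F[X]OR, FAND, and FANDN folds here and below all insist on isPosZero() rather than any zero, and for good reason: these nodes are bitwise operations on float bit patterns, and only +0.0 (all bits clear) is the identity for OR/XOR, while OR with -0.0 would set the sign bit. A quick bit-level demonstration (standalone sketch of the semantics, not LLVM code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Compute what X86ISD::FOR computes: a bitwise OR of two float patterns.
float forBits(float A, float B) {
  uint32_t X, Y;
  std::memcpy(&X, &A, 4);
  std::memcpy(&Y, &B, 4);
  uint32_t R = X | Y;
  float Out;
  std::memcpy(&Out, &R, 4);
  return Out;
}

int main() {
  std::printf("%f\n", forBits(1.5f, 0.0f));  // 1.500000: +0.0 is the identity
  std::printf("%f\n", forBits(1.5f, -0.0f)); // -1.500000: -0.0 is NOT
}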
@@ -25463,26 +25612,30 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
 
 /// Do target-specific dag combines on X86ISD::FAND nodes.
 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   // FAND(0.0, x) -> 0.0
-  // FAND(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
+
+  // FAND(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
   return SDValue();
 }
 
 /// Do target-specific dag combines on X86ISD::FANDN nodes
 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
-  // FANDN(x, 0.0) -> 0.0
   // FANDN(0.0, x) -> x
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
+  // FANDN(x, 0.0) -> 0.0
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
+
   return SDValue();
 }
 
@@ -26128,6 +26281,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND:
     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);