X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;ds=sidebyside;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=cba145208da63a4ff91e9bde0550dd7c376bb261;hb=3922da8ae8fab29de6416eeeebf21208b1491557;hp=140b2a830ce029f64529244364f1ed042ba2af04;hpb=c4788790f6c11414fb0ab97a2979f76e54dcb580;p=oota-llvm.git

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 140b2a830ce..cba145208da 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -519,10 +519,21 @@ void X86TargetLowering::resetOperationActions() {
   // If we don't have F16C support, then lower half float conversions
   // into library calls.
   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
-    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
   }
 
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
   if (Subtarget->hasPOPCNT()) {
     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
   } else {
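Without F16C, the FP16_TO_FP / FP_TO_FP16 nodes expand to the soft-float library calls (__gnu_h2f_ieee / __gnu_f2h_ieee). The following is a minimal standalone sketch of the half-to-float widening such a call performs; it illustrates the format conversion only and is not the actual compiler-rt code:

    #include <cstdint>
    #include <cstring>

    // Widen an IEEE-754 binary16 value to binary32. Illustration of the
    // format conversion only; the real runtime routine differs in details.
    static float halfToFloat(uint16_t H) {
      uint32_t Sign = (uint32_t)(H & 0x8000) << 16;
      uint32_t Exp = (H >> 10) & 0x1F;
      uint32_t Frac = H & 0x3FF;
      uint32_t Bits;
      if (Exp == 0x1F) {                  // Inf/NaN: widen to f32's max exponent.
        Bits = Sign | 0x7F800000 | (Frac << 13);
      } else if (Exp == 0 && Frac == 0) { // Signed zero.
        Bits = Sign;
      } else if (Exp == 0) {              // Subnormal: renormalize for f32.
        Exp = 127 - 15 + 1;
        while (!(Frac & 0x400)) {
          Frac <<= 1;
          --Exp;
        }
        Bits = Sign | (Exp << 23) | ((Frac & 0x3FF) << 13);
      } else {                            // Normal: rebias the exponent, 15 -> 127.
        Bits = Sign | ((Exp + 127 - 15) << 23) | (Frac << 13);
      }
      float F;
      std::memcpy(&F, &Bits, sizeof F);
      return F; // halfToFloat(0x3C00) == 1.0f, halfToFloat(0xC000) == -2.0f
    }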
@@ -878,7 +889,12 @@ void X86TargetLowering::resetOperationActions() {
                            (MVT::SimpleValueType)InnerVT, Expand);
       setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
       setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+
+      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+      // types; we have to deal with them whether we ask for Expansion or not.
+      // Setting Expand causes its own optimisation problems though, so leave
+      // them legal.
+      if (VT.getVectorElementType() == MVT::i1)
+        setLoadExtAction(ISD::EXTLOAD, VT, Expand);
     }
 
     // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -1489,6 +1505,11 @@ void X86TargetLowering::resetOperationActions() {
     }
   }// has AVX-512
 
+  if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+  }
+
   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   // of this type with custom code.
   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -2296,6 +2317,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
         RC = &X86::VK8RegClass;
       else if (RegVT == MVT::v16i1)
         RC = &X86::VK16RegClass;
+      else if (RegVT == MVT::v32i1)
+        RC = &X86::VK32RegClass;
+      else if (RegVT == MVT::v64i1)
+        RC = &X86::VK64RegClass;
       else
         llvm_unreachable("Unknown argument type!");
@@ -3043,7 +3068,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 // If a tail called function callee has more arguments than the caller the
 // caller needs to make sure that there is room to move the RETADDR to. This is
 // achieved by reserving an area the size of the argument delta right after the
-// original REtADDR, but before the saved framepointer or the spilled registers
+// original RETADDR, but before the saved framepointer or the spilled registers
 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
 // stack layout:
 //    arg1
@@ -4751,28 +4776,6 @@ bool X86::isZeroNode(SDValue Elt) {
   return false;
 }
 
-/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
-/// their permute mask.
-static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
-                                    SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> MaskVec;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = SVOp->getMaskElt(i);
-    if (Idx >= 0) {
-      if (Idx < (int)NumElems)
-        Idx += NumElems;
-      else
-        Idx -= NumElems;
-    }
-    MaskVec.push_back(Idx);
-  }
-  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
-                              SVOp->getOperand(0), &MaskVec[0]);
-}
-
 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
@@ -7694,6 +7697,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isSingleInputShuffleMask(Mask)) {
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
     //
     // FIXME: We should check for other patterns which can be widened into an
     // i16 shuffle as well.
@@ -7704,7 +7710,9 @@
         }
         return true;
       };
-    if (canWidenViaDuplication(Mask)) {
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
       SmallVector<int, 4> LoInputs;
       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                    [](int M) { return M >= 0 && M < 8; });
@@ -7722,52 +7730,57 @@
       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
 
-      int ByteMask[16];
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
       SmallDenseMap<int, int, 8> LaneMap;
-      for (int i = 0; i < 16; ++i)
-        ByteMask[i] = -1;
       for (int I : InPlaceInputs) {
-        ByteMask[I] = I;
+        PreDupI16Shuffle[I / 2] = I / 2;
         LaneMap[I] = I;
       }
-      int FreeByteIdx = 0;
-      int TargetOffset = TargetLo ? 0 : 8;
-      for (int I : MovingInputs) {
-        // Walk the free index into the byte mask until we find an unoccupied
-        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
-        // principle indicates that there *must* be a spot as we can only have
-        // 8 duplicated inputs. We have to walk the index using modular
-        // arithmetic to wrap around as necessary.
-        // FIXME: We could do a much better job of picking an inexpensive slot
-        // so this doesn't go through the worst case for the byte shuffle.
-        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
-             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
-          ;
-        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
-               "Failed to find a free byte!");
-        ByteMask[FreeByteIdx + TargetOffset] = I;
-        LaneMap[I] = FreeByteIdx + TargetOffset;
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple
+            // i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
       }
-      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
-                                ByteMask);
-      for (int &M : Mask)
-        if (M != -1)
-          M = LaneMap[M];
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
 
       // Unpack the bytes to form the i16s that will be shuffled into place.
       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
                        DL, MVT::v16i8, V1, V1);
 
-      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
       for (int i = 0; i < 16; i += 2) {
         if (Mask[i] != -1)
-          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
-        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
       }
-      return DAG.getVectorShuffle(MVT::v8i16, DL,
-                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
-                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
-    }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
   }
 
   // Check whether an interleaving lowering is likely to be more efficient.
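To see how the pre-duplication i16 shuffle gets built, here is a standalone sketch of the slot-assignment loop above, run on a hypothetical single-input mask (bytes 0 and 3 already in the low half, bytes 9 and 12 moving down, TargetLo == true):

    #include <cstdio>

    // Standalone model of the PreDupI16Shuffle construction; the input byte
    // indices below are hypothetical, chosen to exercise the slot search.
    int main() {
      const int InPlaceInputs[] = {0, 3};
      const int MovingInputs[] = {9, 12};
      int PreDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      int LaneMap[16] = {};

      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2; // Keep the i16 lane holding byte I.
        LaneMap[I] = I;
      }
      int j = 0, je = 4; // TargetLo: only slots 0..3 may be used.
      for (int M : MovingInputs) {
        // Reuse slot j when two adjacent bytes land in the same i16 lane.
        if (PreDupI16Shuffle[j] != M / 2) {
          while (j < je && PreDupI16Shuffle[j] != -1)
            ++j;            // Search for a free i16 slot.
          if (j == je)
            return 1;       // The real code bails with SDValue() here.
          PreDupI16Shuffle[j] = M / 2;
        }
        LaneMap[M] = 2 * j + M % 2; // Byte position after the i16 shuffle.
      }
      for (int M : PreDupI16Shuffle)
        std::printf("%d ", M); // Prints: 0 1 4 6 -1 -1 -1 -1
      std::printf("\nbyte 9 -> %d, byte 12 -> %d\n", LaneMap[9], LaneMap[12]);
      return 0;              // byte 9 -> 5, byte 12 -> 6
    }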
@@ -7815,16 +7828,42 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
 
-  auto buildLoAndHiV8s =
-      [&](SDValue V, ArrayRef<int> LoBlendMask, ArrayRef<int> HiBlendMask) {
-    SDValue LoV =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                    DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
-    SDValue HiV =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                    DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
-    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, LoBlendMask);
-    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, HiBlendMask);
+  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+                             MutableArrayRef<int> HiBlendMask) {
+    SDValue V1, V2;
+    // Check if any of the odd lanes in the v16i8 are used. If not, we can
+    // mask them out and avoid using UNPCK{L,H} to extract the elements of V
+    // as i16s.
+    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
+        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; })) {
+      // Use a mask to drop the high bytes.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+                       DAG.getConstant(0x00FF, MVT::v8i16));
+
+      // This will be a single vector shuffle instead of a blend so nuke V2.
+      V2 = DAG.getUNDEF(MVT::v8i16);
+
+      // Squash the masks to point directly into V1.
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half
+      // into V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
     return std::make_pair(BlendedLo, BlendedHi);
   };
   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
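The std::none_of check above enables the cheaper path: when no odd byte lane is referenced, AND-ing each i16 lane with 0x00FF zero-extends the even bytes in place (byte 2*i is the low byte of i16 lane i on little-endian x86), which is why the blend masks can then simply be halved. A scalar sketch of that equivalence, under those little-endian assumptions:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Model of the "mask instead of unpack" path: dropping the odd (high)
    // byte of every i16 lane leaves the even bytes zero-extended in place.
    int main() {
      uint8_t Bytes[16];
      for (int i = 0; i < 16; ++i)
        Bytes[i] = uint8_t(0x10 + i);

      uint16_t Lanes[8];
      std::memcpy(Lanes, Bytes, sizeof Lanes);
      for (uint16_t &L : Lanes)
        L &= 0x00FF; // Drop the odd (high) byte of every lane.

      for (int i = 0; i < 8; ++i)
        std::printf("lane %d = %#06x  (byte %d zero-extended)\n", i, Lanes[i],
                    2 * i);
      return 0;
    }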
@@ -7900,7 +7939,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   // but in some cases the first operand may be transformed to UNDEF.
   // In this case we should just commute the node.
   if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // Check for non-undef masks pointing at an undef vector and make the masks
   // undef as well. This makes it easier to match the shuffle based solely on
@@ -7946,7 +7985,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   // V2. This allows us to match the shuffle pattern strictly on how many
   // elements come from V1 without handling the symmetric cases.
   if (NumV2Elements > NumV1Elements)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // When the number of V1 and V2 elements are the same, try to minimize the
   // number of uses of V2 in the low half of the vector.
@@ -7958,7 +7997,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
       else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
-      return CommuteVectorShuffle(SVOp, DAG);
+      return DAG.getCommutedVectorShuffle(*SVOp);
   }
 
   // For each vector width, delegate to a specialized lowering routine.
@@ -9242,7 +9281,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // but in some cases the first operand may be transformed to UNDEF.
   // In this case we should just commute the node.
   if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // Vector shuffle lowering takes 3 steps:
   //
@@ -9354,7 +9393,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 
   if (ShouldXformToMOVHLPS(M, VT) ||
       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   if (isShift) {
     // No better options. Use a vshldq / vsrldq.
@@ -9426,7 +9465,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 
   // Normalize the node to match x86 shuffle ops if needed
   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
   // inlined here right now to enable us to directly emit target specific
@@ -15120,10 +15159,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
          (VT == MVT::v8i32 && Subtarget->hasInt256()));
 
-  // Get the high parts.
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widened result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  //       => <2 x i64> <ae|cg>
+  //
+  // In other words, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, we need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
-  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
-  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
 
   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   // ints.
@@ -15131,22 +15183,39 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   unsigned Opcode =
       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  //         => <2 x i64> <ae|cg>
   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  //         => <2 x i64> <bf|dh>
   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
 
   // Shuffle it back into the right order.
+  // The internal representation is big endian.
+  // In other words, an i64 bitcasted to 2 x i32 has its high part at index 0
+  // and its low part at index 1.
+  // Moreover, we have:  Mul1 = <ae|cg>  ;  Mul2 = <bf|dh>
+  //        Vector index    0     1           2     3
+  // We want:             <ae|bf|cg|dh>
+  //        Vector index    0  2  1  3
+  // Since each element is seen as 2 x i32, we get:
+  //   high_mask[i] = 2 x vector_index[i]
+  //   low_mask[i]  = 2 x vector_index[i] + 1
+  //   where vector_index = {0, Size/2, 1, Size/2 + 1, ...,
+  //                         Size/2 - 1, Size/2 + Size/2 - 1}
+  //   where Size is the number of elements of the final vector.
   SDValue Highs, Lows;
   if (VT == MVT::v8i32) {
-    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   } else {
-    const int HighMask[] = {1, 5, 3, 7};
+    const int HighMask[] = {0, 4, 2, 6};
     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-    const int LowMask[] = {0, 4, 2, 6};
+    const int LowMask[] = {1, 5, 3, 7};
     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
   }
@@ -15164,7 +15233,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   }
 
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The low part of a MUL_LOHI is supposed to be the first value and the
+  // high part the second value.
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs);
 }
 
 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
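A scalar model of this whole lowering may help: two PMULUDQ-style widening multiplies, one on the even lanes and one on the odd lanes shifted into even position, followed by the even/odd interleave, with the low parts ordered first. A sketch in plain C++, not DAG code:

    #include <cstdint>
    #include <cstdio>

    // Scalar model of the v4i32 UMUL_LOHI lowering: PMULUDQ only reads lanes
    // 0 and 2 of each source, so Mul1 covers the even lanes and Mul2 the odd
    // lanes (after the shift-left-by-one-lane shuffle above).
    int main() {
      uint32_t A[4] = {0x89ABCDEF, 2, 0xFFFFFFFF, 7};
      uint32_t B[4] = {0x12345678, 3, 0xFFFFFFFF, 9};

      uint64_t Mul1[2] = {(uint64_t)A[0] * B[0], (uint64_t)A[2] * B[2]};
      uint64_t Mul2[2] = {(uint64_t)A[1] * B[1], (uint64_t)A[3] * B[3]};

      uint32_t Highs[4], Lows[4];
      const uint64_t *Mul[2] = {Mul1, Mul2};
      for (int i = 0; i < 4; ++i) {
        uint64_t P = Mul[i % 2][i / 2]; // Lane i's full 64-bit product.
        Highs[i] = uint32_t(P >> 32);
        Lows[i] = uint32_t(P);
      }
      for (int i = 0; i < 4; ++i)
        std::printf("lane %d: lo=%#010x hi=%#010x\n", i, Lows[i], Highs[i]);
      return 0; // Lane 2 shows 0xFFFFFFFF^2: lo=0x00000001 hi=0xFFFFFFFE.
    }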
@@ -16812,6 +16883,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
           isSHUFPMask(M, SVT) ||
           isPSHUFDMask(M, SVT) ||
           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -18456,6 +18528,39 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
       return false;
 
       continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
     }
 
     // Break out of the loop if we break out of the switch.
     break;
@@ -18472,7 +18577,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
   for (int &M : Mask)
     M = VMask[M];
-  V = DAG.getNode(X86ISD::PSHUFD, DL, V.getValueType(), V.getOperand(0),
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                   getV4X86ShuffleImm8ForMask(Mask, DAG));
 
   // It is possible that one of the combinable shuffles was completely absorbed
@@ -18625,6 +18730,47 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
     }
 
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
+        }
+      }
+    }
+
     break;
 
   case X86ISD::PSHUFD:
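The mask algebra in that combine can be checked in isolation. A sketch with a hypothetical shuffle triple whose composition collapses to the unpack-lo pattern: a PSHUFD with dword mask {0,1,0,1}, a PSHUFLW with word mask {0,0,1,1} (N), and a PSHUFHW with word mask {2,2,3,3} (V, stored as 6,6,7,7 once offset into the high half):

    #include <algorithm>
    #include <cstdio>

    // Compose the word masks with the dword mask, exactly as in the combine,
    // and test the result against the unpack-lo pattern.
    int main() {
      const int DMask[4] = {0, 1, 0, 1};
      const int WordMask[8] = {0, 0, 1, 1, 6, 6, 7, 7};

      int MappedMask[8];
      for (int i = 0; i < 8; ++i)
        MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;

      const int UnpackLoMask[8] = {0, 0, 1, 1, 2, 2, 3, 3};
      std::printf("matches UNPCKL: %d\n", // Prints 1: the three shuffles
                  std::equal(MappedMask, MappedMask + 8, UnpackLoMask));
      return 0;                           // fold into a single unpcklwd.
    }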
@@ -18646,49 +18792,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
-  // Canonicalize shuffles that perform 'addsub' on packed float vectors
-  // according to the rule:
-  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
-  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
-  //
-  // Where 'Mask' is:
-  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
-  //  <0,3>                 -- for v2f64 shuffles;
-  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
-  //
-  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
-  // during ISel stage.
-  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
-      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
-      // Operands to the FADD and FSUB must be the same.
-      ((N0->getOperand(0) == N1->getOperand(0) &&
-        N0->getOperand(1) == N1->getOperand(1)) ||
-       // FADD is commutable. See if by commuting the operands of the FADD
-       // we would still be able to match the operands of the FSUB dag node.
-       (N0->getOperand(1) == N1->getOperand(0) &&
-        N0->getOperand(0) == N1->getOperand(1))) &&
-      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
-      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
-
-    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
-    unsigned NumElts = VT.getVectorNumElements();
-    ArrayRef<int> Mask = SV->getMask();
-    bool CanFold = true;
-
-    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
-      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
-
-    if (CanFold) {
-      SDValue Op0 = N1->getOperand(0);
-      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
-      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
-      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
-      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
-    }
-  }
-
   // Don't create instructions with illegal types after legalize types has run.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -21693,8 +21796,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away the operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //    AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation or if the operand of the
+  // unary operation isn't a bitwise AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                         const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
   SDValue Op0 = N->getOperand(0);
   EVT InVT = Op0->getValueType(0);
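The fold is sound because every lane of a vector SETCC is all-zeros or all-ones, so AND-ing the mask with a constant and then converting gives the same bits as AND-ing the mask with the pre-converted constant (0 converts to 0.0, whose bit pattern is also all zeros). A scalar sketch of the SINT_TO_FP case:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar model of the combine: for each possible lane value of the
    // compare mask, the original and transformed forms produce the same bits.
    int main() {
      const int32_t C = 42;      // The splatted constant.
      const float CF = (float)C; // UNARYOP(constant), folded ahead of time.
      uint32_t CFBits;
      std::memcpy(&CFBits, &CF, sizeof CFBits);

      const uint32_t Masks[] = {0u, ~0u}; // The only possible lane values.
      for (uint32_t Mask : Masks) {
        float Slow = (float)(int32_t)(Mask & (uint32_t)C); // Original form.
        uint32_t FastBits = Mask & CFBits;                 // Transformed form.
        float Fast;
        std::memcpy(&Fast, &FastBits, sizeof Fast);
        std::printf("mask=%#010x: original=%g transformed=%g\n", Mask, Slow,
                    Fast);
      }
      return 0; // Both print 0 for mask 0 and 42 for mask ~0.
    }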