+ if (i < Size / 2)
+ ++NumLoInputs[InputIdx];
+ else
+ ++NumHiInputs[InputIdx];
+
+ if ((i % 2) == 0)
+ ++NumEvenInputs[InputIdx];
+ else
+ ++NumOddInputs[InputIdx];
+ }
+
+ // Compute the minimum number of cross-input results for both the
+ // interleaved and split cases. If interleaving results in strictly fewer
+ // cross-input results, prefer it and return true.
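+ //
+ // For example, with the 4-element mask {0, 4, 1, 5} every even result slot
+ // comes from input 0 and every odd slot from input 1, so InterleavedCrosses
+ // is 0 while SplitCrosses is 2 and we return true.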
+ int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
+ NumEvenInputs[0] + NumOddInputs[1]);
+ int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
+ NumLoInputs[0] + NumHiInputs[1]);
+ return InterleavedCrosses < SplitCrosses;
+}
+
+/// \brief Blend two v8i16 vectors using a naive unpack strategy.
+///
+/// This strategy only works when the inputs from each vector fit into a single
+/// half of that vector, and there are not so many inputs as to leave the
+/// required in-place shuffles highly constrained (and thus expensive). It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
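+///
+/// For example, with the mask {0, 1, 8, 9, -1, -1, -1, -1} all four inputs
+/// already sit in the low halves of V1 and V2, so a single UNPCKL merges
+/// them and only a single-input shuffle of the merged vector remains.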
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+ SDValue V2,
+ MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+ for (int i = 0; i < 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] < 4)
+ LoV1Inputs.push_back(i);
+ else if (Mask[i] >= 4 && Mask[i] < 8)
+ HiV1Inputs.push_back(i);
+ else if (Mask[i] >= 8 && Mask[i] < 12)
+ LoV2Inputs.push_back(i);
+ else if (Mask[i] >= 12)
+ HiV2Inputs.push_back(i);
+
+ int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+ int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+ (void)NumV1Inputs;
+ (void)NumV2Inputs;
+ assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+ bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+ HiV1Inputs.size() + HiV2Inputs.size();
+
+ auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+ ArrayRef<int> HiInputs, bool MoveToLo,
+ int MaskOffset) {
+ ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+ ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+ if (BadInputs.empty())
+ return V;
+
+ int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int MoveOffset = MoveToLo ? 0 : 4;
+
+ if (GoodInputs.empty()) {
+ for (int BadInput : BadInputs) {
+ MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+ Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+ }
+ } else {
+ if (GoodInputs.size() == 2) {
+ // If the low inputs are spread across two dwords, pack them into
+ // a single dword.
+ MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+ Mask[GoodInputs[0]] - MaskOffset;
+ MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+ Mask[GoodInputs[1]] - MaskOffset;
+ Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+ Mask[GoodInputs[1]] = Mask[GoodInputs[1]] % 2 + MoveOffset + MaskOffset;
+ } else {
+ // Otherwise pin the low inputs.
+ for (int GoodInput : GoodInputs)
+ MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
+ }
+
+ int MoveMaskIdx =
+ std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+ std::begin(MoveMask);
+ assert(MoveMaskIdx >= MoveOffset && "Established above");
+
+ if (BadInputs.size() == 2) {
+ assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
+ assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
+ MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
+ Mask[BadInputs[0]] - MaskOffset;
+ MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
+ Mask[BadInputs[1]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
+ Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+ } else {
+ assert(BadInputs.size() == 1 && "All sizes handled");
+ MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+ }
+ }
+
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+ MoveMask);
+ };
+ V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
+ /*MaskOffset*/ 0);
+ V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
+ /*MaskOffset*/ 8);
+
+ // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
+ // cross-half traffic in the final shuffle.
+
+ // Munge the mask to be a single-input mask after the unpack merges the
+ // results.
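+ // After the unpack merge, element j of the used half of V1 lands in result
+ // slot 2*j and element j of the used half of V2 in slot 2*j + 1; (M % 4)
+ // recovers j and (M / 8) selects between the V1 and V2 elements.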
+ for (int &M : Mask)
+ if (M != -1)
+ M = 2 * (M % 4) + (M / 8);
+
+ return DAG.getVectorShuffle(
+ MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, MVT::v8i16, V1, V2),
+ DAG.getUNDEF(MVT::v8i16), Mask);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the
+/// shuffle is significantly cheaper when lowered as an interleaving of the
+/// two inputs, it interleaves them. Otherwise, it blends the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenates them.
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+ MutableArrayRef<int> Mask(MaskStorage);
+
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ auto isV2 = [](int M) { return M >= 8; };
+
+ int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
+ int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+
+ if (NumV2Inputs == 0)
+ return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+
+ assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
+ "to be V1-input shuffles.");
+
+ if (NumV1Inputs + NumV2Inputs <= 4)
+ return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: Handle 1x, 2x, and 4x interleaving.
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
+ int EMask[8], OMask[8];
+ for (int i = 0; i < 4; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 4] = -1;
+ OMask[i + 4] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
+
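+ // Evens now holds the even-numbered result slots in its low half and Odds
+ // the odd-numbered ones, so UNPCKL re-interleaves them into the final
+ // order.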
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
+ }
+
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+ for (int i = 0; i < 4; ++i) {
+ LoBlendMask[i] = Mask[i];
+ HiBlendMask[i] = Mask[i + 4];
+ }
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
+ HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity-reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ int MaskStorage[16] = {
+ OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
+ OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
+ OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
+ MutableArrayRef<int> Mask(MaskStorage);
+ MutableArrayRef<int> LoMask = Mask.slice(0, 8);
+ MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (isSingleInputShuffleMask(Mask)) {
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
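+ //
+ // For example, a byte splat such as {3, 3, 3, ..., 3} passes the check
+ // below: every pair of result bytes duplicates a single source byte, so
+ // after duplicating bytes within V1 the shuffle becomes an i16 shuffle.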
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != Mask[i + 1])
+ return false;
+ }
+ return true;
+ };
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 8; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size();
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap;
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4;
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+ // Check if j is already a shuffle of this input. This happens when
+ // there are two adjacent bytes after we move the low one.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] != -1)
+ ++j;
+
+ if (j == je)
+ // We can't place the inputs into a single half with a simple i16
+ // shuffle, so bail.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
+ V1 = DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, V1, V1);
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != -1)
+ PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: We need to handle other interleaving widths (i16, i32, ...).
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
+ int EMask[16], OMask[16];
+ for (int i = 0; i < 8; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 8] = -1;
+ OMask[i + 8] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
+
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+ }
+
+ int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
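+ // Split each half of the final mask into a per-input v8i16 shuffle mask
+ // (byte indices into that input) and rewrite the half mask to blend the two
+ // shuffled results: value i selects slot i of the shuffled V1 half and
+ // value i + 8 selects slot i of the shuffled V2 half.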
+ auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> V1HalfBlendMask,
+ MutableArrayRef<int> V2HalfBlendMask) {
+ for (int i = 0; i < 8; ++i)
+ if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
+ V1HalfBlendMask[i] = HalfMask[i];
+ HalfMask[i] = i;
+ } else if (HalfMask[i] >= 16) {
+ V2HalfBlendMask[i] = HalfMask[i] - 16;
+ HalfMask[i] = i + 8;
+ }
+ };
+ buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
+ buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+ auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+ MutableArrayRef<int> HiBlendMask) {
+ SDValue V1, V2;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke V2.
+ V2 = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into V1.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into V1 and the high half into
+ // V2 so that we can blend them as i16s.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+ SDValue BlendedLo =
+ DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue BlendedHi =
+ DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ return std::make_pair(BlendedLo, BlendedHi);
+ };
+ SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+ std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+ std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v2i64:
+ return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v2f64:
+ return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i32:
+ return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4f32:
+ return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i16:
+ return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i8:
+ return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Unimplemented!");
+ }
+}
+
+/// \brief Tiny helper function to test whether adjacent masks are sequential.
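+///
+/// For example, {4, 5, 0, 1, 6, 7, 2, 3} is pair-wise sequential, so the
+/// caller can rewrite it as the mask {2, 0, 3, 1} on elements of twice the
+/// width (see the widening transform in lowerVectorShuffle below).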
+static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ if (Mask[i] + 1 != Mask[i+1])
+ return false;
+
+ return true;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef)
+ for (int M : Mask)
+ if (M >= NumElements) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ }
+
+ // For integer vector shuffles, try to collapse them into a shuffle of fewer
+ // lanes but wider integers. We cap this to not form integers larger than i64
+ // but it might be interesting to form i128 integers to handle flipping the
+ // low and high halves of AVX 256-bit vectors.
+ if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+ areAdjacentMasksSequential(Mask)) {
+ SmallVector<int, 8> NewMask;
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ NewMask.push_back(Mask[i] / 2);
+ MVT NewVT =
+ MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+ VT.getVectorNumElements() / 2);
+ V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+ }
+
+ int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+ for (int M : SVOp->getMask())
+ if (M < 0)
+ ++NumUndefElements;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.getSizeInBits() == 128)
+ return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ llvm_unreachable("Unimplemented!");
+}
+
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This code is the legacy code handling vector shuffles until the above
+// replaces its functionality and performance.
+//===----------------------------------------------------------------------===//
+
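+// Returns true if the shuffle described by MaskVals can be lowered to a blend
+// with an immediate, storing the immediate to MaskOut when provided. Bit i of
+// the immediate selects element i from the second input; for example, the
+// v4i32 mask <0, 5, 2, 7> yields the immediate 0b1010.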
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+ bool hasInt256, unsigned *MaskOut = nullptr) {
+ MVT EltVT = VT.getVectorElementType();
+
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return false;
+
+ if (!hasSSE41 || EltVT == MVT::i8)
+ return false;
+ if (!hasInt256 && VT == MVT::v16i16)
+ return false;
+
+ unsigned MaskValue = 0;
+ unsigned NumElems = VT.getVectorNumElements();
+ // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ unsigned NumElemsInLane = NumElems / NumLanes;
+
+ // Blend for v16i16 should be symmetric for both lanes.
+ for (unsigned i = 0; i < NumElemsInLane; ++i) {
+
+ int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+ int EltIdx = MaskVals[i];
+
+ if ((EltIdx < 0 || EltIdx == (int)i) &&
+ (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+ continue;
+
+ if (((unsigned)EltIdx == (i + NumElems)) &&
+ (SndLaneEltIdx < 0 ||
+ (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+ MaskValue |= (1 << i);
+ else
+ return false;
+ }
+
+ if (MaskOut)
+ *MaskOut = MaskValue;
+ return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction. This function
+// assumes that isBlendMask has returned true for this ShuffleVectorSDNode.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ unsigned MaskValue,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+ Subtarget->hasInt256()) &&
+ "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask");
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Convert i32 vectors to floating point if it is not AVX2.
+ // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+ MVT BlendVT = VT;
+ if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
+ V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
+ }
+
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+ DAG.getConstant(MaskValue, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls in a different 128-bit lane than the element at \p OutputIdx.
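+///
+/// For example, in v16i16 element 7 lies in lane 0 while element 8 lies in
+/// lane 1 (16-bit elements, eight per 128-bit lane), so moving element 8
+/// into slot 7 crosses a lane.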
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+ unsigned OutputIdx) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
+
+/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
+/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+ SelectionDAG &DAG) {
+ MVT VT = V1.getSimpleValueType();
+ assert(VT.is128BitVector() || VT.is256BitVector());
+
+ MVT EltVT = VT.getVectorElementType();
+ unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 32> PshufbMask;
+ for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+ int InputIdx = MaskVals[OutputIdx];
+ unsigned InputByteIdx;
+
+ if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+ InputByteIdx = 0x80;
+ else {
+ // Cross lane is not allowed.
+ if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+ return SDValue();
+ InputByteIdx = InputIdx * EltSizeInBytes;
+ // Index is a byte offset within the 128-bit lane.
+ InputByteIdx &= 0xf;
+ }
+
+ for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+ PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+ if (InputByteIdx != 0x80)
+ ++InputByteIdx;
+ }
+ }
+
+ MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+ if (ShufVT != VT)
+ V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+ return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all] pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+ SmallVector<int, 8> MaskVals;
+
+ // Determine if more than 1 of the words in each of the low and high quadwords
+ // of the result come from the same quadword of one of the two inputs. Undef
+ // mask values count as coming from any quadword, for better codegen.
+ //
+ // LoQuad[i] (resp. HiQuad[i]) counts how many words of the low (resp. high)
+ // quad of the result come from the ith quad of the inputs. For i, 0 and 1
+ // refer to V1, 2 and 3 refer to V2.
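+ //
+ // For example, the mask <0, 1, 4, 5, 10, 11, 14, 15> feeds the low quad of
+ // the result entirely from quads 0 and 1 of V1 and the high quad from quads
+ // 2 and 3 of V2, yielding BestLoQuad = 0 and BestHiQuad = 2 below.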
+ unsigned LoQuad[] = { 0, 0, 0, 0 };
+ unsigned HiQuad[] = { 0, 0, 0, 0 };
+ // Indices of quads used.
+ std::bitset<4> InputQuads;
+ for (unsigned i = 0; i < 8; ++i) {
+ unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+ int EltIdx = SVOp->getMaskElt(i);
+ MaskVals.push_back(EltIdx);
+ if (EltIdx < 0) {
+ ++Quad[0];
+ ++Quad[1];
+ ++Quad[2];
+ ++Quad[3];
+ continue;
+ }
+ ++Quad[EltIdx / 4];
+ InputQuads.set(EltIdx / 4);
+ }
+
+ int BestLoQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LoQuad[i] > MaxQuad) {
+ BestLoQuad = i;
+ MaxQuad = LoQuad[i];
+ }
+ }
+
+ int BestHiQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HiQuad[i] > MaxQuad) {
+ BestHiQuad = i;
+ MaxQuad = HiQuad[i];
+ }
+ }