DAG.getConstant(InsertPSMask, MVT::i8));
}
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction.
+static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+ return M >= 0 && M % Size < Size / 2;
+ });
+ int NumHiInputs = std::count_if(
+ Mask.begin(), Mask.end(), [Size](int M) { return M % Size > Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // We only handle the case where V1 feeds even mask slots and V2 feeds odd
+ // mask slots. We rely on canonicalization to ensure this is the case.
+ if ((i % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ SmallVectorImpl<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
+ VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
+ }
+
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+ return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V1,
+ V2);
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
Mask, DAG);
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+ return Unpack;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would encur if we
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, DAG);
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ return Unpack;
+
int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};