[x86] Teach the shuffle mask equivalence test to look through build vectors

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 4e8616fb138adc7e2989f82ee315cc999699f490..31b1ada9a855594cf2e0d1a62b1bba01f2e2b389 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7368,13 +7368,25 @@ namespace {
  /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
  ///
  /// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
+bool isShuffleEquivalentImpl(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+                             ArrayRef<const int *> Args) {
    if (Mask.size() != Args.size())
      return false;
+
+  // If the values are build vectors, we can look through them: two different
+  // mask indices are still equivalent when they select the same operand.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+  // Indices below e refer to V1, the rest to V2; "% e" yields the operand.
    for (int i = 0, e = Mask.size(); i < e; ++i) {
      assert(*Args[i] >= 0 && "Arguments must be positive integers!");
-    if (Mask[i] != -1 && Mask[i] != *Args[i])
-      return false;
+    if (Mask[i] != -1 && Mask[i] != *Args[i]) {
+      auto *MaskBV = Mask[i] < e ? BV1 : BV2;
+      auto *ArgsBV = *Args[i] < e ? BV1 : BV2;
+      if (!MaskBV || !ArgsBV ||
+          MaskBV->getOperand(Mask[i] % e) != ArgsBV->getOperand(*Args[i] % e))
+        return false;
+    }
    }
    return true;
  }
@@ -7391,8 +7403,9 @@ bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
  /// It returns true if the mask is exactly as wide as the argument list, and
  /// each mask element is -1 (undef), equals its argument, or -- when V1/V2
  /// are build vectors -- selects the same operand as that argument would.
-static const VariadicFunction1<
-    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+static const VariadicFunction3<bool, SDValue, SDValue, ArrayRef<int>, int,
+                               isShuffleEquivalentImpl> isShuffleEquivalent =
+    {};
  
  /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
  ///
@@ -8242,6 +8255,10 @@ static SDValue lowerVectorShuffleAsElementInsertion(
                         ExtVT, V1, V2);
    }
  
+  // This lowering only works for the low element with floating point vectors.
+  if (VT.isFloatingPoint() && V2Index != 0)
+    return SDValue();
+
    V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
    if (ExtVT != VT)
      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
@@ -8444,7 +8461,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isSingleInputShuffleMask(Mask)) {
      // Use low duplicate instructions for masks that match their pattern.
      if (Subtarget->hasSSE3())
-      if (isShuffleEquivalent(Mask, 0, 0))
+      if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
          return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
  
      // Straight shuffle of a single input vector. Simulate this by using the
@@ -8464,12 +8481,6 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
    assert(Mask[1] >= 2 && "Non-canonicalized blend!");
  
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
    // If we have a single input, insert that into V1 if we can do so cheaply.
    if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
      if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
@@ -8486,7 +8497,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // Try to use one of the special instruction patterns to handle two common
    // blend patterns if a zero-blend above didn't work.
-  if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
      if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
        // We can either use a special instruction to load over the low double or
        // to move just the low double.
@@ -8500,6 +8511,12 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
    unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
    return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                       DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -8561,17 +8578,17 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        return Insertion;
    }
  
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
-
    if (Subtarget->hasSSE41())
      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                    Subtarget, DAG))
        return Blend;
  
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+
    // Try to use byte rotation instructions.
    // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8599,9 +8616,9 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  
    // To lower with a single SHUFPS we need to have the low half and high half
    // each requiring a single input.
-  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4 != Mask[1] < 4))
+  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
      return false;
-  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4 != Mask[3] < 4))
+  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
      return false;
  
    return true;
@@ -8724,9 +8741,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
      // Use even/odd duplicate instructions for masks that match their pattern.
      if (Subtarget->hasSSE3()) {
-      if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+      if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
          return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
-      if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
+      if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
          return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
      }
  
@@ -8743,12 +8760,6 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }
  
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
    // There are special ways we can lower some single-element blends. However, we
    // have custom ways we can lower more complex single-element blends below that
    // we defer to if both this and BLENDPS fail to match, so restrict this to
@@ -8774,6 +8785,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          return BlendPerm;
    }
  
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
    // Otherwise fall back to a SHUFPS lowering strategy.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
  }
@@ -8816,9 +8833,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      // so prevents folding a load into this instruction or making a copy.
      const int UnpackLoMask[] = {0, 0, 1, 1};
      const int UnpackHiMask[] = {2, 2, 3, 3};
-    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
        Mask = UnpackLoMask;
-    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+    else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
        Mask = UnpackHiMask;
  
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -8851,9 +8868,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return Masked;
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  
    // Try to use byte rotation instructions.
@@ -8930,9 +8947,9 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
      return Shift;
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
+  if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
-  if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
+  if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
  
    // Try to use byte rotation instructions.
@@ -9567,9 +9584,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return Masked;
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
-  if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
  
    // Try to use byte rotation instructions.
@@ -10139,14 +10156,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
    MVT ScalarVT = VT.getScalarType();
    MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
  
-  SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
-                             DAG.getIntPtrConstant(SplitNumElements));
-  SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(0));
-  SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
-                             DAG.getIntPtrConstant(SplitNumElements));
+  // Split V into low/high halves, each bitcast to SplitVT. Build vectors are
+  // rebuilt as two narrower build vectors; this helps with splats and zeros.
+  auto SplitVector = [&](SDValue V) {
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V->getOperand(0);
+    // V may now have a pre-bitcast type, so derive the split type from it.
+    MVT OrigVT = V.getSimpleValueType();
+    int OrigNumElements = OrigVT.getVectorNumElements();
+    int OrigSplitNumElements = OrigNumElements / 2;
+    MVT OrigScalarVT = OrigVT.getScalarType();
+    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+    SDValue LoV, HiV;
+
+    auto *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV) {
+      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(0));
+      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(OrigSplitNumElements));
+    } else {
+      // Build vector: divide its operands between two half-width vectors.
+      SmallVector<SDValue, 16> LoOps, HiOps;
+      for (int i = 0; i < OrigSplitNumElements; ++i) {
+        LoOps.push_back(BV->getOperand(i));
+        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+      }
+      LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+      HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+    }
+    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
+                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+  };
+
+  SDValue LoV1, HiV1, LoV2, HiV2;
+  std::tie(LoV1, HiV1) = SplitVector(V1);
+  std::tie(LoV2, HiV2) = SplitVector(V2);
  
    // Now create two 4-way blends of these half-width vectors.
    auto HalfBlend = [&](ArrayRef<int> HalfMask) {
@@ -10342,15 +10388,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                                 VT.getVectorNumElements() / 2);
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
-  if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
-      isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
+      isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
-  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
@@ -10489,7 +10535,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        return Broadcast;
  
      // Use low duplicate instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
        return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
  
      if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
@@ -10513,9 +10559,9 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // X86 has dedicated unpack instructions that can handle specific blend
    // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
  
    // If we have a single input to the zero element, insert that into V1 if we
@@ -10619,9 +10665,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      }
  
      // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
        return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-    if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+    if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
        return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
    }
  
@@ -10677,9 +10723,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             "Repeated masks must be half the mask width!");
  
      // Use even/odd duplicate instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
-    if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+    if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
  
      if (isSingleInputShuffleMask(Mask))
@@ -10687,9 +10733,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                           getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
  
      // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
        return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
        return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
  
      // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
@@ -10783,9 +10829,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                           getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
  
      // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+    if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
        return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
-    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+    if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
        return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
    }
  
@@ -10849,13 +10895,13 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return Blend;
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            // First 128-bit lane:
                            0, 16, 1, 17, 2, 18, 3, 19,
                            // Second 128-bit lane:
                            8, 24, 9, 25, 10, 26, 11, 27))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            // First 128-bit lane:
                            4, 20, 5, 21, 6, 22, 7, 23,
                            // Second 128-bit lane:
@@ -10939,14 +10985,14 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    // Note that these are repeated 128-bit lane unpacks, not unpacks across all
    // 256-bit lanes.
    if (isShuffleEquivalent(
-          Mask,
+          V1, V2, Mask,
            // First 128-bit lane:
            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
            // Second 128-bit lane:
            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
    if (isShuffleEquivalent(
-          Mask,
+          V1, V2, Mask,
            // First 128-bit lane:
            8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
            // Second 128-bit lane:
@@ -11051,9 +11097,9 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // X86 has dedicated unpack instructions that can handle specific blend
    // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
  
    // FIXME: Implement direct support for this type!
@@ -11072,11 +11118,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            0, 16, 1, 17, 4, 20, 5, 21,
                            8, 24, 9, 25, 12, 28, 13, 29))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            2, 18, 3, 19, 6, 22, 7, 23,
                            10, 26, 11, 27, 14, 30, 15, 31))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
@@ -11098,9 +11144,9 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // X86 has dedicated unpack instructions that can handle specific blend
    // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
  
    // FIXME: Implement direct support for this type!
@@ -11119,11 +11165,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  
    // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            0, 16, 1, 17, 4, 20, 5, 21,
                            8, 24, 9, 25, 12, 28, 13, 29))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
-  if (isShuffleEquivalent(Mask,
+  if (isShuffleEquivalent(V1, V2, Mask,
                            2, 18, 3, 19, 6, 22, 7, 23,
                            10, 26, 11, 27, 14, 30, 15, 31))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
@@ -11256,6 +11302,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
          return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
        }
  
+  // We actually see shuffles that are entirely re-arrangements of a set of
+  // zero inputs. This mostly happens while decomposing complex shuffles into
+  // simple ones. Directly lower these as a buildvector of zeros.
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  if (Zeroable.all())
+    return getZeroVector(VT, Subtarget, DAG, dl);
+
    // Try to collapse shuffles into using a vector type with fewer elements but
    // wider element types. We cap this to not form integers or floating point
    // elements wider than 64 bits, but it might be interesting to form i128
@@ -22829,9 +22882,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
  
    // We're looking for blends between FADD and FSUB nodes. We insist on these
    // nodes being lined up in a specific expected pattern.
-  if (!(isShuffleEquivalent(Mask, 0, 3) ||
-        isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
-        isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+  if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
+        isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
+        isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
      return SDValue();
  
    // Only specific types are legal at this point, assert so we notice if and