Allow AVX vrsqrtps generation.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 552d420b8053328c2a719784faf2a0f39a482f15..26b1e6d51ea1769f08067fb98ac6725d9a239e20 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -67,7 +67,7 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
      cl::Hidden);
  
  static cl::opt<bool> ExperimentalVectorShuffleLowering(
-    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    "x86-experimental-vector-shuffle-lowering", cl::init(true),
      cl::desc("Enable an experimental vector shuffle lowering code path."),
      cl::Hidden);
  
@@ -213,8 +213,8 @@ static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
  
  // FIXME: This should stop caching the target machine as soon as
  // we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
    Subtarget = &TM.getSubtarget<X86Subtarget>();
    X86ScalarSSEf64 = Subtarget->hasSSE2();
    X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -811,6 +811,8 @@ void X86TargetLowering::resetOperationActions() {
    setOperationAction(ISD::FLOG10, MVT::f80, Expand);
    setOperationAction(ISD::FEXP, MVT::f80, Expand);
    setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  
    // First set operation action for all vector types to either promote
    // (for widening) or expand (for scalarization). Then we will selectively
@@ -1560,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() {
  
      setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
      setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
    }
  
    // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
@@ -1594,9 +1597,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::UMULO, VT, Custom);
    }
  
-  // There are no 8-bit 3-address imul/mul instructions
-  setOperationAction(ISD::SMULO, MVT::i8, Expand);
-  setOperationAction(ISD::UMULO, MVT::i8, Expand);
  
    if (!Subtarget->is64Bit()) {
      // These libcalls are not available in 32-bit.
@@ -7448,7 +7448,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
  /// \brief Try to lower a vector shuffle as a byte rotation.
  ///
  /// We have a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of a the concatentation of two vectors. This routine will
+/// byte-rotation of the concatenation of two vectors. This routine will
  /// try to generically lower a vector shuffle through such an instruction. It
  /// does not check for the availability of PALIGNR-based lowerings, only the
  /// applicability of this strategy to the given mask. This matches shuffle
@@ -7737,6 +7737,39 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    return SDValue();
  }
  
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+  // If the bitcasts shift the element size, we can't extract an equivalent
+  // element from it.
+  MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
+    return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
+
+  return SDValue();
+}
+
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  return ISD::isNON_EXTLoad(V.getNode());
+}
+
  /// \brief Try to lower insertion of a single element into a zero vector.
  ///
  /// This is a common pattern that we have especially efficient patterns to lower
@@ -7745,59 +7778,71 @@ static SDValue lowerVectorShuffleAsElementInsertion(
      MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
      const X86Subtarget *Subtarget, SelectionDAG &DAG) {
    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
  
    int V2Index = std::find_if(Mask.begin(), Mask.end(),
                               [&Mask](int M) { return M >= (int)Mask.size(); }) -
                  Mask.begin();
-  if (Mask.size() == 2) {
-    if (!Zeroable[V2Index ^ 1]) {
-      // For 2-wide masks we may be able to just invert the inputs. We use an xor
-      // with 2 to flip from {2,3} to {0,1} and vice versa.
-      int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
-                            Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
-      if (Zeroable[V2Index])
-        return lowerVectorShuffleAsElementInsertion(VT, DL, V2, V1, InverseMask,
-                                                    Subtarget, DAG);
-      else
-        return SDValue();
+  bool IsV1Zeroable = true;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (i != V2Index && !Zeroable[i]) {
+      IsV1Zeroable = false;
+      break;
      }
-  } else {
-    for (int i = 0, Size = Mask.size(); i < Size; ++i)
-      if (i != V2Index && !Zeroable[i])
-        return SDValue(); // Not inserting into a zero vector.
-  }
-
-  // Step over any bitcasts on either input so we can scan the actual
-  // BUILD_VECTOR nodes.
-  while (V1.getOpcode() == ISD::BITCAST)
-    V1 = V1.getOperand(0);
-  while (V2.getOpcode() == ISD::BITCAST)
-    V2 = V2.getOperand(0);
  
    // Check for a single input from a SCALAR_TO_VECTOR node.
    // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
    // all the smarts here sunk into that routine. However, the current
    // lowering of BUILD_VECTOR makes that nearly impossible until the old
    // vector shuffle lowering is dead.
-  if (!((V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-         Mask[V2Index] == (int)Mask.size()) ||
-        V2.getOpcode() == ISD::BUILD_VECTOR))
+  if (SDValue V2S = getScalarValueForVectorElement(
+          V2, Mask[V2Index] - Mask.size(), DAG)) {
+    // We need to zext the scalar if it is smaller than an i32.
+    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+      // Using zext to expand a narrow element won't work for non-zero
+      // insertions.
+      if (!IsV1Zeroable)
+        return SDValue();
+
+      // Zero-extend directly to i32.
+      ExtVT = MVT::v4i32;
+      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+    }
+    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+             EltVT == MVT::i16) {
+    // Either not inserting from the low element of the input or the input
+    // element size is too small to use VZEXT_MOVL to clear the high bits.
      return SDValue();
+  }
  
-  SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+  if (!IsV1Zeroable) {
+    // If V1 can't be treated as a zero vector we have fewer options to lower
+    // this. We can't support integer vectors or non-zero targets cheaply, and
+    // the V1 elements can't be permuted in any way.
+    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+    if (!VT.isFloatingPoint() || V2Index != 0)
+      return SDValue();
+    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+    V1Mask[V2Index] = -1;
+    if (!isNoopShuffleMask(V1Mask))
+      return SDValue();
+    // This is essentially a special case blend operation, but if we have
+    // general purpose blend operations, they are always faster. Bail and let
+    // the rest of the lowering handle these as blends.
+    if (Subtarget->hasSSE41())
+      return SDValue();
  
-  // First, we need to zext the scalar if it is smaller than an i32.
-  MVT ExtVT = VT;
-  MVT EltVT = VT.getVectorElementType();
-  V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
-  if (EltVT == MVT::i8 || EltVT == MVT::i16) {
-    // Zero-extend directly to i32.
-    ExtVT = MVT::v4i32;
-    V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+    // Otherwise, use MOVSD or MOVSS.
+    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+           "Only two types of floating point element types to handle!");
+    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+                       ExtVT, V1, V2);
    }
  
-  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
-                   DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
    if (ExtVT != VT)
      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
  
@@ -7823,6 +7868,83 @@ static SDValue lowerVectorShuffleAsElementInsertion(
    return V2;
  }
  
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int M : Mask)
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  for (;;) {
+    switch (V.getOpcode()) {
+    case ISD::CONCAT_VECTORS: {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+      continue;
+    }
+
+    case ISD::INSERT_SUBVECTOR: {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+      if (!ConstantIdx)
+        break;
+
+      int BeginIdx = (int)ConstantIdx->getZExtValue();
+      int EndIdx =
+          BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+        BroadcastIdx -= BeginIdx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+      continue;
+    }
+    }
+    break;
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    V = V.getOperand(BroadcastIdx);
+
+    // If the scalar isn't a load we can't broadcast from it in AVX1, only with
+    // AVX2.
+    if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+      return SDValue();
+  } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
+    // We can't broadcast from a vector register w/o AVX2, and we can only
+    // broadcast from the zero-element of a vector register.
+    return SDValue();
+  }
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
  /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
  ///
  /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -7866,10 +7988,29 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
  
    // If we have a single input, insert that into V1 if we can do so cheaply.
-  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
      if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
              MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
        return Insertion;
+    // Try inverting the insertion since for v2 masks it is easy to do and we
+    // can't reliably sort the mask one way or the other.
+    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Try to use one of the special instruction patterns to handle two common
+  // blend patterns if a zero-blend above didn't work.
+  if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+      // We can either use a special instruction to load over the low double or
+      // to move just the low double.
+      return DAG.getNode(
+          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
+          DL, MVT::v2f64, V2,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
  
    if (Subtarget->hasSSE41())
      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
@@ -7899,6 +8040,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  
    if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
      // Straight shuffle of a single input vector. For everything from SSE2
      // onward this has a single fast instruction with no scary immediates.
      // We have to map the mask as it is actually a v4i32 shuffle instruction.
@@ -7912,18 +8058,26 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
    }
  
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(Mask, 0, 2))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(Mask, 1, 3))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
-
    // If we have a single input from V2 insert that into V1 if we can do so
    // cheaply.
-  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
      if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
              MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
        return Insertion;
+    // Try inverting the insertion since for v2 masks it is easy to do and we
+    // can't reliably sort the mask one way or the other.
+    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
  
    if (Subtarget->hasSSE41())
      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
@@ -8056,6 +8210,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
  
    if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
      if (Subtarget->hasAVX()) {
        // If we have AVX, we can use VPERMILPS which will allow folding a load
        // into the shuffle.
@@ -8152,10 +8311,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    ArrayRef<int> Mask = SVOp->getMask();
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
    int NumV2Elements =
        std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
  
    if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
      // Straight shuffle of a single input vector. For everything from SSE2
      // onward this has a single fast instruction with no scary immediates.
      // We coerce the shuffle pattern to be compatible with UNPCK instructions
@@ -8172,11 +8343,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }
  
-  // Whenever we can lower this as a zext, that instruction is strictly faster
-  // than any alternative.
-  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
                                                           Mask, Subtarget, DAG))
-    return ZExt;
+      return V;
  
    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -8184,12 +8355,6 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
  
-  // There are special ways we can lower some single-element blends.
-  if (NumV2Elements == 1)
-    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
-                                                         Mask, Subtarget, DAG))
-      return V;
-
    if (Subtarget->hasSSE41())
      if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                    Subtarget, DAG))
@@ -8252,6 +8417,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
    MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
    MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
@@ -8878,7 +9048,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // Try to use rotation instructions if available.
    if (Subtarget->hasSSSE3())
-    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v8i16, V1, V2, Mask, DAG))
        return Rotate;
  
    if (NumV1Inputs + NumV2Inputs <= 4)
@@ -9012,8 +9183,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // Try to use rotation instructions if available.
    if (Subtarget->hasSSSE3())
-    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2,
-                                                        OrigMask, DAG))
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v16i8, V1, V2, OrigMask, DAG))
        return Rotate;
  
    // Try to use a zext lowering.
@@ -9035,6 +9206,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    // For single-input shuffles, there are some nicer lowering tricks we can use.
    if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
      // Check whether we can widen this to an i16 shuffle by duplicating bytes.
      // Notably, this handles splat and partial-splat shuffles more efficiently.
      // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -9134,21 +9310,29 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    //
    // FIXME: We need to handle other interleaving widths (i16, i32, ...).
    if (shouldLowerAsInterleaving(Mask)) {
-    // FIXME: Figure out whether we should pack these into the low or high
-    // halves.
-
-    int EMask[16], OMask[16];
+    int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+      return (M >= 0 && M < 8) || (M >= 16 && M < 24);
+    });
+    int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+      return (M >= 8 && M < 16) || M >= 24;
+    });
+    int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+                     -1, -1, -1, -1, -1, -1, -1, -1};
+    int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+                     -1, -1, -1, -1, -1, -1, -1, -1};
+    bool UnpackLo = NumLoHalf >= NumHiHalf;
+    MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
+    MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
      for (int i = 0; i < 8; ++i) {
-      EMask[i] = Mask[2*i];
-      OMask[i] = Mask[2*i + 1];
-      EMask[i + 8] = -1;
-      OMask[i + 8] = -1;
+      TargetEMask[i] = Mask[2 * i];
+      TargetOMask[i] = Mask[2 * i + 1];
      }
  
      SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
      SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
  
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+    return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, Evens, Odds);
    }
  
    // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -9326,6 +9510,61 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    }
  }
  
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // If both elements are undef, its trivial.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+      WidenedMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
+      WidenedMask.push_back(Mask[i + 1] / 2);
+      continue;
+    }
+    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // When zeroing, we need to spread the zeroing across both lanes to widen.
+    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+        WidenedMask.push_back(SM_SentinelZero);
+        continue;
+      }
+      return false;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
+
+  return true;
+}
+
  /// \brief Generic routine to split ector shuffle into half-sized shuffles.
  ///
  /// This routine just extracts two subvectors, shuffles them independently, and
@@ -9439,6 +9678,43 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
  }
  
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> Mask,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  // Blends are faster and handle all the non-lane-crossing cases.
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                               VT.getVectorNumElements() / 2);
+  // Check for patterns which can be matched with a single insert of a 128-bit
+  // subvector.
+  if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
+      isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                              Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+                              DAG.getIntPtrConstant(2));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+
+  // Otherwise form a 128-bit permutation.
+  // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
+  unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
+  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, MVT::i8));
+}
+
  /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
  ///
  /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -9453,7 +9729,17 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    ArrayRef<int> Mask = SVOp->getMask();
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
+                                    DAG);
+
    if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
      if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
        // Non-half-crossing single input shuffles can be lowerid with an
        // interleaved permutation.
@@ -9533,10 +9819,20 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
    assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
  
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
+                                    DAG);
+
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
    // use lower latency instructions that will operate on both 128-bit lanes.
    SmallVector<int, 2> RepeatedMask;
@@ -9591,6 +9887,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                  Subtarget, DAG))
      return Blend;
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // If the shuffle mask is repeated in each 128-bit lane, we have many more
    // options to efficiently lower the shuffle.
    SmallVector<int, 4> RepeatedMask;
@@ -9664,6 +9965,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                  Subtarget, DAG))
      return Blend;
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // If the shuffle mask is repeated in each 128-bit lane we can use more
    // efficient instructions that mirror the shuffles across the two 128-bit
    // lanes.
@@ -9713,6 +10019,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
    assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
@@ -9778,6 +10089,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
    assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
  
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
    // There are no generalized cross-lane shuffle operations available on i8
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
@@ -10017,61 +10333,6 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
  }
  
-/// \brief Helper function to test whether a shuffle mask could be
-/// simplified by widening the elements being shuffled.
-///
-/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
-/// leaves it in an unspecified state.
-///
-/// NOTE: This must handle normal vector shuffle masks and *target* vector
-/// shuffle masks. The latter have the special property of a '-2' representing
-/// a zero-ed lane of a vector.
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
-                                    SmallVectorImpl<int> &WidenedMask) {
-  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
-    // If both elements are undef, its trivial.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
-      WidenedMask.push_back(SM_SentinelUndef);
-      continue;
-    }
-
-    // Check for an undef mask and a mask value properly aligned to fit with
-    // a pair of values. If we find such a case, use the non-undef mask's value.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
-      WidenedMask.push_back(Mask[i + 1] / 2);
-      continue;
-    }
-    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
-      WidenedMask.push_back(Mask[i] / 2);
-      continue;
-    }
-
-    // When zeroing, we need to spread the zeroing across both lanes to widen.
-    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
-      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
-          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
-        WidenedMask.push_back(SM_SentinelZero);
-        continue;
-      }
-      return false;
-    }
-
-    // Finally check if the two mask values are adjacent and aligned with
-    // a pair.
-    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
-      WidenedMask.push_back(Mask[i] / 2);
-      continue;
-    }
-
-    // Otherwise we can't safely widen the elements used in this shuffle.
-    return false;
-  }
-  assert(WidenedMask.size() == Mask.size() / 2 &&
-         "Incorrect size of mask after widening the elements!");
-
-  return true;
-}
-
  /// \brief Top-level lowering for x86 vector shuffles.
  ///
  /// This handles decomposition, canonicalization, and lowering of all x86
@@ -10115,20 +10376,25 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
          return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
        }
  
-  // For integer vector shuffles, try to collapse them into a shuffle of fewer
-  // lanes but wider integers. We cap this to not form integers larger than i64
-  // but it might be interesting to form i128 integers to handle flipping the
-  // low and high halves of AVX 256-bit vectors.
+  // Try to collapse shuffles into using a vector type with fewer elements but
+  // wider element types. We cap this to not form integers or floating point
+  // elements wider than 64 bits, but it might be interesting to form i128
+  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
    SmallVector<int, 16> WidenedMask;
-  if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+  if (VT.getScalarSizeInBits() < 64 &&
        canWidenShuffleElements(Mask, WidenedMask)) {
-    MVT NewVT =
-        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
-                         VT.getVectorNumElements() / 2);
-    V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+    MVT NewEltVT = VT.isFloatingPoint()
+                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    // Make sure that the new vector type is legal. For example, v2f64 isn't
+    // legal on SSE1.
+    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+      V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+      return DAG.getNode(ISD::BITCAST, dl, VT,
+                         DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+    }
    }
  
    int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
@@ -11353,37 +11619,6 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
    if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
      return SDValue();
  
-  // Simplify the operand as it's prepared to be fed into shuffle.
-  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
-  if (V1.getOpcode() == ISD::BITCAST &&
-      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0).getOperand(0)
-        .getSimpleValueType().getSizeInBits() == SignificantBits) {
-    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
-    ConstantSDNode *CIdx =
-      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
-    // If it's foldable, i.e. normal load with single use, we will let code
-    // selection to fold it. Otherwise, we will short the conversion sequence.
-    if (CIdx && CIdx->getZExtValue() == 0 &&
-        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      MVT FullVT = V.getSimpleValueType();
-      MVT V1VT = V1.getSimpleValueType();
-      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
-        // The "ext_vec_elt" node is wider than the result node.
-        // In this case we should extract subvector from V.
-        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
-        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
-                                        FullVT.getVectorNumElements()/Ratio);
-        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
-                        DAG.getIntPtrConstant(0));
-      }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
-    }
-  }
-
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
  }
@@ -11841,80 +12076,41 @@ static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
    SDValue Cond = Op.getOperand(0);
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
-  SDLoc DL(Op);
+  SDLoc dl(Op);
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
  
    // There is no blend with immediate in AVX-512.
    if (VT.is512BitVector())
      return SDValue();
  
-  // No blend instruction before SSE4.1.
-  if (!Subtarget->hasSSE41())
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
      return SDValue();
-  // There is no byte-blend immediate controlled instruction.
-  if (EltVT == MVT::i8)
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
      return SDValue();
  
    if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
      return SDValue();
  
-  auto *CondBV = cast<BuildVectorSDNode>(Cond);
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
  
-  unsigned BlendMask = 0;
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
    MVT BlendVT = VT;
-  if (VT == MVT::v16i16) {
-    // v16i16 blends are completely special. We can only do them when we have
-    // a repeated blend across the two 128-bit halves and we have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-
-    for (int i = 0; i < 8; ++i) {
-      SDValue Lo = CondBV->getOperand(i);
-      SDValue Hi = CondBV->getOperand(i + 8);
-      bool IsLoZero = X86::isZeroNode(Lo);
-      bool IsHiZero = X86::isZeroNode(Hi);
-      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
-          IsLoZero != IsHiZero)
-        // Asymmetric blends, bail.
-        return SDValue();
-      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
-    }
-  } else {
-    // Everything else uses a generic blend mask computation with a custom type.
-    if (VT.isInteger()) {
-      if (VT.is256BitVector()) {
-        // The 256-bit integer blend instructions are only available on AVX2.
-        if (!Subtarget->hasAVX2())
-          return SDValue();
-
-        // We do the blend on v8i32 for 256-bit integer types.
-        BlendVT = MVT::v8i32;
-      } else {
-        // For 128-bit vectors we do the blend on v8i16 types.
-        BlendVT = MVT::v8i16;
-      }
-    }
-    assert(BlendVT.getVectorNumElements() <= 8 &&
-           "Cannot blend more than 8 elements with an immediate!");
-    // Scale the blend mask based on the number of elements in the selected
-    // blend type.
-    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
-    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
-      SDValue CondElement = CondBV->getOperand(i);
-      if (CondElement->getOpcode() != ISD::UNDEF &&
-          X86::isZeroNode(CondElement))
-        for (int j = 0; j < Scale; ++j)
-          BlendMask |= 1u << (i * Scale + j);
-    }
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
    }
  
-  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
-  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
-                                 DAG.getConstant(BlendMask, MVT::i8)));
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
  }
  
  SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -13604,12 +13800,24 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
                                   In, DAG.getUNDEF(SVT)));
  }
  
-// The only differences between FABS and FNEG are the mask and the logic op.
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
  static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
    assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
           "Wrong opcode for lowering FABS or FNEG.");
  
    bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+  // If this is a FABS and it has an FNEG user, bail out to fold the combination
+  // into an FNABS. We'll lower the FABS after that if it is still in use.
+  if (IsFABS)
+    for (SDNode *User : Op->uses())
+      if (User->getOpcode() == ISD::FNEG)
+        return Op;
+
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+
    SDLoc dl(Op);
    MVT VT = Op.getSimpleValueType();
    // Assume scalar op for initialization; update for vector if needed.
@@ -13645,15 +13853,19 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      // For a vector, cast operands to a vector type, perform the logic op,
      // and cast the result back to the original value type.
      MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0));
      SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
-    unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR;
+    SDValue Operand = IsFNABS ?
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
      return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted));
+                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
    }
+  
    // If not vector, then scalar.
-  unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR;
-  return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask);
+  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
  }
  
  static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -13747,8 +13959,7 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
    return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
  }
  
-// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
-//
+// Check whether an OR'd tree is PTEST-able.
  static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
    assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
@@ -14156,6 +14367,37 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
    return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
  }
  
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  
+  // SSE1 has rsqrtss and rsqrtps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    RefinementSteps = 1;
+    UseOneConstNR = false;
+    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
  static bool isAllOnes(SDValue V) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
    return C && C->isAllOnesValue();
@@ -14940,13 +15182,32 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
  }
  
-static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
    MVT VT = Op->getSimpleValueType(0);
    SDValue In = Op->getOperand(0);
    MVT InVT = In.getSimpleValueType();
+  MVT VTElt = VT.getVectorElementType();
+  MVT InVTElt = InVT.getVectorElementType();
    SDLoc dl(Op);
  
+  // SKX processor
+  if ((InVTElt == MVT::i1) &&
+      (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget->hasBWI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+    
+       ((Subtarget->hasDQI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() >= 32))))
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    
    unsigned int NumElts = VT.getVectorNumElements();
+
    if (NumElts != 8 && NumElts != 16)
      return SDValue();
  
@@ -14979,7 +15240,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
    SDLoc dl(Op);
  
    if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
-    return LowerSIGN_EXTEND_AVX512(Op, DAG);
+    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
  
    if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
        (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
@@ -15898,19 +16159,39 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
    return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
  }
  
-/// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
  /// necessary casting for \p Mask when lowering masking intrinsics.
  static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                      SDValue PreservedSrc, SelectionDAG &DAG) {
      EVT VT = Op.getValueType();
      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
                                    MVT::i1, VT.getVectorNumElements());
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
      SDLoc dl(Op);
  
      assert(MaskVT.isSimple() && "invalid mask type");
-    return DAG.getNode(ISD::VSELECT, dl, VT,
-                       DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask),
-                       Op, PreservedSrc);
+
+    if (isAllOnes(Mask))
+      return Op;
+
+    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+    // are extracted by EXTRACT_SUBVECTOR.
+    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                              DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                              DAG.getIntPtrConstant(0));
+
+    switch (Op.getOpcode()) {
+      default: break;
+      case X86ISD::PCMPEQM:
+      case X86ISD::PCMPGTM:
+      case X86ISD::CMPM:
+      case X86ISD::CMPMU:
+        return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+    }
+
+    return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
  }
  
  static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
@@ -15976,6 +16257,39 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      case INTR_TYPE_3OP:
        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
          Op.getOperand(2), Op.getOperand(3));
+    case CMP_MASK:
+    case CMP_MASK_CC: {
+      // Comparison intrinsics with masks.
+      // Example of transformation:
+      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+      // (i8 (bitcast
+      //   (v8i1 (insert_subvector undef,
+      //           (v2i1 (and (PCMPEQM %a, %b),
+      //                      (extract_subvector
+      //                         (v8i1 (bitcast %mask)), 0))), 0))))
+      EVT VT = Op.getOperand(1).getValueType();
+      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    VT.getVectorNumElements());
+      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+      EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                       Mask.getValueType().getSizeInBits());
+      SDValue Cmp;
+      if (IntrData->Type == CMP_MASK_CC) {
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                    Op.getOperand(2), Op.getOperand(3));
+      } else {
+        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                    Op.getOperand(2));
+      }
+      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+                                        DAG.getTargetConstant(0, MaskVT), DAG);
+      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                DAG.getUNDEF(BitcastVT), CmpMask,
+                                DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
      case COMI: { // Comparison intrinsics
        ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
        SDValue LHS = Op.getOperand(1);
@@ -17904,10 +18218,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
      Cond = X86::COND_B;
      break;
    case ISD::SMULO:
-    BaseOp = X86ISD::SMUL;
+    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
      Cond = X86::COND_O;
      break;
    case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    if (N->getValueType(0) == MVT::i8) {
+      BaseOp = X86ISD::UMUL8;
+      Cond = X86::COND_O;
+      break;
+    }
      SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                   MVT::i32);
      SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
@@ -22314,7 +22633,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                            DCI.isBeforeLegalizeOps());
      if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
-        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+        (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                  TLO) &&
+         // Don't optimize vector of constants. Those are handled by
+         // the generic code and all the bits must be properly set for
+         // the generic optimizer.
+         !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
        DCI.CommitTargetLoweringOpt(TLO);
    }
  
@@ -24420,18 +24744,68 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
  
  /// performVZEXTCombine - Performs build vector combines
  static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        const X86Subtarget *Subtarget) {
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N->getSimpleValueType(0);
+  SDValue Op = N->getOperand(0);
+  MVT OpVT = Op.getSimpleValueType();
+  MVT OpEltVT = OpVT.getVectorElementType();
+  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+
    // (vzext (bitcast (vzext (x)) -> (vzext x)
-  SDValue In = N->getOperand(0);
-  while (In.getOpcode() == ISD::BITCAST)
-    In = In.getOperand(0);
+  SDValue V = Op;
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
  
-  if (In.getOpcode() != X86ISD::VZEXT)
-    return SDValue();
+  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
+    MVT InnerVT = V.getSimpleValueType();
+    MVT InnerEltVT = InnerVT.getVectorElementType();
+
+    // If the element sizes match exactly, we can just do one larger vzext. This
+    // is always an exact type match as vzext operates on integer types.
+    if (OpEltVT == InnerEltVT) {
+      assert(OpVT == InnerVT && "Types must match for vzext!");
+      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+    }
+
+    // The only other way we can combine them is if only a single element of the
+    // inner vzext is used in the input to the outer vzext.
+    if (InnerEltVT.getSizeInBits() < InputBits)
+      return SDValue();
  
-  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
-                     In.getOperand(0));
+    // In this case, the inner vzext is completely dead because we're going to
+    // only look at bits inside of the low element. Just do the outer vzext on
+    // a bitcast of the input to the inner.
+    return DAG.getNode(X86ISD::VZEXT, DL, VT,
+                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
+  }
+
+  // Check if we can bypass extracting and re-inserting an element of an input
+  // vector. Essentialy:
+  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
+    SDValue ExtractedV = V.getOperand(0);
+    SDValue OrigV = ExtractedV.getOperand(0);
+    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
+      if (ExtractIdx->getZExtValue() == 0) {
+        MVT OrigVT = OrigV.getSimpleValueType();
+        // Extract a subvector if necessary...
+        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+                                    OrigVT.getVectorNumElements() / Ratio);
+          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+                              DAG.getIntPtrConstant(0));
+        }
+        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+      }
+  }
+
+  return SDValue();
  }
  
  SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,