[SKX] Enabling mask instructions: encoding, lowering

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 10538ffc27f14ed746a4f96aedc44cb280a6b38f..cba145208da63a4ff91e9bde0550dd7c376bb261 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -58,6 +58,12 @@ using namespace llvm;
  
  STATISTIC(NumTailCalls, "Number of tail calls");
  
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(false),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
  static cl::opt<bool> ExperimentalVectorShuffleLowering(
      "x86-experimental-vector-shuffle-lowering", cl::init(false),
      cl::desc("Enable an experimental vector shuffle lowering code path."),
@@ -509,6 +515,25 @@ void X86TargetLowering::resetOperationActions() {
      }
    }
  
+  // Special handling for half-precision floating point conversions.
+  // If we don't have F16C support, then lower half float conversions
+  // into library calls.
+  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+  }
+
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
    if (Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    } else {
@@ -864,7 +889,12 @@ void X86TargetLowering::resetOperationActions() {
                            (MVT::SimpleValueType)InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+
+    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
+    // we have to deal with them whether we ask for Expansion or not. Setting
+    // Expand causes its own optimisation problems though, so leave them legal.
+    if (VT.getVectorElementType() == MVT::i1)
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    }
  
    // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -1475,6 +1505,11 @@ void X86TargetLowering::resetOperationActions() {
      }
    }// has  AVX-512
  
+  if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+  }
+
    // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
    // of this type with custom code.
    for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1588,6 +1623,16 @@ void X86TargetLowering::resetOperationActions() {
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
  EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    if (!VT.isVector())
      return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -2272,6 +2317,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
+      else if (RegVT == MVT::v32i1)
+        RC = &X86::VK32RegClass;
+      else if (RegVT == MVT::v64i1)
+        RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");
  
@@ -3019,7 +3068,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  //  If a tail called function callee has more arguments than the caller the
  //  caller needs to make sure that there is room to move the RETADDR to. This is
  //  achieved by reserving an area the size of the argument delta right after the
-//  original REtADDR, but before the saved framepointer or the spilled registers
+//  original RETADDR, but before the saved framepointer or the spilled registers
  //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
  //  stack layout:
  //    arg1
@@ -4727,28 +4776,6 @@ bool X86::isZeroNode(SDValue Elt) {
    return false;
  }
  
-/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
-/// their permute mask.
-static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
-                                    SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> MaskVec;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = SVOp->getMaskElt(i);
-    if (Idx >= 0) {
-      if (Idx < (int)NumElems)
-        Idx += NumElems;
-      else
-        Idx -= NumElems;
-    }
-    MaskVec.push_back(Idx);
-  }
-  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
-                              SVOp->getOperand(0), &MaskVec[0]);
-}
-
  /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
  /// match movhlps. The lower half elements should come from upper half of
  /// V1 (and in order), and the upper half elements should come from the upper
@@ -4834,19 +4861,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
    return true;
  }
  
-/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same.
-static bool isSplatVector(SDNode *N) {
-  if (N->getOpcode() != ISD::BUILD_VECTOR)
-    return false;
-
-  SDValue SplatValue = N->getOperand(0);
-  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
-    if (N->getOperand(i) != SplatValue)
-      return false;
-  return true;
-}
-
  /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
  /// to an zero vector.
  /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
@@ -5755,18 +5769,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
        return SDValue();
  
      case ISD::BUILD_VECTOR: {
-      // The BUILD_VECTOR node must be a splat.
-      if (!isSplatVector(Op.getNode()))
+      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+      BitVector UndefElements;
+      SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+      // We need a splat of a single value to use broadcast, and it doesn't
+      // make any sense if the value is only in one element of the vector.
+      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
          return SDValue();
  
-      Ld = Op.getOperand(0);
+      Ld = Splat;
        ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
-                     Ld.getOpcode() == ISD::ConstantFP);
+                       Ld.getOpcode() == ISD::ConstantFP);
  
-      // The suspected load node has several users. Make sure that all
-      // of its users are from the BUILD_VECTOR node.
-      // Constants may have multiple users.
-      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+      // Make sure that all of the users of a non-constant load are from the
+      // BUILD_VECTOR node.
+      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
          return SDValue();
        break;
      }
@@ -7679,6 +7697,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isSingleInputShuffleMask(Mask)) {
      // Check whether we can widen this to an i16 shuffle by duplicating bytes.
      // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
      //
      // FIXME: We should check for other patterns which can be widened into an
      // i16 shuffle as well.
@@ -7689,7 +7710,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        }
        return true;
      };
-    if (canWidenViaDuplication(Mask)) {
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
        SmallVector<int, 4> LoInputs;
        std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                     [](int M) { return M >= 0 && M < 8; });
@@ -7707,52 +7730,57 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
        ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
  
-      int ByteMask[16];
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
        SmallDenseMap<int, int, 8> LaneMap;
-      for (int i = 0; i < 16; ++i)
-        ByteMask[i] = -1;
        for (int I : InPlaceInputs) {
-        ByteMask[I] = I;
+        PreDupI16Shuffle[I/2] = I/2;
          LaneMap[I] = I;
        }
-      int FreeByteIdx = 0;
-      int TargetOffset = TargetLo ? 0 : 8;
-      for (int I : MovingInputs) {
-        // Walk the free index into the byte mask until we find an unoccupied
-        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
-        // principle indicates that there *must* be a spot as we can only have
-        // 8 duplicated inputs. We have to walk the index using modular
-        // arithmetic to wrap around as necessary.
-        // FIXME: We could do a much better job of picking an inexpensive slot
-        // so this doesn't go through the worst case for the byte shuffle.
-        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
-             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
-          ;
-        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
-               "Failed to find a free byte!");
-        ByteMask[FreeByteIdx + TargetOffset] = I;
-        LaneMap[I] = FreeByteIdx + TargetOffset;
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
        }
-      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
-                                ByteMask);
-      for (int &M : Mask)
-        if (M != -1)
-          M = LaneMap[M];
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
  
        // Unpack the bytes to form the i16s that will be shuffled into place.
        V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                         MVT::v16i8, V1, V1);
  
-      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
        for (int i = 0; i < 16; i += 2) {
          if (Mask[i] != -1)
-          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
-        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
        }
-      return DAG.getVectorShuffle(MVT::v8i16, DL,
-                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
-                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
-    }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
    }
  
    // Check whether an interleaving lowering is likely to be more efficient.
@@ -7777,19 +7805,6 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
    }
-  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
-  SDValue LoV1 =
-      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                  DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, Zero));
-  SDValue HiV1 =
-      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                  DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, Zero));
-  SDValue LoV2 =
-      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                  DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V2, Zero));
-  SDValue HiV2 =
-      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                  DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V2, Zero));
  
    int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
@@ -7811,10 +7826,49 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
    buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
  
-  SDValue V1Lo = DAG.getVectorShuffle(MVT::v8i16, DL, LoV1, HiV1, V1LoBlendMask);
-  SDValue V2Lo = DAG.getVectorShuffle(MVT::v8i16, DL, LoV2, HiV2, V2LoBlendMask);
-  SDValue V1Hi = DAG.getVectorShuffle(MVT::v8i16, DL, LoV1, HiV1, V1HiBlendMask);
-  SDValue V2Hi = DAG.getVectorShuffle(MVT::v8i16, DL, LoV2, HiV2, V2HiBlendMask);
+  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+                             MutableArrayRef<int> HiBlendMask) {
+    SDValue V1, V2;
+    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+    // them out and avoid using UNPCK{L,H} to extract the elements of V as
+    // i16s.
+    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
+        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; })) {
+      // Use a mask to drop the high bytes.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+                       DAG.getConstant(0x00FF, MVT::v8i16));
+
+      // This will be a single vector shuffle instead of a blend so nuke V2.
+      V2 = DAG.getUNDEF(MVT::v8i16);
+
+      // Squash the masks to point directly into V1.
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half into
+      // V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+    return std::make_pair(BlendedLo, BlendedHi);
+  };
+  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
  
    SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
    SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
@@ -7885,7 +7939,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // but in some cases the first operand may be transformed to UNDEF.
    // In this case we should just commute the node.
    if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // Check for non-undef masks pointing at an undef vector and make the masks
    // undef as well. This makes it easier to match the shuffle based solely on
@@ -7931,7 +7985,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // V2. This allows us to match the shuffle pattern strictly on how many
    // elements come from V1 without handling the symmetric cases.
    if (NumV2Elements > NumV1Elements)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // When the number of V1 and V2 elements are the same, try to minimize the
    // number of uses of V2 in the low half of the vector.
@@ -7943,7 +7997,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
        else if (M >= 0)
          ++LowV1Elements;
      if (LowV2Elements > LowV1Elements)
-      return CommuteVectorShuffle(SVOp, DAG);
+      return DAG.getCommutedVectorShuffle(*SVOp);
    }
  
    // For each vector width, delegate to a specialized lowering routine.
@@ -9227,7 +9281,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // but in some cases the first operand may be transformed to UNDEF.
    // In this case we should just commute the node.
    if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // Vector shuffle lowering takes 3 steps:
    //
@@ -9339,7 +9393,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    if (ShouldXformToMOVHLPS(M, VT) ||
        ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    if (isShift) {
      // No better options. Use a vshldq / vsrldq.
@@ -9351,8 +9405,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool Commuted = false;
    // FIXME: This should also accept a bitcast of a splat?  Be careful, not
    // 1,1,1,1 -> v8i16 though.
-  V1IsSplat = isSplatVector(V1.getNode());
-  V2IsSplat = isSplatVector(V2.getNode());
+  BitVector UndefElements;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V1IsSplat = true;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V2IsSplat = true;
  
    // Canonicalize the splat or undef, if present, to be on the RHS.
    if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
@@ -9406,7 +9465,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // Normalize the node to match x86 shuffle ops if needed
    if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // The checks below are all present in isShuffleMaskLegal, but they are
    // inlined here right now to enable us to directly emit target specific
@@ -15084,7 +15143,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
    CLI.setDebugLoc(dl).setChain(InChain)
      .setCallee(getLibcallCallingConv(LC),
                 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
-               Callee, &Args, 0)
+               Callee, std::move(Args), 0)
      .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  
    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -15100,10 +15159,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
           (VT == MVT::v8i32 && Subtarget->hasInt256()));
  
-  // Get the high parts.
-  const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
-  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
-  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widened result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other words, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, we need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
  
    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
@@ -15111,16 +15183,41 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
    unsigned Opcode =
        (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
    SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                               DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
    SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
  
    // Shuffle it back into the right order.
-  const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
-  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-  const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
-  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  // The internal representation is big endian.
+  // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0
+  // and its low part at index 1.
+  // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh>
+  // Vector index                0 1   ;          2 3
+  // We want      <ae|bf|cg|dh>
+  // Vector index   0  2  1  3
+  // Since each element is seen as 2 x i32, we get:
+  // high_mask[i] = 2 x vector_index[i]
+  // low_mask[i] = 2 x vector_index[i] + 1
+  // where vector_index = {0, Size/2, 1, Size/2 + 1, ...,
+  //                       Size/2 - 1, Size/2 + Size/2 - 1}
+  // where Size is the number of elements of the final vector.
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {0, 4, 2, 6};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 5, 3, 7};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
  
    // If we have a signed multiply but no PMULDQ fix up the high parts of a
    // unsigned multiply.
@@ -15136,7 +15233,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
      Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
    }
  
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The low part of a MUL_LOHI is supposed to be the first value and the
+  // high part the second value.
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs);
  }
  
  static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -15147,10 +15246,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
    SDValue Amt = Op.getOperand(1);
  
    // Optimize shl/srl/sra with constant shift amount.
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      uint64_t ShiftAmt = C->getZExtValue();
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
  
        if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
            (Subtarget->hasInt256() &&
@@ -15457,15 +15555,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
  
  static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                            SelectionDAG &DAG) {
-
    MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
    SDValue R = Op.getOperand(0);
    SDValue Amt = Op.getOperand(1);
    SDValue V;
  
-  if (!Subtarget->hasSSE2())
-    return SDValue();
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
  
    V = LowerScalarImmediateShift(Op, DAG, Subtarget);
    if (V.getNode())
@@ -16084,7 +16181,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
  
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
  
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  
@@ -16360,6 +16457,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      Results.push_back(EFLAGS.getValue(1));
      return;
    }
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by X86AtomicExpand.cpp.
+    break;
    case ISD::ATOMIC_LOAD: {
      ReplaceATOMIC_LOAD(N, Results, DAG);
      return;
@@ -16380,6 +16491,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                     MVT::v2f64, N->getOperand(0));
      SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
  
+    if (ExperimentalVectorWideningLegalization) {
+      // If we are legalizing vectors by widening, we already have the desired
+      // legal vector type, just return it.
+      Results.push_back(ToVecInt);
+      return;
+    }
+
      SmallVector<SDValue, 8> Elts;
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
@@ -16765,6 +16883,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isPSHUFDMask(M, SVT) ||
            isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -18409,6 +18528,39 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
          return false;
  
        continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
      }
      // Break out of the loop if we break out of the switch.
      break;
@@ -18425,9 +18577,16 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
    for (int &M : Mask)
      M = VMask[M];
-  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                    getV4X86ShuffleImm8ForMask(Mask, DAG));
  
+  // It is possible that one of the combinable shuffles was completely absorbed
+  // by the other, just replace it and revisit all users in that case.
+  if (Old.getNode() == V.getNode()) {
+    DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+    return true;
+  }
+
    // Replace N with its operand as we're going to combine that shuffle away.
    DAG.ReplaceAllUsesWith(N, N.getOperand(0));
  
@@ -18571,6 +18730,47 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
        return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
      }
  
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
+        }
+      }
+    }
+
      break;
  
    case X86ISD::PSHUFD:
@@ -18592,49 +18792,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
  
-  // Canonicalize shuffles that perform 'addsub' on packed float vectors
-  // according to the rule:
-  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
-  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
-  //
-  // Where 'Mask' is:
-  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
-  //  <0,3>                 -- for v2f64 shuffles;
-  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
-  //
-  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
-  // during ISel stage.
-  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
-      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
-      // Operands to the FADD and FSUB must be the same.
-      ((N0->getOperand(0) == N1->getOperand(0) &&
-        N0->getOperand(1) == N1->getOperand(1)) ||
-       // FADD is commutable. See if by commuting the operands of the FADD
-       // we would still be able to match the operands of the FSUB dag node.
-       (N0->getOperand(1) == N1->getOperand(0) &&
-        N0->getOperand(0) == N1->getOperand(1))) &&
-      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
-      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
-    
-    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
-    unsigned NumElts = VT.getVectorNumElements();
-    ArrayRef<int> Mask = SV->getMask();
-    bool CanFold = true;
-
-    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
-      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
-
-    if (CanFold) {
-      SDValue Op0 = N1->getOperand(0);
-      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
-      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
-      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
-      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
-    }
-  }
-
    // Don't create instructions with illegal types after legalize types has run.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -19372,28 +19529,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
            Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
  
-      // If the RHS is a constant we have to reverse the const canonicalization.
-      // x > C-1 ? x+-C : 0 --> subus x, C
-      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
-          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (CondRHS.getConstantOperandVal(0) == -A-1)
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
-                             DAG.getConstant(-A, VT));
-      }
-
-      // Another special case: If C was a sign bit, the sub has been
-      // canonicalized into a xor.
-      // FIXME: Would it be better to use computeKnownBits to determine whether
-      //        it's safe to decanonicalize the xor?
-      // x s< 0 ? x^C : 0 --> subus x, C
-      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
-          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
-          isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (A.isSignBit())
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
-      }
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          //        whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+        }
      }
    }
  
@@ -20101,16 +20264,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
    // vector operations in many cases. Also, on sandybridge ADD is faster than
    // shl.
    // (shl V, 1) -> add V,V
-  if (isSplatVector(N1.getNode())) {
-    assert(N0.getValueType().isVector() && "Invalid vector shift type");
-    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
-    // We shift all of the values by one. In many cases we do not have
-    // hardware support for this operation. This is better expressed as an ADD
-    // of two values.
-    if (N1C && (1 == N1C->getZExtValue())) {
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+      assert(N0.getValueType().isVector() && "Invalid vector shift type");
+      // We shift all of the values by one. In many cases we do not have
+      // hardware support for this operation. This is better expressed as an ADD
+      // of two values.
+      if (N1SplatC->getZExtValue() == 1)
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
      }
-  }
  
    return SDValue();
  }
@@ -20129,10 +20291,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
  
    SDValue Amt = N->getOperand(1);
    SDLoc DL(N);
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      APInt ShiftAmt = C->getAPIntValue();
+  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+      APInt ShiftAmt = AmtSplat->getAPIntValue();
        unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
  
        // SSE2/AVX2 logical shifts always return a vector of 0s
@@ -20142,7 +20303,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
        if (ShiftAmt.trunc(8).uge(MaxAmount))
          return getZeroVector(VT, Subtarget, DAG, DL);
      }
-  }
  
    return SDValue();
  }
@@ -20336,9 +20496,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
  
    // The right side has to be a 'trunc' or a constant vector.
    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
-  bool RHSConst = (isSplatVector(N1.getNode()) &&
-                   isa<ConstantSDNode>(N1->getOperand(0)));
-  if (!RHSTrunc && !RHSConst)
+  ConstantSDNode *RHSConstSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+    RHSConstSplat = RHSBV->getConstantSplatNode();
+  if (!RHSTrunc && !RHSConstSplat)
      return SDValue();
  
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -20348,9 +20509,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
  
    // Set N0 and N1 to hold the inputs to the new wide operation.
    N0 = N0->getOperand(0);
-  if (RHSConst) {
+  if (RHSConstSplat) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
-                     N1->getOperand(0));
+                     SDValue(RHSConstSplat, 0));
      SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
      N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
    } else if (RHSTrunc) {
@@ -20496,12 +20657,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
        unsigned SraAmt = ~0;
        if (Mask.getOpcode() == ISD::SRA) {
-        SDValue Amt = Mask.getOperand(1);
-        if (isSplatVector(Amt.getNode())) {
-          SDValue SclrAmt = Amt->getOperand(0);
-          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
-            SraAmt = C->getZExtValue();
-        }
+        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+          if (auto *AmtConst = AmtBV->getConstantSplatNode())
+            SraAmt = AmtConst->getZExtValue();
        } else if (Mask.getOpcode() == X86ISD::VSRAI) {
          SDValue SraC = Mask.getOperand(1);
          SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
@@ -21638,8 +21796,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation or if the operand of the
+  // unary operation isn't a bitwise AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
    SDValue Op0 = N->getOperand(0);
    EVT InVT = Op0->getValueType(0);