setOperationAction(ISD::XOR, MVT::v4i32, Legal);
}
- // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
- // of this type with custom code.
- for (MVT VT : MVT::vector_valuetypes())
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
-
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// On Darwin, -Os means optimize for size without hurting performance, so
// do not reduce the limit.
return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
- return TargetLowering::findRepresentativeClass(VT);
+ return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs =
- CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
- unsigned NumXMMRegs =
- CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
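+ // (For example, a Win64 caller reserves 32 bytes of shadow space on the
+ // stack for the callee, which a SysV function knows nothing about.)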
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return true;
}
-/// \brief Base case helper for testing a single mask element.
-static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
- BuildVectorSDNode *BV1,
- BuildVectorSDNode *BV2, ArrayRef<int> Mask,
- int i, int Arg) {
- int Size = Mask.size();
- if (Mask[i] != -1 && Mask[i] != Arg) {
- auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
- auto *ArgsBV = Arg < Size ? BV1 : BV2;
- if (!MaskBV || !ArgsBV ||
- MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
- return false;
- }
- return true;
-}
-
-/// \brief Recursive helper to peel off and test each mask element.
-template <typename... Ts>
-static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
- BuildVectorSDNode *BV1,
- BuildVectorSDNode *BV2, ArrayRef<int> Mask,
- int i, int Arg, Ts... Args) {
- if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
- return false;
-
- return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
-}
-
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
-/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-template <typename... Ts>
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
- Ts... Args) {
- if (Mask.size() != sizeof...(Args))
+ ArrayRef<int> ExpectedMask) {
+ if (Mask.size() != ExpectedMask.size())
return false;
+ int Size = Mask.size();
+
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
- // Recursively peel off arguments and test them against the mask.
- return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (!MaskBV || !ExpectedBV ||
+ MaskBV->getOperand(Mask[i] % Size) !=
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ return false;
+ }
+
+ return true;
}
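// For instance, with ExpectedMask = {0, 2, 1, 3}, the mask {0, -1, 1, 3} is
// equivalent: the -1 element is undef and matches anything, and the rest
// match exactly. With build_vector inputs, Mask[i] == 0 can even match
// ExpectedMask[i] == 4 when operand 0 of V1 and operand 0 of V2 are the same
// scalar SDValue.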
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
+ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
if (isSingleInputShuffleMask(Mask)) {
// Use low duplicate instructions for masks that match their pattern.
if (Subtarget->hasSSE3())
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
// Straight shuffle of a single input vector. Simulate this by using the
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
}
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
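// E.g. the mask {1, 3} produces SHUFPDMask == 3: take the high element of V1
// for lane 0 and the high element of V2 for lane 1.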
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
// Try to use byte rotation instructions.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget->hasSSE3()) {
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return V;
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
// Otherwise fall back to a SHUFPS lowering strategy.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return V;
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
// Try to use byte rotation instructions.
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
return Unpack;
// We implement this with SHUFPS because it can blend from two vectors.
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
- SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
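+ // For example, a lane-repeating v16i16 mask such as
+ // <1,0,3,2,5,4,7,6, 9,8,11,10,13,12,15,14> is handled by passing in the
+ // collapsed 8-element mask <1,0,3,2,5,4,7,6>.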
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
- Mask, Subtarget, DAG))
- return Broadcast;
-
- // Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
- return Shift;
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
- // Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
- return Rotate;
-
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// Adjust the mask to match the new locations of A and B.
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- Mask);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// At this point, each half should contain all its inputs, and we can then
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DAG));
// Do a half shuffle with the high mask after shifting its values down.
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DAG));
return V;
int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
- if (NumV2Inputs == 0)
- return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+ if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
+ }
assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
"All single-input shuffles should be canonicalized to be V1-input "
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return V;
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
// Try to use byte rotation instructions.
return BitBlend;
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 2, 18, 3, 19,
- 4, 20, 5, 21, 6, 22, 7, 23))
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // High half.
+ 4, 20, 5, 21, 6, 22, 7, 23}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- 8, 24, 9, 25, 10, 26, 11, 27,
- 12, 28, 13, 29, 14, 30, 15, 31))
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 8, 24, 9, 25, 10, 26, 11, 27,
+ // High half.
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return V;
VT.getVectorNumElements() / 2);
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
- if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
// If we have a single input to the zero element, insert that into V1 if we
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (isSingleInputShuffleMask(Mask))
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
}
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- // First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27))
+ {// First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- // First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31))
+ {// First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
// Try to use shift instructions.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
// 256-bit lanes.
if (isShuffleEquivalent(
V1, V2, Mask,
- // First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+ {// First 128-bit lane:
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ // Second 128-bit lane:
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
if (isShuffleEquivalent(
V1, V2, Mask,
- // First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+ {// First 128-bit lane:
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+ // Second 128-bit lane:
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
// Try to use shift instructions.
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
// FIXME: Implement direct support for this type!
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
// FIXME: Implement direct support for this type!
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
// FIXME: Implement direct support for this type!
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
// FIXME: Implement direct support for this type!
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
if (!Subtarget->hasSSE41())
return SDValue();
- // Some types for vselect were previously set to Expand, not Legal or
- // Custom. Return an empty SDValue so we fall-through to Expand, after
- // the Custom lowering phase.
- MVT VT = Op.getSimpleValueType();
- switch (VT.SimpleTy) {
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op, but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
default:
- break;
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
case MVT::v8i16:
case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX())
- break;
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
return SDValue();
}
-
- // We couldn't create a "Blend with immediate" node.
- // This node should still be legal, but we'll have to emit a blendv*
- // instruction.
- return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- SDValue RoundingMode = Op.getOperand(5);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode),
+ RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
DAG);
}
- if (VT == MVT::v16i8) {
- if (Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
- }
- llvm_unreachable("Unknown shift opcode.");
- }
+ if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
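+ // There is no byte-element shift instruction, so v16i8/v32i8 shifts are
+ // emulated with a v8i16/v16i16 shift plus a fix-up of the bits that cross
+ // byte boundaries.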
- if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(NumElts,
+ DAG.getConstant(128 >> ShiftAmt, MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
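// For example, with i8 elements and ShiftAmt == 4, m is 128 >> 4 == 0x08;
// for R == 0xF0 (-16) this computes (0x0F ^ 0x08) - 0x08 == 0x07 - 0x08 == -1,
// i.e. 0xFF, which is exactly 0xF0 s>> 4.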
Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
- }
+ }
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (unsigned i = 0; i != NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (unsigned i = NumElems/2; i != NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- MVT VT = Op.getSimpleValueType();
-
- if (!Subtarget->hasSSE2() || !VT.isVector())
- return SDValue();
-
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
-
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasFp256())
- return SDValue();
- if (!Subtarget->hasInt256()) {
- // needs to be split
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
- }
- // fall through
- case MVT::v4i32:
- case MVT::v8i16: {
- SDValue Op0 = Op.getOperand(0);
-
- // This is a sign extension of some low part of vector elements without
- // changing the size of the vector elements themselves:
- // Shift-Left + Shift-Right-Algebraic.
- SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
- BitsDiff, DAG);
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
- DAG);
- }
- }
-}
-
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
- if (FloatDomain) {
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.getSizeInBits() == 128) {
if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
bool Lo = Mask.equals(0, 0);
unsigned Shuffle;
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain &&
+ if (!FloatDomain && VT.getSizeInBits() == 128 &&
(Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
// in practice PSHUFB tends to be *very* fast so we're more aggressive.
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
- assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
- int Ratio = 16 / Mask.size();
- for (unsigned i = 0; i < 16; ++i) {
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
+ for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
: 255;
PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
}
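// E.g. for a v8i16 shuffle Ratio is 2, so mask element M expands to the byte
// pair {2*M, 2*M+1}; zeroed elements become 255, whose high bit makes PSHUFB
// write a zero byte.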
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
DCI.AddToWorklist(Op.getNode());
SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
- // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
- // version should be added.
- if (VT.getSizeInBits() != 128)
- return false;
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
+ // If we have more than 128-bits, only the low 128-bits of shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
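+ // E.g. a v8i32 PSHUFD mask <2,3,0,1, 6,7,4,5> trims down to <2,3,0,1>, the
+ // lane-sized mask that callers actually inspect.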
+
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+ V.getSimpleValueType().getScalarType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT == MVT::v8i16);
- (void)VT;
+ assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
DCI.AddToWorklist(V.getNode());
- V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DAG));
DCI.AddToWorklist(V.getNode());
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
std::equal(std::begin(MappedMask), std::end(MappedMask),
std::begin(UnpackHiMask))) {
// We can replace all three shuffles with an unpack.
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
- DL, MVT::v8i16, V, V);
+ DL, VT, V, V);
}
}
}
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+ if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
// Only specific types are legal at this point, assert so we notice if and
}
}
- // Only handle 128 wide vector from here on.
- if (!VT.is128BitVector())
- return SDValue();
-
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
}
}
- // If we know that this node is legal then we know that it is going to be
- // matched by one of the SSE/AVX BLEND instructions. These instructions only
- // depend on the highest bit in each word. Try to use SimplifyDemandedBits
- // to simplify previous instructions.
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against SSE4.1, v8i16 and v16i16 because, although
- // vselect nodes may be marked as Custom, they might only be legal when
- // Cond is a build_vector of constants. This will be taken care in
- // a later condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) &&
- Subtarget->hasSSE41() && VT != MVT::v16i16 && VT != MVT::v8i16) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
if (BitWidth == 1)
return SDValue();
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getScalarType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2.
+ if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+ !Subtarget->hasAVX2())
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
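// For a v4i32 condition, e.g., BitWidth is 32 and DemandedMask is
// 0x80000000; the variable blend instructions (BLENDVPS & co.) read only the
// sign bit of each condition element.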
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
return SDValue();
}
if (MayFoldLoad(Ld)) {
// Extract the countS bits from the immediate so we can get the proper
// address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, interps doesn't use
+ // When the second source op is a memory address, insertps doesn't use
// countS and just gets an f32 from that address.
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
- } else
- return SDValue();
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+ }
+ return SDValue();
+}
+
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
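+ // Concretely, BLENDPD with mask 2 takes element 0 from V0 and element 1
+ // from V1; after swapping the operands, mask 1 selects exactly the same
+ // elements.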
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
}
// Helper function of PerformSETCCCombine. It materializes "setb reg"
return PerformINSERTPSCombine(N, DAG, Subtarget);
break;
}
+ case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ const std::string &Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {