From bdecfeb7237dc66d0a1977617fab627c5afc2ed3 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Thu, 25 Sep 2014 00:24:19 +0000 Subject: [PATCH] [x86] Implement v16i16 support with AVX2 in the new vector shuffle lowering. This also implements the fancy blend lowering for v16i16 using AVX2 and teaches the X86 backend to print shuffle masks for 256-bit PSHUFB and PBLENDW instructions. It also makes the mask decoding correct for PBLENDW instructions. The yaks, they are legion. Tests are updated accordingly. There are some missing tests for the VBLENDVB lowering, but I'll add those in a follow-up as this commit has accumulated enough cruft already. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218430 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/InstPrinter/X86InstComments.cpp | 11 + lib/Target/X86/Utils/X86ShuffleDecode.cpp | 15 +- lib/Target/X86/X86ISelLowering.cpp | 203 ++++++++++---- lib/Target/X86/X86MCInstLower.cpp | 3 +- test/CodeGen/X86/vector-shuffle-256-v16.ll | 250 ++++-------------- 5 files changed, 220 insertions(+), 262 deletions(-) diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index fc2932b181f..432cf930b4a 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -102,6 +102,17 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPBLENDWYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDWYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::VPBLENDDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 713e147fbf5..a3f45233454 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -301,11 +301,18 @@ void DecodePSHUFBMask(ArrayRef RawMask, } } -void DecodeBLENDMask(MVT VT, unsigned Imm, - SmallVectorImpl &ShuffleMask) { +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); - for (int i = 0; i < NumElements; ++i) - ShuffleMask.push_back(((Imm >> i) & 1) ? NumElements + i : i); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); + } } /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. 
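
[Editor's note] The per-128-bit-lane rule that the updated DecodeBLENDMask above applies can be seen in a minimal standalone sketch (plain C++ with simplified stand-in types and a hypothetical helper name, not LLVM's actual API): once a vector has more than 8 elements, the 8-bit blend immediate is reused in every 128-bit lane, so for v16i16 an immediate of 0xAA selects the odd word of each lane from the second source.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Standalone sketch of per-lane immediate blend decoding (stand-in for
    // DecodeBLENDMask). NumElements/ElementBits describe the vector type
    // (e.g. 16 x 16-bit for v16i16); Imm is the 8-bit PBLENDW immediate.
    static std::vector<int> decodeBlendMask(int NumElements, int ElementBits,
                                            unsigned Imm) {
      std::vector<int> ShuffleMask;
      for (int i = 0; i < NumElements; ++i) {
        // With more than 8 elements the immediate applies per 128-bit lane.
        int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
        assert(Bit < 8 && "Immediate blends only cover 8 elements at a time!");
        ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
      }
      return ShuffleMask;
    }

    int main() {
      // v16i16 with immediate 0xAA: the odd element of each 128-bit lane comes
      // from the second source, i.e. 0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31.
      for (int M : decodeBlendMask(16, 16, 0xAA))
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }
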
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f93f1490f18..9be0f23597a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7183,6 +7183,56 @@ static bool isSingleInputShuffleMask(ArrayRef Mask) { return true; } +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, + SmallVectorImpl &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC // 2013 will allow us to use it as a non-type template parameter. namespace { @@ -7312,6 +7362,38 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(BlendMask, MVT::i8))); } + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + + // Fall back to a fully general variable byte blend. + SDValue PBLENDVMask[32]; + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + PBLENDVMask[Scale * i + j] = + Mask[i] < 0 ? 
DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode( + X86ISD::BLENDV, DL, MVT::v32i8, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask))); + } + default: llvm_unreachable("Not a supported integer vector type!"); } @@ -9215,56 +9297,6 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -/// \brief Test whether there are elements crossing 128-bit lanes in this -/// shuffle mask. -/// -/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations -/// and we routinely test for these. -static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); - int Size = Mask.size(); - for (int i = 0; i < Size; ++i) - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) - return true; - return false; -} - -/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. -/// -/// This checks a shuffle mask to see if it is performing the same -/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies -/// that it is also not lane-crossing. It may however involve a blend from the -/// same lane of a second vector. -/// -/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is -/// non-trivial to compute in the face of undef lanes. The representation is -/// *not* suitable for use with existing 128-bit shuffles as it will contain -/// entries from both V1 and V2 inputs to the wider mask. -static bool -is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, - SmallVectorImpl &RepeatedMask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); - RepeatedMask.resize(LaneSize, -1); - int Size = Mask.size(); - for (int i = 0; i < Size; ++i) { - if (Mask[i] < 0) - continue; - if ((Mask[i] % Size) / LaneSize != i / LaneSize) - // This entry crosses lanes, so there is no way to model this shuffle. - return false; - - // Ok, handle the in-lane shuffles by detecting if and when they repeat. - if (RepeatedMask[i % LaneSize] == -1) - // This is the first non-undef entry in this slot of a 128-bit lane. - RepeatedMask[i % LaneSize] = - Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; - else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) - // Found a mismatch with the repeated mask. - return false; - } - return true; -} - /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit /// shuffles. /// @@ -9581,9 +9613,74 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); - // FIXME: Actually implement this using AVX2!!! - (void)Mask; - return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. 
+ SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 8 && "Unexpected repeated mask size!"); + // FIXME: It might be worth it to call into the (terribly complex) v8i16 + // lowering here. + + // Use dedicated unpack instructions for masks that match their pattern. + // + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + } + + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + // FIXME: We should teach the "split and lower" path to do something more + // clever, or do it ourselves here. The optimal lowering of cross-lane + // shuffles I am aware of is to swap the lanes into a copy, shuffle both the + // original and the copy, and then blend to pick up the cross-lane elements. + // This is four instructions with a tree height of three which is better than + // the worst case for a gather-cross-scatter approach such as used in SSE2 + // v8i16 lowering (where we don't have blends). While for cross-lane blends it + // results in a blend tree, blends are very cheap in AVX2 and newer chips. We + // might also want to special case situations where we can always do a single + // VPERMD to produce a non-lane-crossing shuffle. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i16, + DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + } + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); } /// \brief Handle lowering of 32-lane 8-bit integer shuffles. diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 004d2aca941..39281c8915d 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1098,7 +1098,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // a constant shuffle mask. We won't be able to do this at the MC layer // because the mask isn't an immediate. 
case X86::PSHUFBrm: - case X86::VPSHUFBrm: { + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: { if (!OutStreamer.isVerboseAsm()) break; assert(MI->getNumOperands() > 5 && diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 9145e0c7878..0d3a6220343 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -385,11 +385,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -407,11 +403,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; ; AVX2-LABEL: @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -430,12 +422,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -454,12 +441,7 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1 ; ; AVX2-LABEL: @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15 ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -478,12 +460,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14 ; AVX2: # BB#0: -; AVX2-NEXT: 
vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -502,12 +479,7 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1 ; ; AVX2-LABEL: @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15 ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[1,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,5,5,7,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[1,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -637,11 +609,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_1 ; ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -684,16 +652,9 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; ; AVX2-LABEL: @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*}} # xmm3 = [0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpshufd {{.*}} # xmm4 = xmm4[0,0,0,0] -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,0,0,0] -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17] +; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -713,13 +674,8 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1 ; ; AVX2-LABEL: 
@shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -741,15 +697,9 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1 ; ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpshuflw {{.*}} # xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25] +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -773,17 +723,9 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*}} # xmm3 = xmm3[0,1,0,1] -; AVX2-NEXT: vpshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4] -; AVX2-NEXT: vpblendd {{.*}} # xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; AVX2-NEXT: vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17] +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -801,11 +743,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 
+; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -823,11 +761,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -845,11 +779,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -867,11 +797,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -889,11 +815,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -911,11 +833,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq 
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -933,11 +851,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*}} # xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -959,15 +873,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_2 ; ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklwd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -989,15 +895,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_3 ; ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*}} # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1019,15 +917,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3 ; ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-NEXT: vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31] +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u] +; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1049,15 +941,9 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2 ; ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27 ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpmovzxwd %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u] +; AVX2-NEXT: vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1074,10 +960,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1094,10 +977,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1114,10 +994,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1134,10 +1011,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1154,10 +1028,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1174,10 +1045,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1194,10 +1062,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_1 ; ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1215,11 +1080,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = 
xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1237,11 +1098,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1 ; ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1258,10 +1115,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1278,10 +1132,7 @@ define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_1 ; ; AVX2-LABEL: @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*}} # xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1299,11 +1150,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0 ; ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08 ; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm0[0,1,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -1322,12 +1169,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1 ; ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12 ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*}} # xmm0 = 
xmm0[0,0,0,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle -- 2.34.1
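
[Editor's note] For reference, the single-input path that produces the 256-bit vpshufb patterns checked in the tests above scales a lane-local word mask up to a 32-entry byte control. This is a minimal standalone sketch under simplified assumptions (plain C++, hypothetical helper name, integer arrays instead of SelectionDAG nodes); note that the control bytes are lane-local, while the assembly comments in the tests print the high lane's entries rebased to 16..31.

    #include <array>
    #include <cassert>
    #include <cstdio>

    // Sketch: build the 32-byte VPSHUFB control for a single-input v16i16
    // shuffle whose mask does not cross 128-bit lanes (mirrors the logic
    // added to lowerV16I16VectorShuffle). -1 marks an undef byte.
    static std::array<int, 32> buildPshufbBytes(const std::array<int, 16> &Mask) {
      std::array<int, 32> Bytes{};
      for (int i = 0; i < 16; ++i) {
        if (Mask[i] < 0) { // undef element
          Bytes[2 * i] = Bytes[2 * i + 1] = -1;
          continue;
        }
        // VPSHUFB indexes bytes within each 128-bit lane, so rebase the second
        // lane's word indices (8..15) down to 0..7 before scaling to bytes.
        int M = i < 8 ? Mask[i] : Mask[i] - 8;
        assert(M >= 0 && M < 8 && "mask element crosses a 128-bit lane");
        Bytes[2 * i] = 2 * M;
        Bytes[2 * i + 1] = 2 * M + 1;
      }
      return Bytes;
    }

    int main() {
      // Broadcast word 0 within each lane: <0,0,...,0,8,8,...,8>, matching the
      // first updated test above; every byte pair of the control becomes 0,1.
      std::array<int, 16> Mask;
      for (int i = 0; i < 16; ++i)
        Mask[i] = i < 8 ? 0 : 8;
      for (int B : buildPshufbBytes(Mask))
        std::printf("%d ", B);
      std::printf("\n");
      return 0;
    }
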