From c1c5dcf06913839943d536cf7ed7e2b2b87d0158 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 5 Sep 2014 10:36:31 +0000
Subject: [PATCH] [x86] Factor out the zero vector insertion logic in the new
 vector shuffle lowering for integer vectors and share it from v4i32, v8i16,
 and v16i8 code paths.

Ironically, the SSE2 v16i8 code for this is now better than the SSSE3!
=] Will have to fix the SSSE3 code next to just using a single pshufb.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217240 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 140 ++++++++++++++-------
 test/CodeGen/X86/vector-shuffle-128-v16.ll |  47 +++++++
 test/CodeGen/X86/vector-shuffle-128-v8.ll  |  59 +++++++++
 3 files changed, 201 insertions(+), 45 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f85b00a84f8..5b0d315a6d0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7488,6 +7488,81 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
 }
 
+static SDValue lowerIntegerElementInsertionVectorShuffle(
+    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  int V2Index = std::find_if(Mask.begin(), Mask.end(),
+                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
+                Mask.begin();
+
+  // Check for a single input from a SCALAR_TO_VECTOR node.
+  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+  // all the smarts here sunk into that routine. However, the current
+  // lowering of BUILD_VECTOR makes that nearly impossible until the old
+  // vector shuffle lowering is dead.
+  if ((Mask[V2Index] == (int)Mask.size() &&
+       V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+      V2.getOpcode() == ISD::BUILD_VECTOR) {
+    SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+
+    bool V1IsAllZero = false;
+    if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      V1IsAllZero = true;
+    } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+      V1IsAllZero = true;
+      for (int M : Mask) {
+        if (M < 0 || M >= (int)Mask.size())
+          continue;
+        SDValue Input = V1.getOperand(M);
+        if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
+          // A non-zero input!
+          V1IsAllZero = false;
+          break;
+        }
+      }
+    }
+    if (V1IsAllZero) {
+      // First, we need to zext the scalar if it is smaller than an i32.
+      MVT EltVT = VT.getVectorElementType();
+      assert(EltVT == V2S.getSimpleValueType() &&
+             "Different scalar and element types!");
+      MVT ExtVT = VT;
+      if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+        // Zero-extend directly to i32.
+        ExtVT = MVT::v4i32;
+        V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+      }
+
+      V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+      if (ExtVT != VT)
+        V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+      if (V2Index != 0) {
+        // If we have 4 or fewer lanes we can cheaply shuffle the element into
+        // the desired position. Otherwise it is more efficient to do a vector
+        // shift left. We know that we can do a vector shift left because all
+        // the inputs are zero.
+        if (VT.getVectorNumElements() <= 4) {
+          SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+          V2Shuffle[V2Index] = 0;
+          V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+        } else {
+          V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+          V2 = DAG.getNode(
+              X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+              DAG.getConstant(
+                  V2Index * EltVT.getSizeInBits(),
+                  DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+          V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+        }
+      }
+      return V2;
+    }
+  }
+  return SDValue();
+}
+
 /// \brief Lower 4-lane i32 vector shuffles.
 ///
 /// We try to handle these with integer-domain shuffles where we can, but for
@@ -7519,50 +7594,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
   // There are special ways we can lower some single-element blends.
-  if (NumV2Elements == 1) {
-    int V2Index =
-        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
-        Mask.begin();
-
-    // Check for a single input from a SCALAR_TO_VECTOR node.
-    // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
-    // all the smarts here sunk into that routine. However, the current
-    // lowering of BUILD_VECTOR makes that nearly impossible until the old
-    // vector shuffle lowering is dead.
-    if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
-        V2.getOpcode() == ISD::BUILD_VECTOR) {
-      SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
-
-      bool V1IsAllZero = false;
-      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-        V1IsAllZero = true;
-      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
-        V1IsAllZero = true;
-        for (int M : Mask) {
-          if (M < 0 || M >= 4)
-            continue;
-          SDValue Input = V1.getOperand(M);
-          if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
-            // A non-zero input!
-            V1IsAllZero = false;
-            break;
-          }
-        }
-      }
-      if (V1IsAllZero) {
-        V2 = DAG.getNode(
-            X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
-            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
-        if (V2Index != 0) {
-          int V2Shuffle[] = {1, 1, 1, 1};
-          V2Shuffle[V2Index] = 0;
-          V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
-                                    DAG.getUNDEF(MVT::v4i32), V2Shuffle);
-        }
-        return V2;
-      }
-    }
-  }
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
 
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
@@ -8210,6 +8245,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Inputs == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
 
@@ -8347,8 +8388,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
 
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
   // For single-input shuffles, there are some nicer lowering tricks we can use.
-  if (isSingleInputShuffleMask(Mask)) {
+  if (NumV2Elements == 0) {
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -8495,6 +8539,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
   }
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
+            MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
   // Check whether a compaction lowering can be done. This handles shuffles
   // which take every Nth element for some even N. See the helper function for
   // details.
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 6f49a03cb8b..38734eda941 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -325,3 +325,50 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
   %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $5, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $15, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl {{.*}}, %[[R:.*]]
+; SSE2-NEXT:    movd %[[R]], %xmm0
+; SSE2-NEXT:    pslldq $2, %xmm0
+; SSE2-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index f1e17377c13..33993aae682 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -771,3 +771,62 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_8zzzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_z8zzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $2, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zzzzz8zz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $10, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zuuzuuz8
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $14, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; ALL-LABEL: @shuffle_v8i16_zzBzzzzz
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl {{.*}}, %[[R:.*]]
+; ALL-NEXT:    movd %[[R]], %xmm0
+; ALL-NEXT:    pslldq $4, %xmm0
+; ALL-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
-- 
2.34.1