[x86] Just unilaterally prefer SSSE3-style PSHUFB lowerings over clever...

author Chandler Carruth <chandlerc@gmail.com>

Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 6fae768a734e40286053ce236ad73e87e06faa28..d0cb0e212c03362f4694e7098deade63980f13fe 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7973,6 +7973,41 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
    }
  
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+  // want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently don't
+  // handle those well here.
+  if (Subtarget->hasSSSE3()) {
+    SDValue V1Mask[16];
+    SDValue V2Mask[16];
+    for (int i = 0; i < 16; ++i)
+      if (Mask[i] == -1) {
+        V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+      } else {
+        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+        V2Mask[i] =
+            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+      }
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+    if (isSingleInputShuffleMask(Mask))
+      return V1; // Single inputs are easy.
+
+    // Otherwise, blend the two.
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  }
+
    // Check whether a compaction lowering can be done. This handles shuffles
    // which take every Nth element for some even N. See the helper function for
    // details.
@@ -8011,41 +8046,6 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return Result;
    }
  
-  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
-  // with PSHUFB. It is important to do this before we attempt to generate any
-  // blends but after all of the single-input lowerings. If the single input
-  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
-  // want to preserve that and we can DAG combine any longer sequences into
-  // a PSHUFB in the end. But once we start blending from multiple inputs,
-  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
-  // and there are *very* few patterns that would actually be faster than the
-  // PSHUFB approach because of its ability to zero lanes.
-  //
-  // FIXME: The only exceptions to the above are blends which are exact
-  // interleavings with direct instructions supporting them. We currently don't
-  // handle those well here.
-  if (Subtarget->hasSSSE3()) {
-    SDValue V1Mask[16];
-    SDValue V2Mask[16];
-    for (int i = 0; i < 16; ++i)
-      if (Mask[i] == -1) {
-        V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
-      } else {
-        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
-        V2Mask[i] =
-            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
-      }
-    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
-    if (isSingleInputShuffleMask(Mask))
-      return V1; // Single inputs are easy.
-
-    // Otherwise, blend the two.
-    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
-                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
-    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
-  }
-
    int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll

index 00ce9da26a3b71ab84ab4d53293cd2ae530bd565..c168935134a6672f7d1a39c1d5a41d50f8278227 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -262,10 +262,10 @@ define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
  ; SSE2-NEXT:    packuswb %xmm0, %xmm0
  ; SSE2-NEXT:    retq
  ;
-; FIXME-SSSE3-LABEL: @trunc_v4i32_shuffle
-; FIXME-SSSE3:       # BB#0:
-; FIXME-SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; FIXME-SSSE3-NEXT:    retq
+; SSSE3-LABEL: @trunc_v4i32_shuffle
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
    %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    ret <16 x i8> %shuffle
  }
author	Chandler Carruth <chandlerc@gmail.com>
	Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Mon, 4 Aug 2014 10:17:35 +0000 (10:17 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/vector-shuffle-128-v16.ll		patch | blob | history