[x86] Add handling for splat-like widenings of v16i8 shuffles.

author Chandler Carruth <chandlerc@gmail.com>

Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b77a7fee0b717796b179f49889905f77913ff927..8a00d6e2bc8f30fb0b05f25ccddb836ed2c7b2eb 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7690,6 +7690,86 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    MutableArrayRef<int> LoMask = Mask.slice(0, 8);
    MutableArrayRef<int> HiMask = Mask.slice(8, 8);
  
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != Mask[i + 1])
+          return false;
+      }
+      return true;
+    };
+    if (canWidenViaDuplication(Mask)) {
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int ByteMask[16];
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int i = 0; i < 16; ++i)
+        ByteMask[i] = -1;
+      for (int I : InPlaceInputs) {
+        ByteMask[I] = I;
+        LaneMap[I] = I;
+      }
+      int FreeByteIdx = 0;
+      int TargetOffset = TargetLo ? 0 : 8;
+      for (int I : MovingInputs) {
+        // Walk the free index into the byte mask until we find an unoccupied
+        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
+        // principle indicates that there *must* be a spot as we can only have
+        // 8 duplicated inputs. We have to walk the index using modular
+        // arithmetic to wrap around as necessary.
+        // FIXME: We could do a much better job of picking an inexpensive slot
+        // so this doesn't go through the worst case for the byte shuffle.
+        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
+             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
+          ;
+        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
+               "Failed to find a free byte!");
+        ByteMask[FreeByteIdx + TargetOffset] = I;
+        LaneMap[I] = FreeByteIdx + TargetOffset;
+      }
+      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
+                                ByteMask);
+      for (int &M : Mask)
+        if (M != -1)
+          M = LaneMap[M];
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != -1)
+          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
+        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+      }
+      return DAG.getVectorShuffle(MVT::v8i16, DL,
+                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
+    }
+  }
+
    // Check whether an interleaving lowering is likely to be more efficient.
    // This isn't perfect but it is a strong heuristic that tends to work well on
    // the kinds of shuffles that show up in practice.
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll

index b5e0b79088faa6b4e5adefdb4960072a86492b49..f3a0931f731207de1fb4932015bfc3f39c09892f 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -3,6 +3,93 @@
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-unknown"
  
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,7,7]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm2[2,1,2,3]
+; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,0,1]
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,6]
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ret <16 x i8> %shuffle
+}
+
  define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
  ; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
  ; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
@@ -23,12 +110,9 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(
  
  define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
  ; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,1,0,3]
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm1
  ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm1
  ; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm1
  ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
  ; CHECK-SSE2-NEXT:    retq
author	Chandler Carruth <chandlerc@gmail.com>
	Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Sat, 28 Jun 2014 05:16:40 +0000 (05:16 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-128-v16.ll		patch \| blob \| history