if (isSingleInputShuffleMask(Mask)) {
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
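+    // For example, in the splat-ish mask <0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8>
+    // every adjacent pair of output bytes repeats a single input byte, so
+    // the lowering below can build it from i16 shuffles around a byte-wise
+    // unpack that duplicates each selected input byte.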
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
}
return true;
};
- if (canWidenViaDuplication(Mask)) {
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
SmallVector<int, 4> LoInputs;
std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
- int ByteMask[16];
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
- for (int i = 0; i < 16; ++i)
- ByteMask[i] = -1;
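+      // Inputs already in the target half keep their word slot; record the
+      // identity mapping so the post-duplication shuffle can find them.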
for (int I : InPlaceInputs) {
- ByteMask[I] = I;
+      PreDupI16Shuffle[I / 2] = I / 2;
LaneMap[I] = I;
}
- int FreeByteIdx = 0;
- int TargetOffset = TargetLo ? 0 : 8;
- for (int I : MovingInputs) {
- // Walk the free index into the byte mask until we find an unoccupied
- // spot. We bound this to 8 steps to catch bugs, the pigeonhole
- // principle indicates that there *must* be a spot as we can only have
- // 8 duplicated inputs. We have to walk the index using modular
- // arithmetic to wrap around as necessary.
- // FIXME: We could do a much better job of picking an inexpensive slot
- // so this doesn't go through the worst case for the byte shuffle.
- for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
- ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
- ;
- assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
- "Failed to find a free byte!");
- ByteMask[FreeByteIdx + TargetOffset] = I;
- LaneMap[I] = FreeByteIdx + TargetOffset;
+ int j = TargetLo ? 0 : 4, je = j + 4;
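+      // For example, lowering <0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8> targets the
+      // low half: moving byte 8 lands in free word slot 1, giving
+      // PreDupI16Shuffle = <0,4,-1,-1,-1,-1,-1,-1> and LaneMap[8] = 2.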
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check whether slot j already holds this input's word. This happens
+        // when two moving bytes come from the same input word, e.g. bytes 8
+        // and 9 both map to word 4.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] != -1)
+ ++j;
+
+        if (j == je)
+          // We can't place the inputs into a single half with a simple i16
+          // shuffle, so bail.
+          return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
- V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
- ByteMask);
- for (int &M : Mask)
- if (M != -1)
- M = LaneMap[M];
+ V1 = DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, V1, V1);
- int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
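+    // After the unpack, the byte at pre-duplication index B occupies i16
+    // lane B (low-half target) or B - 8 (high-half target), so LaneMap
+    // values translate directly into post-duplication shuffle indices. For
+    // the example above this produces PostDupI16Shuffle = <0,0,0,0,2,2,2,2>.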
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; i += 2) {
if (Mask[i] != -1)
- I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
- assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+ PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
}
- return DAG.getVectorShuffle(MVT::v8i16, DL,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
- DAG.getUNDEF(MVT::v8i16), I16Shuffle);
- }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
}
// Check whether an interleaving lowering is likely to be more efficient.
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pand
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,1]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,6]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
  ret <16 x i8> %shuffle
}