DAG.getUNDEF(MVT::v8i16), Mask);
}
+/// \brief Helper to form a PSHUFB-based shuffle+blend: builds one byte-level control mask per input, applies PSHUFB to each input that contributes bytes, ORs the two results when both contribute, and bitcasts the result to \p VT; \p V1InUse and \p V2InUse are outputs reporting which inputs contributed.
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); // Indexed per mask element below; presumably marks elements that may be zeroed.
+ SDValue V1Mask[16]; // PSHUFB control bytes applied to V1.
+ SDValue V2Mask[16]; // PSHUFB control bytes applied to V2.
+ V1InUse = false; // Out-params are reset here and only set by the loop below.
+ V2InUse = false;
+
+ int Size = Mask.size();
+ int Scale = 16 / Size; // Bytes per mask element; assumes Size divides 16 -- TODO confirm callers guarantee this.
+ for (int i = 0; i < 16; ++i) { // One iteration per output byte; i / Scale is the mask element, i % Scale the byte within it.
+ if (Mask[i / Scale] == -1) { // Undef mask element: both control bytes are undef.
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
+ } else {
+ const int ZeroMask = 0x80; // High bit set in a PSHUFB control byte zeroes that result byte.
+ int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale // Element comes from V1: select its byte.
+ : ZeroMask; // Element comes from V2: V1 contributes zero.
+ int V2Idx = Mask[i / Scale] < Size
+ ? ZeroMask // Element comes from V1: V2 contributes zero.
+ : (Mask[i / Scale] - Size) * Scale + i % Scale; // Rebase the element index into V2, then select its byte.
+ if (Zeroable[i / Scale]) // Zeroable element: neither input needs to supply it.
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx); // An input is in use once any control byte selects from it.
+ V2InUse |= (ZeroMask != V2Idx);
+ }
+ }
+
+ if (V1InUse) // Only emit a PSHUFB for an input that contributes at least one byte.
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); // Bytes not selected from an input were zeroed above, so OR merges them.
+ else
+ V = V1InUse ? V1 : V2; // NOTE(review): if neither input is in use this yields the unshuffled V2 -- confirm callers never reach here with an all-zeroable/undef mask.
+
+ // Cast the result back to the correct type.
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+}
+
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
}
- // If we have direct support for blends, we should lower by decomposing into
- // a permute.
- if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
- Mask, DAG);
-
- // Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
- return Unpack;
-
- int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Try to lower by permuting the inputs into an unpack instruction unless we
+ // have direct support for blending.
+ if (!IsBlendSupported) {
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ return Unpack;
- for (int i = 0; i < 4; ++i) {
- LoBlendMask[i] = Mask[i];
- HiBlendMask[i] = Mask[i + 4];
+ // If we can use PSHUFB, that will be better as it can both shuffle and set
+ // up an efficient blend.
+ if (Subtarget->hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
+ }
}
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
- HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
-
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+ // We can always bit-blend if we have to so the fallback strategy is to
+ // decompose into single-input permutes and blends.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget->hasSSSE3()) {
- SDValue V1Mask[16];
- SDValue V2Mask[16];
bool V1InUse = false;
bool V2InUse = false;
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- for (int i = 0; i < 16; ++i) {
- if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
- } else {
- const int ZeroMask = 0x80;
- int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
- int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
- if (Zeroable[i])
- V1Idx = V2Idx = ZeroMask;
- V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
- V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
- V1InUse |= (ZeroMask != V1Idx);
- V2InUse |= (ZeroMask != V2Idx);
- }
- }
+ SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+ DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
return Unpack;
}
- if (V1InUse)
- V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
- if (V2InUse)
- V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
-
- // If we need shuffled inputs from both, blend the two.
- if (V1InUse && V2InUse)
- return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
- if (V1InUse)
- return V1; // Single inputs are easy.
- if (V2InUse)
- return V2; // Single inputs are easy.
- // Shuffling to a zeroable vector.
- return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+ return PSHUFB;
}
// There are special ways we can lower some single-element blends.
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-LABEL: t2:
; X64: ## BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7]
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
+; X64-NEXT: pand %xmm2, %xmm0
+; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
+; X64-NEXT: pandn %xmm1, %xmm2
+; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,1,2,3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,1,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_012dcde3:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_012dcde3:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,8,9,10,11,12,13],zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_012dcde3: