if (Mask[i] >= 0 && Mask[i] != i)
return SDValue(); // Shuffled V1 input!
}
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ if (VT == MVT::v4f32 || VT == MVT::v2f64)
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!");
+
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) &&
+ "Not a supported integer vector type!");
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Blend;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03
-; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41: pblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_03_copy
-; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41: pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21
-; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41: pblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
; SSE3-NEXT: retq
;
; SSE41-LABEL: @shuffle_v2i64_21_copy
-; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41: pblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
; AVX1-LABEL: @shuffle_v4i64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
; AVX1-LABEL: @shuffle_v4i64_0451
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
; AVX1-LABEL: @shuffle_v4i64_4015
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq