From cbe6ecfc8192f33397817c4b86368f596c17c533 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 16 Feb 2015 12:28:18 +0000 Subject: [PATCH] [x86] Add a generic unpack-targeted lowering technique. This can be used to generically lower blends and is particularly nice because it is available from SSE2 onward. This removes a lot of the remaining domain crossing blends in SSE2 code. I'm hoping to replace some of the "interleaved" lowering hacks with something closer to this which should be more principled. First, this needs to learn how to detect and use other interleavings besides that of the natural type provided. That will be a follow-up patch though. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229378 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 54 +++++ test/CodeGen/X86/pmul.ll | 25 ++- test/CodeGen/X86/sse2.ll | 5 +- test/CodeGen/X86/vector-idiv.ll | 204 ++++++++++--------- test/CodeGen/X86/vector-shuffle-128-v16.ll | 45 ++-- test/CodeGen/X86/vector-shuffle-combining.ll | 82 ++++---- 6 files changed, 246 insertions(+), 169 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 067ad6dcc5b..00095f04c67 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8463,6 +8463,50 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, DAG.getConstant(InsertPSMask, MVT::i8)); } +/// \brief Try to lower a shuffle as a permute of the inputs followed by an +/// UNPCK instruction. +/// +/// This specifically targets cases where we end up with alternating between +/// the two inputs, and so can permute them into something that feeds a single +/// UNPCK instruction. 
+static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This routine should only be used when blending two inputs."); + assert(Mask.size() >= 2 && "Single element masks are invalid."); + + int Size = Mask.size(); + + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { + return M >= 0 && M % Size < Size / 2; + }); + int NumHiInputs = std::count_if( + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + + bool UnpackLo = NumLoInputs >= NumHiInputs; + + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + // We only handle the case where V1 feeds even mask slots and V2 feeds odd + // mask slots. We rely on canonicalization to ensure this is the case. + if ((i % 2 == 0) != (Mask[i] < Size)) + return SDValue(); + + SmallVectorImpl<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask; + VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; + } + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V1, + V2); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -8921,6 +8965,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, DAG); + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG)) + return Unpack; + // We implement this with SHUFPS because it can blend from two vectors. 
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we @@ -9670,6 +9719,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, DAG); + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG)) + return Unpack; + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index ec4077a8a5b..6bfa656b609 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -7,9 +7,10 @@ define <4 x i32> @a(<4 x i32> %i) nounwind { ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: a: @@ -46,10 +47,11 @@ define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind { ; SSE2: # BB#0: # %entry ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: c: @@ -91,13 +93,14 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> 
%j) nounwind { ; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: callq foo ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: addq $40, %rsp ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll index 92411a18129..0b69ae85068 100644 --- a/test/CodeGen/X86/sse2.ll +++ b/test/CodeGen/X86/sse2.ll @@ -315,10 +315,11 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { ; CHECK: ## BB#0: ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retl %m = mul <4 x i32> %x, %y ret <4 x i32> %m diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index 5d1fe96b254..06ce543e9cc 100644 --- a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -25,11 +25,12 @@ define <4 x i32> @test1(<4 x i32> %a) { ; SSE-NEXT: movdqa 
{{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: psubd %xmm2, %xmm0 ; SSE-NEXT: psrld $1, %xmm0 ; SSE-NEXT: paddd %xmm2, %xmm0 @@ -85,20 +86,22 @@ define <8 x i32> @test2(<8 x i32> %a) { ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE-NEXT: psubd %xmm3, %xmm0 ; SSE-NEXT: psrld $1, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm0 ; SSE-NEXT: psrld $2, %xmm0 ; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: psrld $1, %xmm1 ; SSE-NEXT: paddd %xmm2, %xmm1 @@ -838,21 +841,22 @@ define <4 x i32> @test8(<4 x i32> %a) { ; ; SSE-LABEL: test8: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[2454267027,2454267027,2454267027,2454267027] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: paddd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrld $31, %xmm0 @@ -911,46 +915,48 @@ define <8 x i32> @test9(<8 x i32> %a) { ; ; SSE-LABEL: test9: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrad $31, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: psrad $31, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: 
pand %xmm3, %xmm5 ; SSE-NEXT: paddd %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] ; SSE-NEXT: pmuludq %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: psubd %xmm5, %xmm0 -; SSE-NEXT: paddd %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrld $31, %xmm3 -; SSE-NEXT: psrad $2, %xmm0 -; SSE-NEXT: paddd %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: paddd %xmm4, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: psubd %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrld $31, %xmm2 -; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrad $2, %xmm0 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrad $31, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: paddd %xmm4, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: psubd %xmm5, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $31, %xmm1 +; SSE-NEXT: psrad $2, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: test9: @@ -1006,41 +1012,45 @@ define <8 x i32> @test10(<8 x i32> %a) { ; ; SSE-LABEL: test10: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: psubd %xmm3, %xmm5 +; SSE-NEXT: psubd %xmm2, %xmm5 ; SSE-NEXT: psrld $1, %xmm5 -; SSE-NEXT: paddd %xmm3, %xmm5 +; SSE-NEXT: paddd %xmm2, %xmm5 ; SSE-NEXT: psrld $2, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE-NEXT: psubd %xmm5, %xmm0 -; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pmuludq 
%xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psubd %xmm2, %xmm4 +; SSE-NEXT: psubd %xmm3, %xmm4 ; SSE-NEXT: psrld $1, %xmm4 -; SSE-NEXT: paddd %xmm2, %xmm4 +; SSE-NEXT: paddd %xmm3, %xmm4 ; SSE-NEXT: psrld $2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: psubd %xmm4, %xmm1 ; SSE-NEXT: retq ; @@ -1109,13 +1119,14 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE-NEXT: psrad $31, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: paddd %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pmuludq %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; SSE-NEXT: psubd %xmm6, %xmm7 ; SSE-NEXT: paddd %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm4 @@ -1125,9 +1136,10 @@ define <8 x i32> @test11(<8 x i32> 
%a) { ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; SSE-NEXT: pmuludq %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; SSE-NEXT: psubd %xmm7, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm6 @@ -1135,10 +1147,11 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: paddd %xmm3, %xmm6 ; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: psubd %xmm6, %xmm2 ; SSE-NEXT: paddd %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 @@ -1147,9 +1160,10 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE-NEXT: paddd %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: retq ; @@ -1222,15 +1236,15 @@ define <4 x i32> @PR20355(<4 x i32> %a) { ; SSE-NEXT: paddd %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,3],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: psubd %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $31, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: psubd %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: psrld $31, %xmm0 +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: PR20355: diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index bc6f77f3614..269ef6a8fe7 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -549,28 +549,27 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15( ; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: ; SSE2: # BB#0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE2-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,0,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -580,9 +579,9 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15( ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 048b8b40649..64952f19d2d 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -275,16 +275,18 @@ define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test1b: ; SSE2: # BB#0: -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test1b: ; SSSE3: # BB#0: -; SSSE3-NEXT: andps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test1b: @@ -313,16 +315,18 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test2b: ; SSE2: # BB#0: -; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test2b: ; SSSE3: # BB#0: -; SSSE3-NEXT: orps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test2b: @@ -390,18 +394,18 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test4b: ; SSE2: # BB#0: -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test4b: ; SSSE3: # BB#0: -; SSSE3-NEXT: andps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test4b: @@ -430,18 +434,18 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; SSE2-LABEL: combine_bitwise_ops_test5b: ; SSE2: # BB#0: -; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test5b: ; SSSE3: # BB#0: -; SSSE3-NEXT: orps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: 
combine_bitwise_ops_test5b: @@ -1012,14 +1016,16 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: combine_nested_undef_test16: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_nested_undef_test16: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_nested_undef_test16: @@ -1171,16 +1177,16 @@ define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: combine_nested_undef_test21: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_nested_undef_test21: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_nested_undef_test21: -- 
2.34.1