From: Simon Pilgrim Date: Tue, 25 Nov 2014 22:34:59 +0000 (+0000) Subject: [X86][SSE] Improvements to byte shift shuffle matching X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=7f6cee962641ac5178a839f3f57144abf61b2ef7 [X86][SSE] Improvements to byte shift shuffle matching Since (v)pslldq / (v)psrldq instructions resolve to a single input argument it is useful to match it much earlier than we currently do - this prevents more complicated shuffles (notably insertion into a zero vector) matching before it. Differential Revision: http://reviews.llvm.org/D6409 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222796 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 63a3009fd82..0833079d573 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8217,6 +8217,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + // If we have a single input from V2 insert that into V1 if we can do so // cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { @@ -8243,11 +8248,6 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v2i64, V1, V2, Mask, DAG)) - return Shift; - // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) @@ -8508,6 +8508,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(Mask, DAG)); } + // Try to use byte shift instructions. 
+ if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, @@ -8525,11 +8530,6 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v4i32, V1, V2, Mask, DAG)) - return Shift; - // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) @@ -8593,17 +8593,17 @@ static SDValue lowerV8I16SingleInputVectorShuffle( Mask, Subtarget, DAG)) return Broadcast; + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V, V, Mask, DAG)) + return Shift; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V, V, Mask, DAG)) - return Shift; - // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) @@ -9210,6 +9210,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + // There are special ways we can lower some single-element blends. 
if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, @@ -9227,11 +9232,6 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Subtarget, DAG)) return Blend; - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V1, V2, Mask, DAG)) - return Shift; - // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index 9539eae9d6a..f10092a1378 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -271,9 +271,8 @@ define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) { define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test21: ; CHECK: # BB#0: -; CHECK-NEXT: orps %xmm1, %xmm0 -; CHECK-NEXT: movq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index 9affee91700..cc7ae1f5706 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -667,14 +667,12 @@ define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) { define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) { ; SSE-LABEL: shuffle_v2i64_1z: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v2i64_1z: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> ret <2 x i64> %shuffle @@ -683,14 +681,12 @@ define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) { define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) { ; SSE-LABEL: shuffle_v2i64_z0: ; SSE: # BB#0: -; SSE-NEXT: movq %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v2i64_z0: ; AVX: # BB#0: -; AVX-NEXT: vmovq %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> ret <2 x i64> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 833b8225700..38fb3fb86a3 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -775,37 +775,27 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) { define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) { ; SSE2-LABEL: shuffle_v4i32_zuu4: ; SSE2: # BB#0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movss %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_zuu4: ; SSE3: # BB#0: -; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: movss %xmm0, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_zuu4: ; SSSE3: # BB#0: -; 
SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: movss %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4i32_zuu4: ; SSE41: # BB#0: -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0] +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4i32_zuu4: ; AVX: # BB#0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 59af434fd15..fc22a3e25e4 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1429,15 +1429,13 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zuuzuuz8: ; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %edi, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zuuzuuz8: ; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %edi, %xmm0 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0