From da681cc578e56e869b5fa5bb041d7c21fb60dfc7 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Wed, 4 Feb 2015 09:06:05 +0000
Subject: [PATCH] [x86] Start to introduce bit-masking based blend lowering.

This is the simplest form of bit-math based blending, which only fires
when we are blending with zero and is relatively profitable. I've only
enabled this path on very specific lowering strategies. I'm planning to
widen its applicability in subsequent patches, but so far you'll notice
that even though we get fewer shufps instructions, we *still* do the
bit math in the FP execution port. I'm looking into why this is still
happening.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@228124 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 68 +++++++++++++++++---
 test/CodeGen/X86/sse2.ll                     |  3 +-
 test/CodeGen/X86/vector-shuffle-128-v4.ll    | 37 +++--------
 test/CodeGen/X86/vector-shuffle-combining.ll | 18 ++----
 4 files changed, 72 insertions(+), 54 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4544b47d8e4..efe1de730a2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7733,6 +7733,46 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  MVT EltVT = VT.getScalarType();
+  int NumEltBits = EltVT.getSizeInBits();
+  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+  SDValue Zero = DAG.getConstant(0, IntEltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+  if (EltVT.isFloatingPoint()) {
+    Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+    AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+  }
+  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  SDValue V;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+    if (Mask[i] % Size != i)
+      return SDValue(); // Not a blend.
+    if (!V)
+      V = Mask[i] < Size ? V1 : V2;
+    else if (V != (Mask[i] < Size ? V1 : V2))
+      return SDValue(); // Can only let one input through the mask.
+
+    VMaskOps[i] = AllOnes;
+  }
+  if (!V)
+    return SDValue(); // No non-zeroable elements!
+
+  SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+  V = DAG.getNode(VT.isFloatingPoint() ? X86ISD::FAND : ISD::AND, DL, VT, V,
+                  VMask);
+  return V;
+}
+
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
@@ -8743,17 +8783,21 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Mask, Subtarget, DAG))
     return V;
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Masked;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
-                                                  Subtarget, DAG))
-      return Blend;
-
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
@@ -9455,17 +9499,21 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Mask, Subtarget, DAG))
     return V;
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Masked;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
 
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
-                                                  Subtarget, DAG))
-      return Blend;
-
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index d670fdf6ca3..ee5a47c00c5 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -302,8 +302,7 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
 define <4 x i32> @PR19721(<4 x i32> %i) {
 ; CHECK-LABEL: PR19721:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    andps LCPI19_0, %xmm0
 ; CHECK-NEXT:    retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 7bdb6459c1f..4c6641cc6da 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1185,17 +1185,12 @@ define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_0z23:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0z23:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3]
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x i32> %shuffle
@@ -1204,16 +1199,12 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_01z3:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_01z3:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %shuffle
@@ -1222,23 +1213,17 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_012z:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_012z:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_012z:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_012z:
@@ -1265,16 +1250,12 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_0zz3:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0zz3:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
   ret <4 x i32> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 8261b89c378..ecc9b6e2c59 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -352,17 +352,13 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2-LABEL: combine_bitwise_ops_test3b:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm0
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test3b:
@@ -475,19 +471,13 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3
 ; SSE2-LABEL: combine_bitwise_ops_test6b:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm0
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm0
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test6b:
-- 
2.34.1
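
Appended note, not part of the original patch: the transform is easier to see outside of SelectionDAG. Below is a minimal standalone C++ sketch of the matching loop in lowerVectorShuffleAsBitMask; the names shuffleWithZeroAsBitmask and V4 are made up for this sketch and are not LLVM APIs. Each lane of a blend-with-zero either passes through in place (all-ones mask lane) or is zeroed (zero mask lane); a lane that would move defeats the match, which is what the Mask[i] % Size != i bail-out enforces in the patch.

// Minimal scalar model of the bitmask-blend matching logic; illustrative
// only, not LLVM code. Shuffle indices follow the shufflevector convention
// for 4 lanes: 0-3 select from A, 4-7 select from the second operand, which
// is assumed here to be all zeros (the "zeroable" input).
#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

using V4 = std::array<uint32_t, 4>;

// Try to rewrite "shuffle A with a zero vector" as A & M. A non-zero lane
// must stay in place (the Mask[i] % Size != i check in the patch); otherwise
// this is not a pure blend with zero and the transform does not apply.
static std::optional<V4> shuffleWithZeroAsBitmask(const V4 &A,
                                                  const std::array<int, 4> &Mask) {
  V4 M = {0, 0, 0, 0};
  for (int i = 0; i < 4; ++i) {
    if (Mask[i] >= 4)
      continue;            // Lane comes from the zero vector; mask lane stays 0.
    if (Mask[i] != i)
      return std::nullopt; // Lane would move: not a blend with zero.
    M[i] = 0xFFFFFFFFu;    // Lane passes through: all-ones mask lane.
  }
  V4 R;
  for (int i = 0; i < 4; ++i)
    R[i] = A[i] & M[i];    // One andps instead of xorps plus two shufps.
  return R;
}

int main() {
  V4 A = {1, 2, 3, 4};
  // Mask of shuffle_v4i32_0z23 from the tests: keep lanes 0, 2, 3; zero lane 1.
  if (auto R = shuffleWithZeroAsBitmask(A, {0, 4, 2, 3}))
    std::printf("%u %u %u %u\n", (*R)[0], (*R)[1], (*R)[2], (*R)[3]);
  return 0;
}

Built as C++17 this prints "1 0 3 4": lane 1 is zeroed and the rest of A passes through, the same behavior the updated shuffle_v4i32_0z23 CHECK lines expect from a single andps (or vandps) against a constant-pool mask.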