From: Andrea Di Biagio Date: Mon, 14 Jul 2014 22:46:26 +0000 (+0000) Subject: [DAGCombiner] Add more rules to combine shuffle vector dag nodes. X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=7847229c3152026183a676782bb9fa05a598e48a [DAGCombiner] Add more rules to combine shuffle vector dag nodes. This patch teaches the DAGCombiner how to fold a pair of shuffles according to rules: 1. shuffle(shuffle A, B, M0), B, M1) -> shuffle(A, B, M2) 2. shuffle(shuffle A, B, M0), A, M1) -> shuffle(A, B, M3) The new rules would only trigger if the resulting shuffle has legal type and legal mask. Added test 'combine-vec-shuffle-3.ll' to verify that DAGCombiner correctly folds shuffles on x86 when the resulting mask is legal. Also added some negative cases to verify that we avoid introducing illegal shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213001 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c8daea9a8b6..a1dd8bf5cb2 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10777,6 +10777,50 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } + // Try to fold according to rules: + // shuffle(shuffle A, B, M0), B, M1) -> shuffle(A, B, M2) + // shuffle(shuffle A, B, M0), A, M1) -> shuffle(A, B, M2) + // Don't try to fold shuffles with illegal type. + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && + TLI.isTypeLegal(VT)) { + ShuffleVectorSDNode *OtherSV = cast(N0); + + // The incoming shuffle must be of the same type as the result of the + // current shuffle. + assert(OtherSV->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + SDValue SV0 = OtherSV->getOperand(0); + SDValue SV1 = OtherSV->getOperand(1); + bool HasSameOp0 = N1 == SV0; + if (!HasSameOp0 && N1 != SV1) + // Early exit. + return SDValue(); + + SmallVector Mask; + // Compute the combined shuffle mask for a shuffle with SV0 as the first + // operand, and SV1 as the second operand. + for (unsigned i = 0; i != NumElts; ++i) { + int Idx = SVN->getMaskElt(i); + if (Idx < 0) { + // Propagate Undef. + Mask.push_back(Idx); + continue; + } + + if (Idx < (int)NumElts) + Idx = OtherSV->getMaskElt(Idx); + else + Idx = HasSameOp0 ? Idx - NumElts : Idx; + + Mask.push_back(Idx); + } + + // Avoid introducing shuffles with illegal mask. + if (TLI.isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); + } + return SDValue(); } diff --git a/test/CodeGen/X86/combine-vec-shuffle-3.ll b/test/CodeGen/X86/combine-vec-shuffle-3.ll new file mode 100644 index 00000000000..1a54d881ce3 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-3.ll @@ -0,0 +1,373 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1 +; Mask: [0,1,2,3] +; CHECK: movaps +; CHECK: ret + +define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend +; wrongly thinks that shuffle mask [6,7,2,3] is not legal. Therefore, we +; end up with the sub-optimal sequence 'shufps, palignr'. +; CHECK-LABEL: test4 +; Mask: [6,7,2,3] +; CHECK: shufps $84 +; CHECK: palignr $8 +; CHECK: ret + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test5 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + + +define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [4,5,6,7] +; CHECK: movaps +; CHECK: ret + +define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend thinks that +; shuffle mask [6,7,2,3] is not legal. +; CHECK-LABEL: test9 +; Mask: [6,7,2,3] +; CHECK: shufps $84 +; CHECK: palignr $8 +; CHECK: ret + +define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [4,1,6,7] +; CHECK: blendps +; CHECK: ret + +define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test11 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test13 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend +; wrongly thinks that shuffle mask [6,7,2,3] is not legal. Therefore, we +; end up with the sub-optimal sequence 'pshufd, blendps'. +; CHECK-LABEL: test14 +; Mask: [6,7,2,3] +; CHECK: pshufd $94 +; CHECK: blendps $12 +; CHECK: ret + +define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test15 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test16 +; Mask: [0,1,2,3] +; CHECK-NOT: movaps +; CHECK-NOT: blendps +; CHECK: ret + +define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test17 +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test18 +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK: ret + +define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend +; wrongly thinks that shuffle mask [6,7,2,3] is not legal. Therefore, we +; end up with the sub-optimal sequence 'shufps, palignr'. +; CHECK-LABEL: test19 +; Mask: [6,7,2,3] +; CHECK: pshufd $94 +; CHECK: blendps $12 +; CHECK: ret + +define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test20 +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +; Check some negative cases. +define <4 x float> @test1b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test1b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + +define <4 x float> @test2b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test2b +; CHECK: shufps +; CHECK: pshufd +; CHECK: ret + +define <4 x float> @test3b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test3b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + +define <4 x float> @test4b(<4 x float> %a, <4 x float> %b) { + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} +; CHECK-LABEL: test4b +; CHECK: shufps +; CHECK: shufps +; CHECK: ret + + +; Verify that we correctly fold shuffles even when we use illegal vector types. +define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test1c +; Mask: [0,5,6,7] +; CHECK: movss +; CHECK-NEXT: ret + +define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test2c +; Mask: [0,1,4,5] +; CHECK: movlhps +; CHECK-NEXT: ret + +define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; FIXME: this should be lowered as a single movhlps. However, the backend +; wrongly thinks that shuffle mask [6,7,2,3] is not legal. Therefore, we end up +; with a sub-optimal sequence of 'shufps+palignr'. + +; CHECK-LABEL: test3c +; Mask: [6,7,2,3] +; CHECK: shufps $84 +; CHECK: palignr $8 +; CHECK: ret + +define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} +; CHECK-LABEL: test4c +; Mask: [4,1,6,7] +; CHECK: blendps $13 +; CHECK: ret + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the following shuffles into a single one. + +define <4 x float> @blend_01(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} +; CHECK-LABEL: blend_01 +; CHECK: movsd +; CHECK-NEXT: ret + +define <4 x float> @blend_02(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} +; CHECK-LABEL: blend_02 +; CHECK: blendps $5 +; CHECK-NEXT: ret + +define <4 x float> @blend_123(<4 x float> %a, <4 x float> %b) { + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle12 +} +; CHECK-LABEL: blend_123 +; CHECK: movss +; CHECK: ret +