From: Andrea Di Biagio Date: Tue, 18 Mar 2014 17:12:59 +0000 (+0000) Subject: [DAGCombiner] teach how to simplify xor/and/or nodes according to the following rules: X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=6077ca9abb198829cce6c1cbbe8580c6ca1d2e9e;p=oota-llvm.git [DAGCombiner] teach how to simplify xor/and/or nodes according to the following rules: 1) (AND (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (AND (A, B), C, Mask) 2) (OR (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (OR (A, B), C, Mask) 3) (XOR (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (XOR (A, B), V_0, Mask) 4) (AND (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (C, AND (A, B), Mask) 5) (OR (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (C, OR (A, B), Mask) 6) (XOR (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (V_0, XOR (A, B), Mask) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204160 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c45d6a1a790..2b2bbf9e296 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2518,35 +2518,66 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // The type-legalizer generates this pattern when loading illegal // vector types from memory. In many cases this allows additional shuffle // optimizations. - if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - N0.getOperand(1).getOpcode() == ISD::UNDEF && - N1.getOperand(1).getOpcode() == ISD::UNDEF) { + // There are other cases where moving the shuffle after the xor/and/or + // is profitable even if shuffles don't perform a swizzle. + // If both shuffles use the same mask, and both shuffles have the same first + // or second operand, then it might still be profitable to move the shuffle + // after the xor/and/or operation. 
+ if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0); ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1); - assert(N0.getOperand(0).getValueType() == N1.getOperand(1).getValueType() && + assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && "Inputs to shuffles are not the same type"); - - unsigned NumElts = VT.getVectorNumElements(); - + // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. - bool SameMask = true; - for (unsigned i = 0; i != NumElts; ++i) { - int Idx0 = SVN0->getMaskElt(i); - int Idx1 = SVN1->getMaskElt(i); - if (Idx0 != Idx1) { - SameMask = false; - break; + // Check also that shuffles have only one use to avoid introducing extra + // instructions. + if (SVN0->hasOneUse() && SVN1->hasOneUse() && + SVN0->getMask().equals(SVN1->getMask())) { + SDValue ShOp = N0->getOperand(1); + + // Don't try to fold this node if it requires introducing a + // build vector of all zeros that might be illegal at this stage. 
+ if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) { + if (!LegalTypes) + ShOp = DAG.getConstant(0, VT); + else + ShOp = SDValue(); } - } - if (SameMask) { - SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), VT, - N0.getOperand(0), N1.getOperand(0)); - AddToWorkList(Op.getNode()); - return DAG.getVectorShuffle(VT, SDLoc(N), Op, - DAG.getUNDEF(VT), &SVN0->getMask()[0]); + // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) + // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) + // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) + if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { + SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, + N0->getOperand(0), N1->getOperand(0)); + AddToWorkList(NewNode.getNode()); + return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp, + &SVN0->getMask()[0]); + } + + // Don't try to fold this node if it requires introducing a + // build vector of all zeros that might be illegal at this stage. + ShOp = N0->getOperand(0); + if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) { + if (!LegalTypes) + ShOp = DAG.getConstant(0, VT); + else + ShOp = SDValue(); + } + + // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) + // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) + // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) + if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { + SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, + N0->getOperand(1), N1->getOperand(1)); + AddToWorkList(NewNode.getNode()); + return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, + &SVN0->getMask()[0]); + } } } diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index 60b6d756165..c1ce53334ec 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -251,6 +251,7 @@ define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test20 ; CHECK-NOT: xorps ; CHECK: orps +; CHECK-NEXT: movq ; 
CHECK-NEXT: ret @@ -262,6 +263,7 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { } ; CHECK-LABEL: test21 ; CHECK: por +; CHECK-NEXT: pslldq ; CHECK-NEXT: ret diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll new file mode 100644 index 00000000000..9e6ab892713 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle.ll @@ -0,0 +1,253 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Verify that the DAGCombiner correctly folds according to the following rules: + +; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) +; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) +; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) + +; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) +; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) +; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) + + + +define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test1 +; CHECK-NOT: pshufd +; CHECK: pand +; CHECK-NEXT: pshufd +; CHECK-NEXT: ret + + +define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test2 +; CHECK-NOT: pshufd +; CHECK: por +; CHECK-NEXT: pshufd +; CHECK-NEXT: ret + + +define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test3 +; CHECK-NOT: pshufd +; CHECK: pxor +; CHECK-NEXT: 
pshufd +; CHECK-NEXT: ret + + +define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test4 +; CHECK-NOT: pshufd +; CHECK: pand +; CHECK-NEXT: pshufd +; CHECK-NEXT: ret + + +define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test5 +; CHECK-NOT: pshufd +; CHECK: por +; CHECK-NEXT: pshufd +; CHECK-NEXT: ret + + +define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test6 +; CHECK-NOT: pshufd +; CHECK: pxor +; CHECK-NEXT: pshufd +; CHECK-NEXT: ret + + +; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles +; are not performing a swizzle operation. 
+ +define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test1b +; CHECK-NOT: blendps +; CHECK: andps +; CHECK-NEXT: blendps +; CHECK-NEXT: ret + + +define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test2b +; CHECK-NOT: blendps +; CHECK: orps +; CHECK-NEXT: blendps +; CHECK-NEXT: ret + + +define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test3b +; CHECK-NOT: blendps +; CHECK: xorps +; CHECK-NEXT: xorps +; CHECK-NEXT: blendps +; CHECK-NEXT: ret + + +define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test4b +; CHECK-NOT: blendps +; CHECK: andps +; CHECK-NEXT: blendps +; CHECK: ret + + +define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test5b +; CHECK-NOT: blendps +; CHECK: orps +; CHECK-NEXT: blendps +; CHECK: ret + + +define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> 
%c, <4 x i32> %b, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test6b +; CHECK-NOT: blendps +; CHECK: xorps +; CHECK-NEXT: xorps +; CHECK-NEXT: blendps +; CHECK: ret + +define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test1c +; CHECK-NOT: shufps +; CHECK: andps +; CHECK-NEXT: shufps +; CHECK-NEXT: ret + + +define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test2c +; CHECK-NOT: shufps +; CHECK: orps +; CHECK-NEXT: shufps +; CHECK-NEXT: ret + + +define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test3c +; CHECK-NOT: shufps +; CHECK: xorps +; CHECK-NEXT: xorps +; CHECK-NEXT: shufps +; CHECK-NEXT: ret + + +define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} +; CHECK-LABEL: test4c +; CHECK-NOT: shufps +; CHECK: andps +; CHECK-NEXT: shufps +; CHECK: ret + + +define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} +; CHECK-LABEL: test5c +; CHECK-NOT: shufps +; CHECK: orps +; 
CHECK-NEXT: shufps +; CHECK: ret + + +define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} +; CHECK-LABEL: test6c +; CHECK-NOT: shufps +; CHECK: xorps +; CHECK-NEXT: xorps +; CHECK-NEXT: shufps +; CHECK: ret +