From: Olivier Sallenave Date: Thu, 9 Apr 2015 17:55:26 +0000 (+0000) Subject: Refactoring and enhancement to FMA combine. X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=ef67194fd21d6ec7767fdbe2c7774533f962f976 Refactoring and enhancement to FMA combine. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@234513 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3902d4ec969..f8f52339e8c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -308,6 +308,9 @@ namespace { SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); + SDValue visitFADDForFMACombine(SDNode *N); + SDValue visitFSUBForFMACombine(SDNode *N); + SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); @@ -7057,20 +7060,40 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } -// Attempt different variants of (fadd (fmul a, b), c) -> fma or fmad -static SDValue performFaddFmulCombines(unsigned FusedOpcode, - bool Aggressive, - SDNode *N, - const TargetLowering &TLI, - SelectionDAG &DAG) { +/// Try to perform FMA combining on a given FADD node. +SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc SL(N); + + const TargetOptions &Options = DAG.getTarget().Options; + bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath); + + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && + TLI.isOperationLegal(ISD::FMAD, VT)); + + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = ((!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && + UnsafeFPMath); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return SDValue(); + + // Always prefer FMAD to FMA for precision. + unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + bool LookThroughFPExt = TLI.isFPExtFree(VT); // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(FusedOpcode, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } @@ -7078,53 +7101,176 @@ static SDValue performFaddFmulCombines(unsigned FusedOpcode, // Note: Commutes FADD operands. if (N1.getOpcode() == ISD::FMUL && (Aggressive || N1->hasOneUse())) { - return DAG.getNode(FusedOpcode, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } + // Look through FP_EXTEND nodes to do more combining. + if (UnsafeFPMath && LookThroughFPExt) { + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), N1); + } + + // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), N0); + } + } + // More folding opportunities when target permits. - if (Aggressive) { + if (UnsafeFPMath && Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) - if (N0.getOpcode() == ISD::FMA && + if (N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL) { - return DAG.getNode(FusedOpcode, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode(FusedOpcode, SDLoc(N), VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), N1)); } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) - if (N1->getOpcode() == ISD::FMA && + if (N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL) { - return DAG.getNode(FusedOpcode, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), - DAG.getNode(FusedOpcode, SDLoc(N), VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(2).getOperand(0), N1.getOperand(2).getOperand(1), N0)); } + + if (LookThroughFPExt) { + // fold (fadd (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + auto FoldFAddFMAFPExtFMul = [&] ( + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z)); + }; + if (N0.getOpcode() == PreferredFusedOpcode) { + SDValue N02 = N0.getOperand(2); + if (N02.getOpcode() == ISD::FP_EXTEND) { + SDValue N020 = N02.getOperand(0); + if (N020.getOpcode() == ISD::FMUL) + return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), + N020.getOperand(0), N020.getOperand(1), + N1); + } + } + + // fold (fadd (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + auto FoldFAddFPExtFMAFMul = [&] ( + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, X), + DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z)); + }; + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == PreferredFusedOpcode) { + SDValue N002 = N00.getOperand(2); + if (N002.getOpcode() == ISD::FMUL) + return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), + N002.getOperand(0), N002.getOperand(1), + N1); + } + } + + // fold (fadd x, (fma y, z, (fpext (fmul u, v))) + // -> (fma y, z, (fma (fpext u), (fpext v), x)) + if (N1.getOpcode() == PreferredFusedOpcode) { + SDValue N12 = N1.getOperand(2); + if (N12.getOpcode() == ISD::FP_EXTEND) { + SDValue N120 = N12.getOperand(0); + if (N120.getOpcode() == ISD::FMUL) + return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), + N120.getOperand(0), N120.getOperand(1), + N0); + } + } + + // fold (fadd x, (fpext (fma y, z, (fmul u, v))) + // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == PreferredFusedOpcode) { + SDValue N102 = N10.getOperand(2); + if (N102.getOpcode() == ISD::FMUL) + return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), + N102.getOperand(0), N102.getOperand(1), + N0); + } + } + } } return SDValue(); } -static SDValue performFsubFmulCombines(unsigned FusedOpcode, - bool Aggressive, - SDNode *N, - const TargetLowering &TLI, - SelectionDAG &DAG) { +/// Try to perform FMA combining on a given FSUB node. +SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - SDLoc SL(N); + const TargetOptions &Options = DAG.getTarget().Options; + bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath); + + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && + TLI.isOperationLegal(ISD::FMAD, VT)); + + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = ((!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && + UnsafeFPMath); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return SDValue(); + + // Always prefer FMAD to FMA for precision. + unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + bool LookThroughFPExt = TLI.isFPExtFree(VT); + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (N0.getOpcode() == ISD::FMUL && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(FusedOpcode, SL, VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1)); } @@ -7133,7 +7279,7 @@ static SDValue performFsubFmulCombines(unsigned FusedOpcode, // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FMUL && (Aggressive || N1->hasOneUse())) - return DAG.getNode(FusedOpcode, SL, VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); @@ -7144,41 +7290,213 @@ static SDValue performFsubFmulCombines(unsigned FusedOpcode, (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); - return DAG.getNode(FusedOpcode, SL, VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N00), N01, DAG.getNode(ISD::FNEG, SL, VT, N1)); } + // Look through FP_EXTEND nodes to do more combining. + if (UnsafeFPMath && LookThroughFPExt) { + // fold (fsub (fpext (fmul x, y)), z) + // -> (fma (fpext x), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1)); + } + + // fold (fsub x, (fpext (fmul y, z))) + // -> (fma (fneg (fpext y)), (fpext z), x) + // Note: Commutes FSUB operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), + N0); + } + + // fold (fsub (fpext (fneg (fmul, x, y))), z) + // -> (fneg (fma (fpext x), (fpext y), z)) + // Note: This could be removed with appropriate canonicalization of the + // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the + // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // from implementing the canonicalization in visitFSUB. + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FNEG) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(1)), + N1)); + } + } + } + + // fold (fsub (fneg (fpext (fmul, x, y))), z) + // -> (fneg (fma (fpext x)), (fpext y), z) + // Note: This could be removed with appropriate canonicalization of the + // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the + // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // from implementing the canonicalization in visitFSUB. + if (N0.getOpcode() == ISD::FNEG) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FP_EXTEND) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(1)), + N1)); + } + } + } + + } + // More folding opportunities when target permits. - if (Aggressive) { + if (UnsafeFPMath && Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) - if (N0.getOpcode() == FusedOpcode && + if (N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL) { - return DAG.getNode(FusedOpcode, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode(FusedOpcode, SDLoc(N), VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SL, VT, N1))); } // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) - if (N1.getOpcode() == FusedOpcode && + if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); - return DAG.getNode(FusedOpcode, SDLoc(N), VT, - DAG.getNode(ISD::FNEG, SDLoc(N), VT, + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), - DAG.getNode(FusedOpcode, SDLoc(N), VT, - DAG.getNode(ISD::FNEG, SDLoc(N), VT, - N20), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); } + + if (LookThroughFPExt) { + // fold (fsub (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) + if (N0.getOpcode() == PreferredFusedOpcode) { + SDValue N02 = N0.getOperand(2); + if (N02.getOpcode() == ISD::FP_EXTEND) { + SDValue N020 = N02.getOperand(0); + if (N020.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N020.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N020.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1))); + } + } + + // fold (fsub (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), + // (fma (fpext u), (fpext v), (fneg z))) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == PreferredFusedOpcode) { + SDValue N002 = N00.getOperand(2); + if (N002.getOpcode() == ISD::FMUL) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N002.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N002.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1))); + } + } + + // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) + // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) + if (N1.getOpcode() == PreferredFusedOpcode && + N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { + SDValue N120 = N1.getOperand(2).getOperand(0); + if (N120.getOpcode() == ISD::FMUL) { + SDValue N1200 = N120.getOperand(0); + SDValue N1201 = N120.getOperand(1); + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), + N1.getOperand(1), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1200)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N1201), + N0)); + } + } + + // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) + // -> (fma (fneg (fpext y)), (fpext z), + // (fma (fneg (fpext u)), (fpext v), x)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (N1.getOpcode() == ISD::FP_EXTEND && + N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { + SDValue N100 = N1.getOperand(0).getOperand(0); + SDValue N101 = N1.getOperand(0).getOperand(1); + SDValue N102 = N1.getOperand(0).getOperand(2); + if (N102.getOpcode() == ISD::FMUL) { + SDValue N1020 = N102.getOperand(0); + SDValue N1021 = N102.getOperand(1); + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N100)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1020)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N1021), + N0)); + } + } + } } return SDValue(); @@ -7322,55 +7640,11 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } } // enable-unsafe-fp-math - if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) { - // Assume if there is an fmad instruction that it should be aggressively - // used. - if (SDValue Fused = performFaddFmulCombines(ISD::FMAD, true, N, TLI, DAG)) - return Fused; - } - // FADD -> FMA combines: - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { - - if (!TLI.isOperationLegal(ISD::FMAD, VT)) { - // Don't form FMA if we are preferring FMAD. - if (SDValue Fused - = performFaddFmulCombines(ISD::FMA, - TLI.enableAggressiveFMAFusion(VT), - N, TLI, DAG)) { - return Fused; - } - } - - // When FP_EXTEND nodes are free on the target, and there is an opportunity - // to combine into FMA, arrange such nodes accordingly. - if (TLI.isFPExtFree(VT)) { - - // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N00.getOperand(1)), N1); - } - - // fold (fadd x, (fpext (fmul y, z)), z) -> (fma (fpext y), (fpext z), x) - // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FP_EXTEND) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N10.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N10.getOperand(1)), N0); - } - } + SDValue Fused = visitFADDForFMACombine(N); + if (Fused) { + AddToWorklist(Fused.getNode()); + return Fused; } return SDValue(); @@ -7431,96 +7705,11 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } - if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) { - // Assume if there is an fmad instruction that it should be aggressively - // used. - if (SDValue Fused = performFsubFmulCombines(ISD::FMAD, true, N, TLI, DAG)) - return Fused; - } - // FSUB -> FMA combines: - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { - - if (!TLI.isOperationLegal(ISD::FMAD, VT)) { - // Don't form FMA if we are preferring FMAD. - - if (SDValue Fused - = performFsubFmulCombines(ISD::FMA, - TLI.enableAggressiveFMAFusion(VT), - N, TLI, DAG)) { - return Fused; - } - } - - // When FP_EXTEND nodes are free on the target, and there is an opportunity - // to combine into FMA, arrange such nodes accordingly. - if (TLI.isFPExtFree(VT)) { - // fold (fsub (fpext (fmul x, y)), z) - // -> (fma (fpext x), (fpext y), (fneg z)) - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1)); - } - - // fold (fsub x, (fpext (fmul y, z))) - // -> (fma (fneg (fpext y)), (fpext z), x) - // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FP_EXTEND) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - DAG.getNode(ISD::FNEG, SDLoc(N), VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), - VT, N10.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N10.getOperand(1)), - N0); - } - - // fold (fsub (fpext (fneg (fmul, x, y))), z) - // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FNEG) { - SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { - return DAG.getNode(ISD::FMA, dl, VT, - DAG.getNode(ISD::FNEG, dl, VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), - VT, N000.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N000.getOperand(1)), - DAG.getNode(ISD::FNEG, dl, VT, N1)); - } - } - } - - // fold (fsub (fneg (fpext (fmul, x, y))), z) - // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) - if (N0.getOpcode() == ISD::FNEG) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FP_EXTEND) { - SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { - return DAG.getNode(ISD::FMA, dl, VT, - DAG.getNode(ISD::FNEG, dl, VT, - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), - VT, N000.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, - N000.getOperand(1)), - DAG.getNode(ISD::FNEG, dl, VT, N1)); - } - } - } - } + SDValue Fused = visitFSUBForFMACombine(N); + if (Fused) { + AddToWorklist(Fused.getNode()); + return Fused; } return SDValue(); diff --git a/test/CodeGen/PowerPC/fma-assoc.ll b/test/CodeGen/PowerPC/fma-assoc.ll index dc1316e5e24..363d7b7ce48 100644 --- a/test/CodeGen/PowerPC/fma-assoc.ll +++ b/test/CodeGen/PowerPC/fma-assoc.ll @@ -77,3 +77,159 @@ define double @test_FMSUB_ASSOC2(double %A, double %B, double %C, ; CHECK-VSX-NEXT: blr } +define double @test_FMADD_ASSOC_EXT1(float %A, float %B, double %C, + double %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul double %C, %D ; [#uses=1] + %I = fadd double %H, %G ; [#uses=1] + %J = fadd double %I, %E ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK: fmadd +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK-VSX: xsmaddmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: blr +} + +define double @test_FMADD_ASSOC_EXT2(float %A, float %B, float %C, + float %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fmul float %C, %D ; [#uses=1] + %H = fadd float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd double %I, %E ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK: fmadd +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK-VSX: xsmaddmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: fmr +; CHECK-VSX-NEXT: blr +} + +define double @test_FMADD_ASSOC_EXT3(float %A, float %B, double %C, + double %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul double %C, %D ; [#uses=1] + %I = fadd double %H, %G ; [#uses=1] + %J = fadd double %E, %I ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK: fmadd +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK-VSX: xsmaddmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: blr +} + +define double @test_FMADD_ASSOC_EXT4(float %A, float %B, float %C, + float %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fmul float %C, %D ; [#uses=1] + %H = fadd float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd double %E, %I ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK: fmadd +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK-VSX: xsmaddmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: fmr +; CHECK-VSX-NEXT: blr +} + +define double @test_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, + double %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul double %C, %D ; [#uses=1] + %I = fadd double %H, %G ; [#uses=1] + %J = fsub double %I, %E ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK: fmsub +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK-VSX: xsmsubmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: blr +} + +define double @test_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, + float %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fmul float %C, %D ; [#uses=1] + %H = fadd float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub double %I, %E ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK: fmsub +; CHECK-NEXT: fmadd +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK-VSX: xsmsubmdp +; CHECK-VSX-NEXT: xsmaddadp +; CHECK-VSX-NEXT: fmr +; CHECK-VSX-NEXT: blr +} + +define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, + double %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul double %C, %D ; [#uses=1] + %I = fadd double %H, %G ; [#uses=1] + %J = fsub double %E, %I ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK: fnmsub +; CHECK-NEXT: fnmsub +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK-VSX: xsnmsubmdp +; CHECK-VSX-NEXT: xsnmsubadp +; CHECK-VSX-NEXT: fmr +; CHECK-VSX-NEXT: blr +} + +define double @test_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, + float %D, double %E) { + %F = fmul float %A, %B ; [#uses=1] + %G = fmul float %C, %D ; [#uses=1] + %H = fadd float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub double %E, %I ; [#uses=1] + ret double %J +; CHECK-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK: fnmsub +; CHECK-NEXT: fnmsub +; CHECK-NEXT: blr + +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK-VSX: xsnmsubmdp +; CHECK-VSX-NEXT: xsnmsubadp +; CHECK-VSX-NEXT: fmr +; CHECK-VSX-NEXT: blr +} \ No newline at end of file diff --git a/test/CodeGen/PowerPC/fma-ext.ll b/test/CodeGen/PowerPC/fma-ext.ll index 56825ce8f22..9ab32a659cb 100644 --- a/test/CodeGen/PowerPC/fma-ext.ll +++ b/test/CodeGen/PowerPC/fma-ext.ll @@ -65,13 +65,11 @@ define double @test_FMSUB_EXT3(float %A, float %B, double %C) { %G = fsub double %F, %C ; [#uses=1] ret double %G ; CHECK-LABEL: test_FMSUB_EXT3: -; CHECK: fneg -; CHECK-NEXT: fmsub +; CHECK: fnmadd ; CHECK-NEXT: blr ; CHECK-VSX-LABEL: test_FMSUB_EXT3: -; CHECK-VSX: xsnegdp -; CHECK-VSX-NEXT: xsmsubmdp +; CHECK-VSX: xsnmaddmdp ; CHECK-VSX-NEXT: blr } @@ -82,12 +80,10 @@ define double @test_FMSUB_EXT4(float %A, float %B, double %C) { %G = fsub double %F, %C ; [#uses=1] ret double %G ; CHECK-LABEL: test_FMSUB_EXT4: -; CHECK: fneg -; CHECK-NEXT: fmsub +; CHECK: fnmadd ; CHECK-NEXT: blr ; CHECK-VSX-LABEL: test_FMSUB_EXT4: -; CHECK-VSX: xsnegdp -; CHECK-VSX-NEXT: xsmsubmdp +; CHECK-VSX: xsnmaddmdp ; CHECK-VSX-NEXT: blr -} \ No newline at end of file +}