From: Hal Finkel <hfinkel@anl.gov>
Date: Fri, 19 Sep 2014 11:42:56 +0000 (+0000)
Subject: Optionally enable more-aggressive FMA formation in DAGCombine
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=c404e8208cf90eb12ec9f7c11ab94f8e63a9bea3;p=oota-llvm.git

Optionally enable more-aggressive FMA formation in DAGCombine

The heuristic used by DAGCombine to form FMAs checks that the FMUL has only one
use, but this is overly-conservative on some systems. Specifically, if the FMA
and the FADD have the same latency (and the FMA does not compete for resources
with the FMUL any more than the FADD does), there is no need for the
restriction, and furthermore, forming the FMA leaving the FMUL can still allow
for higher overall throughput and decreased critical-path length.

Here we add a new TLI callback, enableAggressiveFMAFusion, false by default, to
elide the hasOneUse check. This is enabled for PowerPC by default, as most
PowerPC systems will benefit.

Patch by Olivier Sallenave, thanks!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218120 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 54d8b99ad45..e6c4634079c 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -268,6 +268,13 @@ public:
     return HasFloatingPointExceptions;
   }
 
+  /// Return true if target always beneficiates from combining into FMA for a
+  /// given value type. This must typically return false on targets where FMA
+  /// takes more cycles to execute than FADD.
+  virtual bool enableAggressiveFMAFusion(EVT VT) const {
+    return false;
+  }
+
   /// Return the ValueType of the result of SETCC operations.  Also used to
   /// obtain the target's preferred type for the condition operand of SELECT and
   /// BRCOND nodes.  In the case of BRCOND the argument passed is MVT::Other
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 013fbb0b004..aa2f2d1f2b1 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6684,13 +6684,15 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
+    if (N0.getOpcode() == ISD::FMUL &&
+        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N0.getOperand(0), N0.getOperand(1), N1);
 
     // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
     // Note: Commutes FADD operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
+    if (N1.getOpcode() == ISD::FMUL &&
+        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N1.getOperand(0), N1.getOperand(1), N0);
   }
@@ -6762,14 +6764,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
+    if (N0.getOpcode() == ISD::FMUL &&
+        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, dl, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(ISD::FNEG, dl, VT, N1));
 
     // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
     // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
+    if (N1.getOpcode() == ISD::FMUL &&
+        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, dl, VT,
                          DAG.getNode(ISD::FNEG, dl, VT,
                          N1.getOperand(0)),
@@ -6778,7 +6782,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
     // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
     if (N0.getOpcode() == ISD::FNEG &&
         N0.getOperand(0).getOpcode() == ISD::FMUL &&
-        N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
+        ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
+            TLI.enableAggressiveFMAFusion(VT))) {
       SDValue N00 = N0.getOperand(0).getOperand(0);
       SDValue N01 = N0.getOperand(0).getOperand(1);
       return DAG.getNode(ISD::FMA, dl, VT,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 6eded03313d..fd188fe37e2 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -826,6 +826,11 @@ EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   return VT.changeVectorElementTypeToInteger();
 }
 
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index de8f8366f96..c53dc83fa8a 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -360,6 +360,11 @@ namespace llvm {
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
+    /// Return true if target always beneficiates from combining into FMA for a
+    /// given value type. This must typically return false on targets where FMA
+    /// takes more cycles to execute than FADD.
+    bool enableAggressiveFMAFusion(EVT VT) const override;
+
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll
index db19761b431..6bef3acf64b 100644
--- a/test/CodeGen/PowerPC/fma.ll
+++ b/test/CodeGen/PowerPC/fma.ll
@@ -1,8 +1,12 @@
 ; RUN: llc < %s -march=ppc32 -fp-contract=fast | FileCheck %s
 
+declare double @dummy1(double) #0
+declare double @dummy2(double, double) #0
+declare double @dummy3(double, double, double) #0
+
 define double @test_FMADD1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
-	%E = fadd double %D, %C		; <double> [#uses=1]
+	%E = fadd double %C, %D		; <double> [#uses=1]
 	ret double %E
 ; CHECK-LABEL: test_FMADD1:
 ; CHECK: fmadd
@@ -18,15 +22,26 @@ define double @test_FMADD2(double %A, double %B, double %C) {
 ; CHECK-NEXT: blr
 }
 
-define double @test_FMSUB(double %A, double %B, double %C) {
+define double @test_FMSUB1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
 	%E = fsub double %D, %C		; <double> [#uses=1]
 	ret double %E
-; CHECK-LABEL: test_FMSUB:
+; CHECK-LABEL: test_FMSUB1:
 ; CHECK: fmsub
 ; CHECK-NEXT: blr
 }
 
+define double @test_FMSUB2(double %A, double %B, double %C, double %D) {
+	%E = fmul double %A, %B 	; <double> [#uses=2]
+	%F = fadd double %E, %C 	; <double> [#uses=1]
+	%G = fsub double %E, %D 	; <double> [#uses=1]
+	%H = call double @dummy2(double %F, double %G)      ; <double> [#uses=1]
+	ret double %H
+; CHECK-LABEL: test_FMSUB2:
+; CHECK: fmadd
+; CHECK-NEXT: fmsub
+}
+
 define double @test_FNMADD1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
 	%E = fadd double %D, %C		; <double> [#uses=1]