From a4554c28976480cda89d72313cc717140d11ce6c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 9 Oct 2014 21:26:35 +0000
Subject: [PATCH] Improve sqrt estimate algorithm (fast-math)

This patch changes the fast-math implementation for calculating sqrt(x) from:
y = 1 / (1 / sqrt(x))
to:
y = x * (1 / sqrt(x))

This has 2 benefits: less code / faster code and one less estimate instruction
that may lose precision.

The only target that will be affected (until http://reviews.llvm.org/D5658 is approved)
is PPC. The difference in codegen for PPC is 2 less flops for a single-precision sqrtf
or vector sqrtf and 4 less flops for a double-precision sqrt.
We also eliminate a constant load and extra register usage.

Differential Revision: http://reviews.llvm.org/D5682


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219445 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 33 ++++++++++++------------
 test/CodeGen/PowerPC/recipest.ll         | 11 ++------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 73f661d7c04..76442fbda93 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7088,26 +7088,25 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
 
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
   if (DAG.getTarget().Options.UnsafeFPMath) {
-    // Compute this as 1/(1/sqrt(X)): the reciprocal of the reciprocal sqrt.
+    // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
     if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
       AddToWorklist(RV.getNode());
-      RV = BuildReciprocalEstimate(RV);
-      if (RV.getNode()) {
-        // Unfortunately, RV is now NaN if the input was exactly 0.
-        // Select out this case and force the answer to 0.
-        EVT VT = RV.getValueType();
-      
-        SDValue Zero = DAG.getConstantFP(0.0, VT);
-        SDValue ZeroCmp =
-          DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT),
-                       N->getOperand(0), Zero, ISD::SETEQ);
-        AddToWorklist(ZeroCmp.getNode());
-        AddToWorklist(RV.getNode());
+      EVT VT = RV.getValueType();
+      RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV);
+      AddToWorklist(RV.getNode());
 
-        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,
-                         SDLoc(N), VT, ZeroCmp, Zero, RV);
-        return RV;
-      }
+      // Unfortunately, RV is now NaN if the input was exactly 0.
+      // Select out this case and force the answer to 0.
+      SDValue Zero = DAG.getConstantFP(0.0, VT);
+      SDValue ZeroCmp =
+        DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT),
+                     N->getOperand(0), Zero, ISD::SETEQ);
+      AddToWorklist(ZeroCmp.getNode());
+      AddToWorklist(RV.getNode());
+
+      RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,
+                       SDLoc(N), VT, ZeroCmp, Zero, RV);
+      return RV;
     }
   }
   return SDValue();
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index de74c043ece..2f6a3eca488 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -197,11 +197,7 @@ define double @foo3(double %a) nounwind {
 ; CHECK-NEXT: fmul
 ; CHECK-NEXT: fmadd
 ; CHECK-NEXT: fmul
-; CHECK-NEXT: fre
-; CHECK-NEXT: fnmsub
-; CHECK-NEXT: fmadd
-; CHECK-NEXT: fnmsub
-; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
 ; CHECK: blr
 
 ; CHECK-SAFE: @foo3
@@ -220,9 +216,7 @@ define float @goo3(float %a) nounwind {
 ; CHECK: fmuls
 ; CHECK-NEXT: fmadds
 ; CHECK-NEXT: fmuls
-; CHECK-NEXT: fres
-; CHECK-NEXT: fnmsubs
-; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
 ; CHECK: blr
 
 ; CHECK-SAFE: @goo3
@@ -236,7 +230,6 @@ define <4 x float> @hoo3(<4 x float> %a) nounwind {
 
 ; CHECK: @hoo3
 ; CHECK: vrsqrtefp
-; CHECK-DAG: vrefp
 ; CHECK-DAG: vcmpeqfp
 
 ; CHECK-SAFE: @hoo3
-- 
2.34.1