From: Sanjay Patel Date: Fri, 24 Oct 2014 17:02:16 +0000 (+0000) Subject: Use rsqrt (X86) to speed up reciprocal square root calcs X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=a46f06efe217b3c3328d9ececf8de0c8c7dee446;ds=sidebyside Use rsqrt (X86) to speed up reciprocal square root calcs This is a first step for generating SSE rsqrt instructions for reciprocal square root calcs when fast-math is allowed. For now, be conservative and only enable this for AMD btver2 where performance improves significantly - for example, 29% on llvm/projects/test-suite/SingleSource/Benchmarks/BenchmarkGame/n-body.c (if we convert the data type to single-precision float). This patch adds a two constant version of the Newton-Raphson refinement algorithm to DAGCombiner that can be selected by any target via a parameter returned by getRsqrtEstimate().. See PR20900 for more details: http://llvm.org/bugs/show_bug.cgi?id=20900 Differential Revision: http://reviews.llvm.org/D5658 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220570 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index c6eac43de95..95b050ac7b8 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2652,13 +2652,16 @@ public: /// The RefinementSteps output is the number of Newton-Raphson refinement /// iterations required to generate a sufficient (though not necessarily /// IEEE-754 compliant) estimate for the value type. + /// The boolean UseOneConstNR output is used to select a Newton-Raphson + /// algorithm implementation that uses one constant or two constants. /// A target may choose to implement its own refinement within this function. /// If that's true, then return '0' as the number of RefinementSteps to avoid /// any further refinement of the estimate. /// An empty SDValue return means no estimate sequence can be created. virtual SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { + unsigned &RefinementSteps, + bool &UseOneConstNR) const { return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6fe4ade4aa8..4ad2e5dd492 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -331,6 +331,8 @@ namespace { SDValue BuildUDIV(SDNode *N); SDValue BuildReciprocalEstimate(SDValue Op); SDValue BuildRsqrtEstimate(SDValue Op); + SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations); + SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -7033,13 +7035,11 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { - AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { - AddToWorklist(RV.getNode()); RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); @@ -7047,7 +7047,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { - AddToWorklist(RV.getNode()); RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); @@ -7068,7 +7067,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) { - AddToWorklist(RV.getNode()); RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); @@ -7116,7 +7114,6 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { if (DAG.getTarget().Options.UnsafeFPMath) { // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { - AddToWorklist(RV.getNode()); EVT VT = RV.getValueType(); RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV); AddToWorklist(RV.getNode()); @@ -11985,49 +11982,89 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { return SDValue(); } -SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { - if (Level >= AfterLegalizeDAG) - return SDValue(); +/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) +/// For the reciprocal sqrt, we need to find the zero of the function: +/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] +/// => +/// X_{i+1} = X_i (1.5 - A X_i^2 / 2) +/// As a result, we precompute A/2 prior to the iteration loop. +SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, + unsigned Iterations) { + EVT VT = Arg.getValueType(); + SDLoc DL(Arg); + SDValue ThreeHalves = DAG.getConstantFP(1.5, VT); + + // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that + // this entire sequence requires only one FP constant. + SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg); + AddToWorklist(HalfArg.getNode()); + + HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg); + AddToWorklist(HalfArg.getNode()); + + // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + AddToWorklist(Est.getNode()); + } + return Est; +} - // Expose the DAG combiner to the target combiner implementations. - TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); - unsigned Iterations = 0; - if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations)) { - if (Iterations) { - // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) - // For the reciprocal sqrt, we need to find the zero of the function: - // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] - // => - // X_{i+1} = X_i (1.5 - A X_i^2 / 2) - // As a result, we precompute A/2 prior to the iteration loop. - EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue FPThreeHalves = DAG.getConstantFP(1.5, VT); +/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) +/// For the reciprocal sqrt, we need to find the zero of the function: +/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] +/// => +/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) +SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, + unsigned Iterations) { + EVT VT = Arg.getValueType(); + SDLoc DL(Arg); + SDValue MinusThree = DAG.getConstantFP(-3.0, VT); + SDValue MinusHalf = DAG.getConstantFP(-0.5, VT); - AddToWorklist(Est.getNode()); + // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); + AddToWorklist(HalfEst.getNode()); - // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that - // this entire sequence requires only one FP constant. - SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, FPThreeHalves, Op); - AddToWorklist(HalfArg.getNode()); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + AddToWorklist(Est.getNode()); - HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Op); - AddToWorklist(HalfArg.getNode()); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg); + AddToWorklist(Est.getNode()); - // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) - for (unsigned i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); - AddToWorklist(NewEst.getNode()); + Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); + AddToWorklist(Est.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); - AddToWorklist(NewEst.getNode()); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); + AddToWorklist(Est.getNode()); + } + return Est; +} - NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPThreeHalves, NewEst); - AddToWorklist(NewEst.getNode()); +SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { + if (Level >= AfterLegalizeDAG) + return SDValue(); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); - AddToWorklist(Est.getNode()); - } + // Expose the DAG combiner to the target combiner implementations. + TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); + unsigned Iterations = 0; + bool UseOneConstNR = false; + if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) { + AddToWorklist(Est.getNode()); + if (Iterations) { + Est = UseOneConstNR ? + BuildRsqrtNROneConst(Op, Est, Iterations) : + BuildRsqrtNRTwoConst(Op, Est, Iterations); } return Est; } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index ef357f4b7f2..72d3a59192b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -7466,7 +7466,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { + unsigned &RefinementSteps, + bool &UseOneConstNR) const { EVT VT = Operand.getValueType(); if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || @@ -7479,6 +7480,7 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++RefinementSteps; + UseOneConstNR = true; return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } return SDValue(); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 39f5987b9cb..7ae3673ecbf 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -702,7 +702,8 @@ namespace llvm { SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index b635f438650..5c88b5df443 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -182,6 +182,8 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -347,7 +349,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, + FeatureUseSqrtEst]>; // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index dbe3c4aee1c..b3541545403 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -14367,6 +14367,36 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX (v8f32) and AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isAllOnesValue(); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e8e611d0719..e81a9d1209c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1017,6 +1017,11 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; }; namespace X86 { diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 413fc4b94db..82d62b4a365 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -278,6 +278,7 @@ void X86Subtarget::initializeEnvironment() { LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 24ee5538347..cdecf2a9cb2 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -192,6 +192,11 @@ protected: /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -369,6 +374,7 @@ public: bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index fc79e31e72e..65f719e4e17 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ b/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from ; #include @@ -52,9 +53,59 @@ entry: ret x86_fp80 %call } -; Function Attrs: nounwind readnone declare x86_fp80 @__sqrtl_finite(x86_fp80) #1 +; If the target's sqrtss and divss instructions are substantially +; slower than rsqrtss with a Newton-Raphson refinement, we should +; generate the estimate sequence. +define float @reciprocal_square_root(float %x) #0 { + %sqrt = tail call float @llvm.sqrt.f32(float %x) + %div = fdiv fast float 1.0, %sqrt + ret float %div + +; CHECK-LABEL: reciprocal_square_root: +; CHECK: sqrtss +; CHECK-NEXT: movss +; CHECK-NEXT: divss +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root: +; BTVER2: vrsqrtss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vaddss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: retq +} + +declare float @llvm.sqrt.f32(float) #1 + +; If the target's sqrtps and divps instructions are substantially +; slower than rsqrtps with a Newton-Raphson refinement, we should +; generate the estimate sequence. +define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { + %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) + %div = fdiv fast <4 x float> , %sqrt + ret <4 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v4f32: +; CHECK: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v4f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1 + + attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind readnone }