From f031e8ad011e9ad95d7c965936da07e3a9c42add Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Fri, 1 Jan 2010 03:32:16 +0000 Subject: [PATCH] Teach codegen to lower llvm.powi to an efficient (but not optimal) multiply sequence when the power is a constant integer. Before, our codegen for std::pow(.., int) always turned into a libcall, which was really inefficient. This should also make many gfortran programs happier I'd imagine. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@92388 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/SelectionDAGBuilder.cpp | 62 +++++++++++++++++-- lib/Target/README.txt | 30 --------- test/CodeGen/X86/2007-09-27-LDIntrinsics.ll | 53 ++++++---------- test/CodeGen/X86/powi.ll | 11 ++++ 4 files changed, 86 insertions(+), 70 deletions(-) create mode 100644 test/CodeGen/X86/powi.ll diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7747825f48c..db127f260ff 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -462,7 +462,8 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, unsigned Order, // The number of parts is a power of 2. Repeatedly bisect the value using // EXTRACT_ELEMENT. Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl, - EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()), + EVT::getIntegerVT(*DAG.getContext(), + ValueVT.getSizeInBits()), Val); if (DisableScheduling) @@ -4261,6 +4262,59 @@ SelectionDAGBuilder::visitPow(CallInst &I) { setValue(&I, result); } + +/// ExpandPowI - Expand a llvm.powi intrinsic. +static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS, + SelectionDAG &DAG) { + // If RHS is a constant, we can expand this out to a multiplication tree, + // otherwise we end up lowering to a call to __powidf2 (for example). When + // optimizing for size, we only want to do this if the expansion would produce + // a small number of multiplies, otherwise we do the full expansion. + if (ConstantSDNode *RHSC = dyn_cast(RHS)) { + // Get the exponent as a positive value. + unsigned Val = RHSC->getSExtValue(); + if ((int)Val < 0) Val = -Val; + + // powi(x, 0) -> 1.0 + if (Val == 0) + return DAG.getConstantFP(1.0, LHS.getValueType()); + + Function *F = DAG.getMachineFunction().getFunction(); + if (!F->hasFnAttr(Attribute::OptimizeForSize) || + // If optimizing for size, don't insert too many multiplies. This + // inserts up to 5 multiplies. + CountPopulation_32(Val)+Log2_32(Val) < 7) { + // We use the simple binary decomposition method to generate the multiply + // sequence. There are more optimal ways to do this (for example, + // powi(x,15) generates one more multiply than it should), but this has + // the benefit of being both really simple and much better than a libcall. + SDValue Res; // Logically starts equal to 1.0 + SDValue CurSquare = LHS; + while (Val) { + if (Val & 1) + if (Res.getNode()) + Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare); + else + Res = CurSquare; // 1.0*CurSquare. + + CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(), + CurSquare, CurSquare); + Val >>= 1; + } + + // If the original was negative, invert the result, producing 1/(x*x*x). + if (RHSC->getSExtValue() < 0) + Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(), + DAG.getConstantFP(1.0, LHS.getValueType()), Res); + return Res; + } + } + + // Otherwise, expand to a libcall. + return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS); +} + + /// visitIntrinsicCall - Lower the call to the specified intrinsic function. If /// we want to emit this as a call to a named external function, return the name /// otherwise lower it and return null. @@ -4536,10 +4590,8 @@ SelectionDAGBuilder::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) { DAG.AssignOrdering(Res.getNode(), SDNodeOrder); return 0; case Intrinsic::powi: - Res = DAG.getNode(ISD::FPOWI, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2))); + Res = ExpandPowI(dl, getValue(I.getOperand(1)), getValue(I.getOperand(2)), + DAG); setValue(&I, Res); if (DisableScheduling) DAG.AssignOrdering(Res.getNode(), SDNodeOrder); diff --git a/lib/Target/README.txt b/lib/Target/README.txt index d6a9ea9fdb6..a6e05fadefd 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -756,36 +756,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course. //===---------------------------------------------------------------------===// -We generate a horrible libcall for llvm.powi. For example, we compile: - -#include -double f(double a) { return std::pow(a, 4); } - -into: - -__Z1fd: - subl $12, %esp - movsd 16(%esp), %xmm0 - movsd %xmm0, (%esp) - movl $4, 8(%esp) - call L___powidf2$stub - addl $12, %esp - ret - -GCC produces: - -__Z1fd: - subl $12, %esp - movsd 16(%esp), %xmm0 - mulsd %xmm0, %xmm0 - mulsd %xmm0, %xmm0 - movsd %xmm0, (%esp) - fldl (%esp) - addl $12, %esp - ret - -//===---------------------------------------------------------------------===// - We compile this program: (from GCC PR11680) http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487 diff --git a/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll b/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll index 4a56ee446a0..4d6971586c2 100644 --- a/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll +++ b/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll @@ -1,47 +1,30 @@ -; RUN: llc < %s | grep powixf2 -; RUN: llc < %s | grep fsqrt -; ModuleID = 'yyy.c' +; RUN: llc < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin8" -define x86_fp80 @foo(x86_fp80 %x) { +define x86_fp80 @foo(x86_fp80 %x) nounwind{ entry: - %x_addr = alloca x86_fp80 ; [#uses=2] - %retval = alloca x86_fp80 ; [#uses=2] - %tmp = alloca x86_fp80 ; [#uses=2] - %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] - store x86_fp80 %x, x86_fp80* %x_addr - %tmp1 = load x86_fp80* %x_addr, align 16 ; [#uses=1] - %tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %tmp1 ) ; [#uses=1] - store x86_fp80 %tmp2, x86_fp80* %tmp, align 16 - %tmp3 = load x86_fp80* %tmp, align 16 ; [#uses=1] - store x86_fp80 %tmp3, x86_fp80* %retval, align 16 - br label %return - -return: ; preds = %entry - %retval4 = load x86_fp80* %retval ; [#uses=1] - ret x86_fp80 %retval4 + %tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %x ) + ret x86_fp80 %tmp2 + +; CHECK: foo: +; CHECK: fldt 4(%esp) +; CHECK-NEXT: fsqrt +; CHECK-NEXT: ret } declare x86_fp80 @llvm.sqrt.f80(x86_fp80) -define x86_fp80 @bar(x86_fp80 %x) { +define x86_fp80 @bar(x86_fp80 %x) nounwind { entry: - %x_addr = alloca x86_fp80 ; [#uses=2] - %retval = alloca x86_fp80 ; [#uses=2] - %tmp = alloca x86_fp80 ; [#uses=2] - %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] - store x86_fp80 %x, x86_fp80* %x_addr - %tmp1 = load x86_fp80* %x_addr, align 16 ; [#uses=1] - %tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %tmp1, i32 3 ) ; [#uses=1] - store x86_fp80 %tmp2, x86_fp80* %tmp, align 16 - %tmp3 = load x86_fp80* %tmp, align 16 ; [#uses=1] - store x86_fp80 %tmp3, x86_fp80* %retval, align 16 - br label %return - -return: ; preds = %entry - %retval4 = load x86_fp80* %retval ; [#uses=1] - ret x86_fp80 %retval4 + %tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %x, i32 3 ) + ret x86_fp80 %tmp2 +; CHECK: bar: +; CHECK: fldt 4(%esp) +; CHECK-NEXT: fld %st(0) +; CHECK-NEXT: fmul %st(1) +; CHECK-NEXT: fmulp %st(1) +; CHECK-NEXT: ret } declare x86_fp80 @llvm.powi.f80(x86_fp80, i32) diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll new file mode 100644 index 00000000000..c3d68312ce1 --- /dev/null +++ b/test/CodeGen/X86/powi.ll @@ -0,0 +1,11 @@ +; RUN: llc %s -march=x86 -mcpu=yonah -o - | grep mulsd | count 6 +; Ideally this would compile to 5 multiplies. + +define double @_Z3f10d(double %a) nounwind readonly ssp noredzone { +entry: + %0 = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] + ret double %0 +} + +declare double @llvm.powi.f64(double, i32) nounwind readonly + -- 2.34.1