When considering whether to inline Callee into Caller,

author Dale Johannesen <dalej@apple.com>

Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)

committer Dale Johannesen <dalej@apple.com>

Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)
author Dale Johannesen <dalej@apple.com>
Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)
committer Dale Johannesen <dalej@apple.com>
Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp

index c38cf82a692b239b2a4c4ef60dc249dd0082ea39..7b512d60d97f8cc48487625cdeac723de51a5d7e 100644 (file)
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -189,9 +189,9 @@ bool Inliner::shouldInline(CallSite CS) {
    
    int Cost = IC.getValue();
    int CurrentThreshold = InlineThreshold;
-  Function *Fn = CS.getCaller();
-  if (Fn && !Fn->isDeclaration() &&
-      Fn->hasFnAttr(Attribute::OptimizeForSize) &&
+  Function *Caller = CS.getCaller();
+  if (Caller && !Caller->isDeclaration() &&
+      Caller->hasFnAttr(Attribute::OptimizeForSize) &&
        InlineLimit.getNumOccurrences() == 0 &&
        InlineThreshold != 50)
      CurrentThreshold = 50;
@@ -203,8 +203,72 @@ bool Inliner::shouldInline(CallSite CS) {
      return false;
    }
    
+  // Try to detect the case where the current inlining candidate caller
+  // (call it B) is a static function and is an inlining candidate elsewhere,
+  // and the current candidate callee (call it C) is large enough that
+  // inlining it into B would make B too big to inline later.  In these
+  // circumstances it may be best not to inline C into B, but to inline B
+  // into its callers.
+  if (Caller->hasLocalLinkage()) {
+    int TotalSecondaryCost = 0;
+    bool outerCallsFound = false;
+    bool allOuterCallsWillBeInlined = true;
+    bool someOuterCallWouldNotBeInlined = false;
+    for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); 
+         I != E; ++I) {
+      CallSite CS2 = CallSite::get(*I);
+
+      // If this isn't a call to Caller (it could be some other sort
+      // of reference) skip it.
+      if (CS2.getInstruction() == 0 || CS2.getCalledFunction() != Caller)
+        continue;
+
+      InlineCost IC2 = getInlineCost(CS2);
+      if (IC2.isNever())
+        allOuterCallsWillBeInlined = false;
+      if (IC2.isAlways() || IC2.isNever())
+        continue;
+
+      outerCallsFound = true;
+      int Cost2 = IC2.getValue();
+      int CurrentThreshold2 = InlineThreshold;
+      Function *Caller2 = CS2.getCaller();
+      if (Caller2 && !Caller2->isDeclaration() &&
+          Caller2->hasFnAttr(Attribute::OptimizeForSize) &&
+          InlineThreshold != 50)
+        CurrentThreshold2 = 50;
+
+      float FudgeFactor2 = getInlineFudgeFactor(CS2);
+
+      if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2))
+        allOuterCallsWillBeInlined = false;
+
+      // See if we have this case.  The magic 6 is what InlineCost assigns
+      // for the call instruction, which we would be deleting.
+      if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) &&
+          Cost2 + Cost - 6 >= (int)(CurrentThreshold2 * FudgeFactor2)) {
+        someOuterCallWouldNotBeInlined = true;
+        TotalSecondaryCost += Cost2;
+      }
+    }
+    // If all outer calls to Caller would get inlined, the cost for the last
+    // one is set very low by getInlineCost, in anticipation that Caller will
+    // be removed entirely.  We did not account for this above unless there
+    // is only one caller of Caller.
+    if (allOuterCallsWillBeInlined && Caller->use_begin() != Caller->use_end())
+      TotalSecondaryCost -= 15000;
+
+    if (outerCallsFound && someOuterCallWouldNotBeInlined && 
+        TotalSecondaryCost < Cost) {
+      DEBUG(errs() << "    NOT Inlining: " << *CS.getInstruction() << 
+           " Cost = " << Cost << 
+           ", outer Cost = " << TotalSecondaryCost << '\n');
+      return false;
+    }
+  }
+
    DEBUG(errs() << "    Inlining: cost=" << Cost
-        << ", Call: " << *CS.getInstruction() << "\n");
+        << ", Call: " << *CS.getInstruction() << '\n');
    return true;
  }
  
@@ -232,7 +296,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
          CallSite CS = CallSite::get(I);
-        // If this this isn't a call, or it is a call to an intrinsic, it can
+        // If this isn't a call, or it is a call to an intrinsic, it can
          // never be inlined.
          if (CS.getInstruction() == 0 || isa<IntrinsicInst>(I))
            continue;
@@ -279,7 +343,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
        // try to do so.
        if (!shouldInline(CS))
          continue;
-      
+
        Function *Caller = CS.getCaller();
        // Attempt to inline the function...
        if (!InlineCallIfPossible(CS, CG, TD, InlinedArrayAllocas))
diff --git a/test/Transforms/Inline/nested-inline.ll b/test/Transforms/Inline/nested-inline.ll

new file mode 100644 (file)

index 0000000..1292667
--- /dev/null
+++ b/test/Transforms/Inline/nested-inline.ll
@@ -0,0 +1,111 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; Test that bar and bar2 are both inlined throughout and removed.
+@A = weak global i32 0         ; <i32*> [#uses=1]
+@B = weak global i32 0         ; <i32*> [#uses=1]
+@C = weak global i32 0         ; <i32*> [#uses=1]
+
+define fastcc void @foo(i32 %X) {
+entry:
+; CHECK: @foo
+       %ALL = alloca i32, align 4              ; <i32*> [#uses=1]
+       %tmp1 = and i32 %X, 1           ; <i32> [#uses=1]
+       %tmp1.upgrd.1 = icmp eq i32 %tmp1, 0            ; <i1> [#uses=1]
+       br i1 %tmp1.upgrd.1, label %cond_next, label %cond_true
+
+cond_true:             ; preds = %entry
+       store i32 1, i32* @A
+       br label %cond_next
+
+cond_next:             ; preds = %cond_true, %entry
+       %tmp4 = and i32 %X, 2           ; <i32> [#uses=1]
+       %tmp4.upgrd.2 = icmp eq i32 %tmp4, 0            ; <i1> [#uses=1]
+       br i1 %tmp4.upgrd.2, label %cond_next7, label %cond_true5
+
+cond_true5:            ; preds = %cond_next
+       store i32 1, i32* @B
+       br label %cond_next7
+
+cond_next7:            ; preds = %cond_true5, %cond_next
+       %tmp10 = and i32 %X, 4          ; <i32> [#uses=1]
+       %tmp10.upgrd.3 = icmp eq i32 %tmp10, 0          ; <i1> [#uses=1]
+       br i1 %tmp10.upgrd.3, label %cond_next13, label %cond_true11
+
+cond_true11:           ; preds = %cond_next7
+       store i32 1, i32* @C
+       br label %cond_next13
+
+cond_next13:           ; preds = %cond_true11, %cond_next7
+       %tmp16 = and i32 %X, 8          ; <i32> [#uses=1]
+       %tmp16.upgrd.4 = icmp eq i32 %tmp16, 0          ; <i1> [#uses=1]
+       br i1 %tmp16.upgrd.4, label %UnifiedReturnBlock, label %cond_true17
+
+cond_true17:           ; preds = %cond_next13
+       call void @ext( i32* %ALL )
+       ret void
+
+UnifiedReturnBlock:            ; preds = %cond_next13
+       ret void
+}
+
+; CHECK-NOT: @bar
+define internal fastcc void @bar(i32 %X) {
+entry:
+       %ALL = alloca i32, align 4              ; <i32*> [#uses=1]
+       %tmp1 = and i32 %X, 1           ; <i32> [#uses=1]
+       %tmp1.upgrd.1 = icmp eq i32 %tmp1, 0            ; <i1> [#uses=1]
+       br i1 %tmp1.upgrd.1, label %cond_next, label %cond_true
+
+cond_true:             ; preds = %entry
+       store i32 1, i32* @A
+       br label %cond_next
+
+cond_next:             ; preds = %cond_true, %entry
+       %tmp4 = and i32 %X, 2           ; <i32> [#uses=1]
+       %tmp4.upgrd.2 = icmp eq i32 %tmp4, 0            ; <i1> [#uses=1]
+       br i1 %tmp4.upgrd.2, label %cond_next7, label %cond_true5
+
+cond_true5:            ; preds = %cond_next
+       store i32 1, i32* @B
+       br label %cond_next7
+
+cond_next7:            ; preds = %cond_true5, %cond_next
+       %tmp10 = and i32 %X, 4          ; <i32> [#uses=1]
+       %tmp10.upgrd.3 = icmp eq i32 %tmp10, 0          ; <i1> [#uses=1]
+       br i1 %tmp10.upgrd.3, label %cond_next13, label %cond_true11
+
+cond_true11:           ; preds = %cond_next7
+       store i32 1, i32* @C
+       br label %cond_next13
+
+cond_next13:           ; preds = %cond_true11, %cond_next7
+       %tmp16 = and i32 %X, 8          ; <i32> [#uses=1]
+       %tmp16.upgrd.4 = icmp eq i32 %tmp16, 0          ; <i1> [#uses=1]
+       br i1 %tmp16.upgrd.4, label %UnifiedReturnBlock, label %cond_true17
+
+cond_true17:           ; preds = %cond_next13
+       call void @foo( i32 %X )
+       ret void
+
+UnifiedReturnBlock:            ; preds = %cond_next13
+       ret void
+}
+
+define internal fastcc void @bar2(i32 %X) {
+entry:
+       call void @foo( i32 %X )
+       ret void
+}
+
+declare void @ext(i32*)
+
+define void @test(i32 %X) {
+entry:
+; CHECK: test
+; CHECK-NOT: @bar
+       tail call fastcc void @bar( i32 %X )
+       tail call fastcc void @bar( i32 %X )
+       tail call fastcc void @bar2( i32 %X )
+       tail call fastcc void @bar2( i32 %X )
+       ret void
+; CHECK: ret
+}
author	Dale Johannesen <dalej@apple.com>
	Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)
committer	Dale Johannesen <dalej@apple.com>
	Fri, 9 Oct 2009 00:11:32 +0000 (00:11 +0000)
lib/Transforms/IPO/Inliner.cpp		patch \| blob \| history
test/Transforms/Inline/nested-inline.ll	[new file with mode: 0644]	patch \| blob