Remove the Function::getFnAttributes method in favor of using the AttributeSet

[oota-llvm.git] / lib / Analysis / InlineCost.cpp
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp

index bc1ecd2ea4308e27a98af43d94f083f4576d88a7..ba8a5dd8e73cc3ce47d36732edc9d3843104e291 100644 (file)
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -13,23 +13,23 @@
  
  #define DEBUG_TYPE "inline-cost"
  #include "llvm/Analysis/InlineCost.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DataLayout.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Operator.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/InstVisitor.h"
  #include "llvm/Support/GetElementPtrTypeIterator.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/CallingConv.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Operator.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
  
  using namespace llvm;
  
@@ -41,19 +41,23 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    typedef InstVisitor<CallAnalyzer, bool> Base;
    friend class InstVisitor<CallAnalyzer, bool>;
  
-  // TargetData if available, or null.
-  const TargetData *const TD;
+  // DataLayout if available, or null.
+  const DataLayout *const TD;
  
    // The called function.
    Function &F;
  
    int Threshold;
    int Cost;
-  const bool AlwaysInline;
  
-  bool IsRecursive;
+  bool IsCallerRecursive;
+  bool IsRecursiveCall;
    bool ExposesReturnsTwice;
    bool HasDynamicAlloca;
+  bool ContainsNoDuplicateCall;
+
+  /// Number of bytes allocated statically by the callee.
+  uint64_t AllocatedSize;
    unsigned NumInstructions, NumVectorInstructions;
    int FiftyPercentVectorBonus, TenPercentVectorBonus;
    int VectorBonus;
@@ -92,6 +96,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
                             int InstructionCost);
    bool isGEPOffsetConstant(GetElementPtrInst &GEP);
    bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
+  bool simplifyCallSite(Function *F, CallSite CS);
    ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  
    // Custom analysis routines.
@@ -120,14 +125,16 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool visitBinaryOperator(BinaryOperator &I);
    bool visitLoad(LoadInst &I);
    bool visitStore(StoreInst &I);
+  bool visitExtractValue(ExtractValueInst &I);
+  bool visitInsertValue(InsertValueInst &I);
    bool visitCallSite(CallSite CS);
  
  public:
-  CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
+  CallAnalyzer(const DataLayout *TD, Function &Callee, int Threshold)
      : TD(TD), F(Callee), Threshold(Threshold), Cost(0),
-      AlwaysInline(F.hasFnAttr(Attribute::AlwaysInline)),
-      IsRecursive(false), ExposesReturnsTwice(false), HasDynamicAlloca(false),
-      NumInstructions(0), NumVectorInstructions(0),
+      IsCallerRecursive(false), IsRecursiveCall(false),
+      ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
+      AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
@@ -269,9 +276,15 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
    // FIXME: Check whether inlining will turn a dynamic alloca into a static
    // alloca, and handle that case.
  
-  // We will happily inline static alloca instructions or dynamic alloca
-  // instructions in always-inline situations.
-  if (AlwaysInline || I.isStaticAlloca())
+  // Accumulate the allocated size.
+  if (I.isStaticAlloca()) {
+    Type *Ty = I.getAllocatedType();
+    AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
+                      Ty->getPrimitiveSizeInBits());
+  }
+
+  // We will happily inline static alloca instructions.
+  if (I.isStaticAlloca())
      return Base::visitAlloca(I);
  
    // FIXME: This is overly conservative. Dynamic allocas are inefficient for
@@ -345,7 +358,10 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  
  bool CallAnalyzer::visitBitCast(BitCastInst &I) {
    // Propagate constants through bitcasts.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -370,7 +386,10 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  
  bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    // Propagate constants through ptrtoint.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -403,7 +422,10 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  
  bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    // Propagate constants through ptrtoint.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -430,7 +452,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  
  bool CallAnalyzer::visitCastInst(CastInst &I) {
    // Propagate constants through ptrtoint.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -600,32 +625,109 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
    return false;
  }
  
+bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
+  // Constant folding for extract value is trivial.
+  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
+  if (!C)
+    C = SimplifiedValues.lookup(I.getAggregateOperand());
+  if (C) {
+    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+    return true;
+  }
+
+  // SROA can look through these but give them a cost.
+  return false;
+}
+
+bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
+  // Constant folding for insert value is trivial.
+  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
+  if (!AggC)
+    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
+  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
+  if (!InsertedC)
+    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
+  if (AggC && InsertedC) {
+    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
+                                                        I.getIndices());
+    return true;
+  }
+
+  // SROA can look through these but give them a cost.
+  return false;
+}
+
+/// \brief Try to simplify a call site.
+///
+/// Takes a concrete function and callsite and tries to actually simplify it by
+/// analyzing the arguments and call itself with instsimplify. Returns true if
+/// it has simplified the callsite to some other entity (a constant), making it
+/// free.
+bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
+  // FIXME: Using the instsimplify logic directly for this is inefficient
+  // because we have to continually rebuild the argument list even when no
+  // simplifications can be performed. Until that is fixed with remapping
+  // inside of instsimplify, directly constant fold calls here.
+  if (!canConstantFoldCallTo(F))
+    return false;
+
+  // Try to re-map the arguments to constants.
+  SmallVector<Constant *, 4> ConstantArgs;
+  ConstantArgs.reserve(CS.arg_size());
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I) {
+    Constant *C = dyn_cast<Constant>(*I);
+    if (!C)
+      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+    if (!C)
+      return false; // This argument doesn't map to a constant.
+
+    ConstantArgs.push_back(C);
+  }
+  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+    SimplifiedValues[CS.getInstruction()] = C;
+    return true;
+  }
+
+  return false;
+}
+
  bool CallAnalyzer::visitCallSite(CallSite CS) {
    if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
-      !F.hasFnAttr(Attribute::ReturnsTwice)) {
+      !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::ReturnsTwice)) {
      // This aborts the entire analysis.
      ExposesReturnsTwice = true;
      return false;
    }
+  if (CS.isCall() &&
+      cast<CallInst>(CS.getInstruction())->hasFnAttr(Attribute::NoDuplicate))
+    ContainsNoDuplicateCall = true;
  
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
-    switch (II->getIntrinsicID()) {
-    default:
-      return Base::visitCallSite(CS);
+  if (Function *F = CS.getCalledFunction()) {
+    // When we have a concrete function, first try to simplify it directly.
+    if (simplifyCallSite(F, CS))
+      return true;
  
-    case Intrinsic::memset:
-    case Intrinsic::memcpy:
-    case Intrinsic::memmove:
-      // SROA can usually chew through these intrinsics, but they aren't free.
-      return false;
+    // Next check if it is an intrinsic we know about.
+    // FIXME: Lift this into part of the InstVisitor.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+      switch (II->getIntrinsicID()) {
+      default:
+        return Base::visitCallSite(CS);
+
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+      case Intrinsic::memmove:
+        // SROA can usually chew through these intrinsics, but they aren't free.
+        return false;
+      }
      }
-  }
  
-  if (Function *F = CS.getCalledFunction()) {
      if (F == CS.getInstruction()->getParent()->getParent()) {
        // This flag will fully abort the analysis, so don't bother with anything
        // else.
-      IsRecursive = true;
+      IsRecursiveCall = true;
        return false;
      }
  
@@ -712,7 +814,14 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
        Cost += InlineConstants::InstrCost;
  
      // If the visit this instruction detected an uninlinable pattern, abort.
-    if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+      return false;
+
+    // If the caller is a recursive function then we don't want to inline
+    // functions which allocate a lot of stack space because it would increase
+    // the caller stack usage dramatically.
+    if (IsCallerRecursive &&
+        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;
  
      if (NumVectorInstructions > NumInstructions/2)
@@ -724,7 +833,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  
      // Check if we've past the threshold so we don't spin in huge basic
      // blocks that will never inline.
-    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
+    if (Cost > (Threshold + VectorBonus))
        return false;
    }
  
@@ -775,7 +884,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  /// viable. It computes the cost and adjusts the threshold based on numerous
  /// factors and heuristics. If this method returns false but the computed cost
  /// is below the computed threshold, then inlining was forcibly disabled by
-/// some artifact of the rountine.
+/// some artifact of the routine.
  bool CallAnalyzer::analyzeCall(CallSite CS) {
    ++NumCallsAnalyzed;
  
@@ -786,72 +895,89 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    int SingleBBBonus = Threshold / 2;
    Threshold += SingleBBBonus;
  
-  // Unless we are always-inlining, perform some tweaks to the cost and
-  // threshold based on the direct callsite information.
-  if (!AlwaysInline) {
-    // We want to more aggressively inline vector-dense kernels, so up the
-    // threshold, and we'll lower it if the % of vector instructions gets too
-    // low.
-    assert(NumInstructions == 0);
-    assert(NumVectorInstructions == 0);
-    FiftyPercentVectorBonus = Threshold;
-    TenPercentVectorBonus = Threshold / 2;
-
-    // Give out bonuses per argument, as the instructions setting them up will
-    // be gone after inlining.
-    for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-      if (TD && CS.isByValArgument(I)) {
-        // We approximate the number of loads and stores needed by dividing the
-        // size of the byval type by the target's pointer size.
-        PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-        unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
-        unsigned PointerSize = TD->getPointerSizeInBits();
-        // Ceiling division.
-        unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
-
-        // If it generates more than 8 stores it is likely to be expanded as an
-        // inline memcpy so we take that as an upper bound. Otherwise we assume
-        // one load and one store per word copied.
-        // FIXME: The maxStoresPerMemcpy setting from the target should be used
-        // here instead of a magic number of 8, but it's not available via
-        // TargetData.
-        NumStores = std::min(NumStores, 8U);
-
-        Cost -= 2 * NumStores * InlineConstants::InstrCost;
-      } else {
-        // For non-byval arguments subtract off one instruction per call
-        // argument.
-        Cost -= InlineConstants::InstrCost;
-      }
+  // Perform some tweaks to the cost and threshold based on the direct
+  // callsite information.
+
+  // We want to more aggressively inline vector-dense kernels, so up the
+  // threshold, and we'll lower it if the % of vector instructions gets too
+  // low.
+  assert(NumInstructions == 0);
+  assert(NumVectorInstructions == 0);
+  FiftyPercentVectorBonus = Threshold;
+  TenPercentVectorBonus = Threshold / 2;
+
+  // Give out bonuses per argument, as the instructions setting them up will
+  // be gone after inlining.
+  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+    if (TD && CS.isByValArgument(I)) {
+      // We approximate the number of loads and stores needed by dividing the
+      // size of the byval type by the target's pointer size.
+      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = TD->getPointerSizeInBits();
+      // Ceiling division.
+      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+      // If it generates more than 8 stores it is likely to be expanded as an
+      // inline memcpy so we take that as an upper bound. Otherwise we assume
+      // one load and one store per word copied.
+      // FIXME: The maxStoresPerMemcpy setting from the target should be used
+      // here instead of a magic number of 8, but it's not available via
+      // DataLayout.
+      NumStores = std::min(NumStores, 8U);
+
+      Cost -= 2 * NumStores * InlineConstants::InstrCost;
+    } else {
+      // For non-byval arguments subtract off one instruction per call
+      // argument.
+      Cost -= InlineConstants::InstrCost;
      }
+  }
  
-    // If there is only one call of the function, and it has internal linkage,
-    // the cost of inlining it drops dramatically.
-    if (F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction())
-      Cost += InlineConstants::LastCallToStaticBonus;
-
-    // If the instruction after the call, or if the normal destination of the
-    // invoke is an unreachable instruction, the function is noreturn.  As such,
-    // there is little point in inlining this unless there is literally zero cost.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
-      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
-        Threshold = 1;
-    } else if (isa<UnreachableInst>(++BasicBlock::iterator(CS.getInstruction())))
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically.
+  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
+    &F == CS.getCalledFunction();
+  if (OnlyOneCallAndLocalLinkage)
+    Cost += InlineConstants::LastCallToStaticBonus;
+
+  // If the instruction after the call, or if the normal destination of the
+  // invoke is an unreachable instruction, the function is noreturn. As such,
+  // there is little point in inlining this unless there is literally zero
+  // cost.
+  Instruction *Instr = CS.getInstruction();
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
+    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
        Threshold = 1;
+  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
+    Threshold = 1;
  
-    // If this function uses the coldcc calling convention, prefer not to inline
-    // it.
-    if (F.getCallingConv() == CallingConv::Cold)
-      Cost += InlineConstants::ColdccPenalty;
+  // If this function uses the coldcc calling convention, prefer not to inline
+  // it.
+  if (F.getCallingConv() == CallingConv::Cold)
+    Cost += InlineConstants::ColdccPenalty;
  
-    // Check if we're done. This can happen due to bonuses and penalties.
-    if (Cost > Threshold)
-      return false;
-  }
+  // Check if we're done. This can happen due to bonuses and penalties.
+  if (Cost > Threshold)
+    return false;
  
    if (F.empty())
      return true;
  
+  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  // Check if the caller function is recursive itself.
+  for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
+       U != E; ++U) {
+    CallSite Site(cast<Value>(*U));
+    if (!Site)
+      continue;
+    Instruction *I = Site.getInstruction();
+    if (I->getParent()->getParent() == Caller) {
+      IsCallerRecursive = true;
+      break;
+    }
+  }
+
    // Track whether we've seen a return instruction. The first return
    // instruction is free, as at least one will usually disappear in inlining.
    bool HasReturn = false;
@@ -895,7 +1021,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      // Bail out the moment we cross the threshold. This means we'll under-count
      // the cost, but only when undercounting doesn't matter.
-    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
+    if (Cost > (Threshold + VectorBonus))
        break;
  
      BasicBlock *BB = BBWorklist[Idx];
@@ -908,9 +1034,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
  
      // We never want to inline functions that contain an indirectbr.  This is
      // incorrect because all the blockaddress's (in static global initializers
-    // for example) would be referring to the original function, and this indirect
-    // jump would jump from the inlined copy of the function into the original
-    // function which is extremely undefined behavior.
+    // for example) would be referring to the original function, and this
+    // indirect jump would jump from the inlined copy of the function into the 
+    // original function which is extremely undefined behavior.
      // FIXME: This logic isn't really right; we can safely inline functions
      // with indirectbr's as long as no other function or global references the
      // blockaddress of a block within the current function.  And as a QOI issue,
@@ -928,8 +1054,16 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
      // Analyze the cost of this block. If we blow through the threshold, this
      // returns false, and we can bail on out.
      if (!analyzeBlock(BB)) {
-      if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+        return false;
+
+      // If the caller is a recursive function then we don't want to inline
+      // functions which allocate a lot of stack space because it would increase
+      // the caller stack usage dramatically.
+      if (IsCallerRecursive &&
+          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
          return false;
+
        break;
      }
  
@@ -955,7 +1089,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
  
      // If we're unable to select a particular successor, just count all of
      // them.
-    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; ++TIdx)
+    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
+         ++TIdx)
        BBWorklist.insert(TI->getSuccessor(TIdx));
  
      // If we had any successors at this point, than post-inlining is likely to
@@ -969,11 +1104,18 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
      }
    }
  
+  // If this is a noduplicate call, we can still inline as long as 
+  // inlining this would cause the removal of the caller (so the instruction
+  // is not actually duplicated, just moved).
+  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
+    return false;
+
    Threshold += VectorBonus;
  
-  return AlwaysInline || Cost < Threshold;
+  return Cost < Threshold;
  }
  
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// \brief Dump stats about this call's analysis.
  void CallAnalyzer::dump() {
  #define DEBUG_PRINT_STAT(x) llvm::dbgs() << "      " #x ": " << x << "\n"
@@ -985,8 +1127,10 @@ void CallAnalyzer::dump() {
    DEBUG_PRINT_STAT(NumInstructionsSimplified);
    DEBUG_PRINT_STAT(SROACostSavings);
    DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  #undef DEBUG_PRINT_STAT
  }
+#endif
  
  InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
    return getInlineCost(CS, CS.getCalledFunction(), Threshold);
@@ -994,14 +1138,30 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
  
  InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
                                               int Threshold) {
+  // Cannot inline indirect calls.
+  if (!Callee)
+    return llvm::InlineCost::getNever();
+
+  // Calls to functions with always-inline attributes should be inlined
+  // whenever possible.
+  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::AlwaysInline)) {
+    if (isInlineViable(*Callee))
+      return llvm::InlineCost::getAlways();
+    return llvm::InlineCost::getNever();
+  }
+
    // Don't inline functions which can be redefined at link-time to mean
    // something else.  Don't inline functions marked noinline or call sites
    // marked noinline.
-  if (!Callee || Callee->mayBeOverridden() ||
-      Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline())
+  if (Callee->mayBeOverridden() ||
+      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::NoInline) ||
+      CS.isNoInline())
      return llvm::InlineCost::getNever();
  
-  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName() << "...\n");
+  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
+        << "...\n");
  
    CallAnalyzer CA(TD, *Callee, Threshold);
    bool ShouldInline = CA.analyzeCall(CS);
@@ -1016,3 +1176,32 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
  
    return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
  }
+
+bool InlineCostAnalyzer::isInlineViable(Function &F) {
+  bool ReturnsTwice =F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                    Attribute::ReturnsTwice);
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Disallow inlining of functions which contain an indirect branch.
+    if (isa<IndirectBrInst>(BI->getTerminator()))
+      return false;
+
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      CallSite CS(II);
+      if (!CS)
+        continue;
+
+      // Disallow recursive calls.
+      if (&F == CS.getCalledFunction())
+        return false;
+
+      // Disallow calls which expose returns-twice to a function not previously
+      // attributed as such.
+      if (!ReturnsTwice && CS.isCall() &&
+          cast<CallInst>(CS.getInstruction())->canReturnTwice())
+        return false;
+    }
+  }
+
+  return true;
+}