Teach InlineCost to account for a null check which can be folded away

[oota-llvm.git] / lib / Analysis / IPA / InlineCost.cpp
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp

index 683637f5b7f737eb24c0539544b8b0474e9593df..5ae7d44e06d36d6dfc0321000f624b1807dfa8f3 100644 (file)
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -11,13 +11,14 @@
  //
  //===----------------------------------------------------------------------===//
  
-#define DEBUG_TYPE "inline-cost"
  #include "llvm/Analysis/InlineCost.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/Analysis/InstructionSimplify.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
@@ -34,6 +35,8 @@
  
  using namespace llvm;
  
+#define DEBUG_TYPE "inline-cost"
+
  STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
  
  namespace {
@@ -42,15 +45,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    typedef InstVisitor<CallAnalyzer, bool> Base;
    friend class InstVisitor<CallAnalyzer, bool>;
  
-  // DataLayout if available, or null.
-  const DataLayout *const DL;
-
    /// The TargetTransformInfo available for this compilation.
    const TargetTransformInfo &TTI;
  
+  /// The cache of @llvm.assume intrinsics.
+  AssumptionCacheTracker *ACT;
+
    // The called function.
    Function &F;
  
+  // The candidate callsite being analyzed. Please do not use this to do
+  // analysis in the caller function; we want the inline cost query to be
+  // easily cacheable. Instead, use the cover function paramHasAttr.
+  CallSite CandidateCS;
+
    int Threshold;
    int Cost;
  
@@ -61,6 +69,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool ContainsNoDuplicateCall;
    bool HasReturn;
    bool HasIndirectBr;
+  bool HasFrameEscape;
  
    /// Number of bytes allocated statically by the callee.
    uint64_t AllocatedSize;
@@ -97,16 +106,24 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    void disableSROA(Value *V);
    void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                            int InstructionCost);
-  bool handleSROACandidate(bool IsSROAValid,
-                           DenseMap<Value *, int>::iterator CostIt,
-                           int InstructionCost);
    bool isGEPOffsetConstant(GetElementPtrInst &GEP);
    bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
    bool simplifyCallSite(Function *F, CallSite CS);
    ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  
+  /// Return true if the given argument to the function being considered for
+  /// inlining has the given attribute set either at the call site or the
+  /// function declaration.  Primarily used to inspect call site specific
+  /// attributes since these can be more precise than the ones on the callee
+  /// itself. 
+  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
+  
+  /// Return true if the given value is known non-null within the callee if
+  /// inlined through this particular callsite. 
+  bool isKnownNonNullInCallee(Value *V);
+
    // Custom analysis routines.
-  bool analyzeBlock(BasicBlock *BB);
+  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
  
    // Disable several entry points to the visitor so we don't accidentally use
    // them by declaring but not defining them here.
@@ -142,18 +159,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool visitUnreachableInst(UnreachableInst &I);
  
  public:
-  CallAnalyzer(const DataLayout *DL, const TargetTransformInfo &TTI,
-               Function &Callee, int Threshold)
-      : DL(DL), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
-        IsCallerRecursive(false), IsRecursiveCall(false),
+  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
+               Function &Callee, int Threshold, CallSite CSArg)
+    : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
+        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
          ExposesReturnsTwice(false), HasDynamicAlloca(false),
          ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
-        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
-        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
+        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
  
    bool analyzeCall(CallSite CS);
  
@@ -225,21 +242,6 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
    SROACostSavings += InstructionCost;
  }
  
-/// \brief Helper for the common pattern of handling a SROA candidate.
-/// Either accumulates the cost savings if the SROA remains valid, or disables
-/// SROA for the candidate.
-bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
-                                       DenseMap<Value *, int>::iterator CostIt,
-                                       int InstructionCost) {
-  if (IsSROAValid) {
-    accumulateSROACost(CostIt, InstructionCost);
-    return true;
-  }
-
-  disableSROA(CostIt);
-  return false;
-}
-
  /// \brief Check whether a GEP's indices are all constant.
  ///
  /// Respects any simplified values known during the analysis of this callsite.
@@ -256,10 +258,8 @@ bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  /// Returns false if unable to compute the offset for any reason. Respects any
  /// simplified values known during the analysis of this callsite.
  bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
-  if (!DL)
-    return false;
-
-  unsigned IntPtrWidth = DL->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    assert(IntPtrWidth == Offset.getBitWidth());
  
    for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
@@ -275,12 +275,12 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
      // Handle a struct index, which adds its field offset to the pointer.
      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
        unsigned ElementIdx = OpC->getZExtValue();
-      const StructLayout *SL = DL->getStructLayout(STy);
+      const StructLayout *SL = DL.getStructLayout(STy);
        Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
        continue;
      }
  
-    APInt TypeSize(IntPtrWidth, DL->getTypeAllocSize(GTI.getIndexedType()));
+    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
      Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
    }
    return true;
@@ -301,9 +301,9 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  
    // Accumulate the allocated size.
    if (I.isStaticAlloca()) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
      Type *Ty = I.getAllocatedType();
-    AllocatedSize += (DL ? DL->getTypeAllocSize(Ty) :
-                      Ty->getPrimitiveSizeInBits());
+    AllocatedSize += DL.getTypeAllocSize(Ty);
    }
  
    // We will happily inline static alloca instructions.
@@ -339,7 +339,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  
    // Try to fold GEPs of constant-offset call site argument pointers. This
    // requires target data and inbounds GEPs.
-  if (DL && I.isInBounds()) {
+  if (I.isInBounds()) {
      // Check if we have a base + offset for the pointer.
      Value *Ptr = I.getPointerOperand();
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
@@ -408,7 +408,6 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  }
  
  bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
-  const DataLayout *DL = I.getDataLayout();
    // Propagate constants through ptrtoint.
    Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    if (!COp)
@@ -422,7 +421,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    // Track base/offset pairs when converted to a plain integer provided the
    // integer is large enough to represent the pointer.
    unsigned IntegerSize = I.getType()->getScalarSizeInBits();
-  if (DL && IntegerSize >= DL->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize >= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset
        = ConstantOffsetPtrs.lookup(I.getOperand(0));
      if (BaseAndOffset.first)
@@ -445,7 +445,6 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  }
  
  bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
-  const DataLayout *DL = I.getDataLayout();
    // Propagate constants through ptrtoint.
    Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    if (!COp)
@@ -460,7 +459,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    // modifications provided the integer is not too large.
    Value *Op = I.getOperand(0);
    unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
-  if (DL && IntegerSize <= DL->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize <= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
      if (BaseAndOffset.first)
        ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -497,12 +497,14 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    Constant *COp = dyn_cast<Constant>(Operand);
    if (!COp)
      COp = SimplifiedValues.lookup(Operand);
-  if (COp)
+  if (COp) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
      if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
                                                 COp, DL)) {
        SimplifiedValues[&I] = C;
        return true;
      }
+  }
  
    // Disable any SROA on the argument to arbitrary unary operators.
    disableSROA(Operand);
@@ -510,6 +512,33 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    return false;
  }
  
+bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
+  unsigned ArgNo = A->getArgNo();
+  return CandidateCS.paramHasAttr(ArgNo+1, Attr);
+}
+
+bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
+  // Does the *call site* have the NonNull attribute set on an argument?  We
+  // use the attribute on the call site to memoize any analysis done in the
+  // caller. This will also trip if the callee function has a non-null
+  // parameter attribute, but that's a less interesting case because hopefully
+  // the callee would already have been simplified based on that.
+  if (Argument *A = dyn_cast<Argument>(V))
+    if (paramHasAttr(A, Attribute::NonNull))
+      return true;
+  
+  // Is this an alloca in the caller?  This is distinct from the attribute case
+  // above because attributes aren't updated within the inliner itself and we
+  // always want to catch the alloca derived case.
+  if (isAllocaDerivedArg(V))
+    // We can actually predict the result of comparisons between an
+    // alloca-derived value and null. Note that this fires regardless of
+    // SROA firing.
+    return true;
+  
+  return false;
+}
+
  bool CallAnalyzer::visitCmpInst(CmpInst &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    // First try to handle simplified comparisons.
@@ -551,18 +580,14 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
    }
  
    // If the comparison is an equality comparison with null, we can simplify it
-  // for any alloca-derived argument.
-  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
-    if (isAllocaDerivedArg(I.getOperand(0))) {
-      // We can actually predict the result of comparisons between an
-      // alloca-derived value and null. Note that this fires regardless of
-      // SROA firing.
-      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
-      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
-                                        : ConstantInt::getFalse(I.getType());
-      return true;
-    }
-
+  // if we know the value (argument) can't be null.
+  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
+      isKnownNonNullInCallee(I.getOperand(0))) {
+    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+                                      : ConstantInt::getFalse(I.getType());
+    return true;
+  }
    // Finally check for SROA candidates in comparisons.
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
@@ -607,13 +632,20 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
  
  bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  const DataLayout &DL = F.getParent()->getDataLayout();
    if (!isa<Constant>(LHS))
      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
        LHS = SimpleLHS;
    if (!isa<Constant>(RHS))
      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
        RHS = SimpleRHS;
-  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+  Value *SimpleV = nullptr;
+  if (auto FI = dyn_cast<FPMathOperator>(&I))
+    SimpleV =
+        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+  else
+    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
      SimplifiedValues[&I] = C;
      return true;
@@ -629,7 +661,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  bool CallAnalyzer::visitLoad(LoadInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -644,7 +676,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
  bool CallAnalyzer::visitStore(StoreInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -725,8 +757,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  
  bool CallAnalyzer::visitCallSite(CallSite CS) {
    if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
-      !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::ReturnsTwice)) {
+      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
      // This aborts the entire analysis.
      ExposesReturnsTwice = true;
      return false;
@@ -752,6 +783,9 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
        case Intrinsic::memmove:
          // SROA can usually chew through these intrinsics, but they aren't free.
          return false;
+      case Intrinsic::frameescape:
+        HasFrameEscape = true;
+        return false;
        }
      }
  
@@ -795,7 +829,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan
    // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(DL, TTI, *F, InlineConstants::IndirectCallThreshold);
+  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
    if (CA.analyzeCall(CS)) {
      // We were able to inline the indirect call! Subtract the cost from the
      // bonus we want to apply, but don't go below zero.
@@ -825,9 +859,29 @@ bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
    // We model unconditional switches as free, see the comments on handling
    // branches.
-  return isa<ConstantInt>(SI.getCondition()) ||
-         dyn_cast_or_null<ConstantInt>(
-             SimplifiedValues.lookup(SI.getCondition()));
+  if (isa<ConstantInt>(SI.getCondition()))
+    return true;
+  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
+    if (isa<ConstantInt>(V))
+      return true;
+
+  // Otherwise, we need to accumulate a cost proportional to the number of
+  // distinct successor blocks. This fan-out in the CFG cannot be represented
+  // for free even if we can represent the core switch as a jumptable that
+  // takes a single instruction.
+  //
+  // NB: We convert large switches which are just used to initialize large phi
+  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
+  // inlining those. It will prevent inlining in cases where the optimization
+  // does not (yet) fire.
+  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
+  SuccessorBlocks.insert(SI.getDefaultDest());
+  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
+    SuccessorBlocks.insert(I.getCaseSuccessor());
+  // Add cost corresponding to the number of distinct destinations. The first
+  // we model as free because of fallthrough.
+  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+  return false;
  }
  
  bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
@@ -838,10 +892,7 @@ bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
    // original function which is extremely undefined behavior.
    // FIXME: This logic isn't really right; we can safely inline functions with
    // indirectbr's as long as no other function or global references the
-  // blockaddress of a block within the current function.  And as a QOI issue,
-  // if someone is using a blockaddress without an indirectbr, and that
-  // reference somehow ends up in another function or global, we probably don't
-  // want to inline this function.
+  // blockaddress of a block within the current function.
    HasIndirectBr = true;
    return false;
  }
@@ -881,7 +932,8 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
  /// aborts early if the threshold has been exceeded or an impossible to inline
  /// construct has been detected. It returns false if inlining is no longer
  /// viable, and true if inlining remains viable.
-bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
+bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
+                                SmallPtrSetImpl<const Value *> &EphValues) {
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
      // FIXME: Currently, the number of instructions in a function regardless of
      // our ability to simplify them during inline to constants or dead code,
@@ -893,10 +945,33 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
      if (isa<DbgInfoIntrinsic>(I))
        continue;
  
+    // Skip ephemeral values.
+    if (EphValues.count(I))
+      continue;
+
      ++NumInstructions;
      if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
        ++NumVectorInstructions;
  
+    // If the instruction is floating point, and the target says this operation is
+    // expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call.  Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
      // If the instruction simplified to a constant, there is no cost to this
      // instruction. Visit the instructions using our InstVisitor to account for
      // all of the per-instruction logic. The visit tree returns true if we
@@ -909,7 +984,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  
      // If the visit this instruction detected an uninlinable pattern, abort.
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr)
+        HasIndirectBr || HasFrameEscape)
        return false;
  
      // If the caller is a recursive function then we don't want to inline
@@ -919,16 +994,9 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;
  
-    if (NumVectorInstructions > NumInstructions/2)
-      VectorBonus = FiftyPercentVectorBonus;
-    else if (NumVectorInstructions > NumInstructions/10)
-      VectorBonus = TenPercentVectorBonus;
-    else
-      VectorBonus = 0;
-
-    // Check if we've past the threshold so we don't spin in huge basic
-    // blocks that will never inline.
-    if (Cost > (Threshold + VectorBonus))
+    // Check if we've passed the maximum possible threshold so we don't spin in
+    // huge basic blocks that will never inline.
+    if (Cost > Threshold)
        return false;
    }
  
@@ -942,10 +1010,11 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  /// returns 0 if V is not a pointer, and returns the constant '0' if there are
  /// no constant offsets applied.
  ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
-  if (!DL || !V->getType()->isPointerTy())
-    return 0;
+  if (!V->getType()->isPointerTy())
+    return nullptr;
  
-  unsigned IntPtrWidth = DL->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    APInt Offset = APInt::getNullValue(IntPtrWidth);
  
    // Even though we don't look through PHI nodes, we could be called on an
@@ -955,7 +1024,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
    do {
      if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
        if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
-        return 0;
+        return nullptr;
        V = GEP->getPointerOperand();
      } else if (Operator::getOpcode(V) == Instruction::BitCast) {
        V = cast<Operator>(V)->getOperand(0);
@@ -967,9 +1036,9 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
        break;
      }
      assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
  
-  Type *IntPtrTy = DL->getIntPtrType(V->getContext());
+  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
    return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
  }
  
@@ -983,33 +1052,42 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  bool CallAnalyzer::analyzeCall(CallSite CS) {
    ++NumCallsAnalyzed;
  
-  // Track whether the post-inlining function would have more than one basic
-  // block. A single basic block is often intended for inlining. Balloon the
-  // threshold by 50% until we pass the single-BB phase.
-  bool SingleBB = true;
-  int SingleBBBonus = Threshold / 2;
-  Threshold += SingleBBBonus;
-
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.
  
    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
-  // low.
+  // low. Note that these bonuses are somewhat arbitrary and evolved over time
+  // by accident as much as because they are principled bonuses.
+  //
+  // FIXME: It would be nice to remove all such bonuses. At least it would be
+  // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);
-  FiftyPercentVectorBonus = Threshold;
-  TenPercentVectorBonus = Threshold / 2;
+  FiftyPercentVectorBonus = 3 * Threshold / 2;
+  TenPercentVectorBonus = 3 * Threshold / 4;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Track whether the post-inlining function would have more than one basic
+  // block. A single basic block is often intended for inlining. Balloon the
+  // threshold by 50% until we pass the single-BB phase.
+  bool SingleBB = true;
+  int SingleBBBonus = Threshold / 2;
+
+  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+  // this Threshold any time, and cost cannot decrease, we can stop processing
+  // the rest of the function body.
+  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
  
    // Give out bonuses per argument, as the instructions setting them up will
    // be gone after inlining.
    for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-    if (DL && CS.isByValArgument(I)) {
+    if (CS.isByValArgument(I)) {
        // We approximate the number of loads and stores needed by dividing the
        // size of the byval type by the target's pointer size.
        PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-      unsigned TypeSize = DL->getTypeSizeInBits(PTy->getElementType());
-      unsigned PointerSize = DL->getPointerSizeInBits();
+      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = DL.getPointerSizeInBits();
        // Ceiling division.
        unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
  
@@ -1043,9 +1121,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    Instruction *Instr = CS.getInstruction();
    if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
-      Threshold = 1;
+      Threshold = 0;
    } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
-    Threshold = 1;
+    Threshold = 0;
  
    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
@@ -1096,6 +1174,12 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
    NumAllocaArgs = SROAArgValues.size();
  
+  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
+  // the ephemeral values multiple times (and they're completely determined by
+  // the callee, so this is purely duplicate work).
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);
+
    // The worklist of live basic blocks in the callee *after* inlining. We avoid
    // adding basic blocks of the callee which can be proven to be dead for this
    // particular call site in order to get more accurate cost estimates. This
@@ -1111,18 +1195,27 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      // Bail out the moment we cross the threshold. This means we'll under-count
      // the cost, but only when undercounting doesn't matter.
-    if (Cost > (Threshold + VectorBonus))
+    if (Cost > Threshold)
        break;
  
      BasicBlock *BB = BBWorklist[Idx];
      if (BB->empty())
        continue;
  
+    // Disallow inlining a blockaddress. A blockaddress only has defined
+    // behavior for an indirect branch in the same function, and we do not
+    // currently support inlining indirect branches. But, the inliner may not
+    // see an indirect branch that ends up being dead code at a particular call
+    // site. If the blockaddress escapes the function, e.g., via a global
+    // variable, inlining may lead to an invalid cross-function reference.
+    if (BB->hasAddressTaken())
+      return false;
+
      // Analyze the cost of this block. If we blow through the threshold, this
      // returns false, and we can bail on out.
-    if (!analyzeBlock(BB)) {
+    if (!analyzeBlock(BB, EphValues)) {
        if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-          HasIndirectBr)
+          HasIndirectBr || HasFrameEscape)
          return false;
  
        // If the caller is a recursive function then we don't want to inline
@@ -1180,7 +1273,13 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
      return false;
  
-  Threshold += VectorBonus;
+  // We applied the maximum possible vector bonus at the beginning. Now,
+  // subtract the excess bonus, if any, from the Threshold before
+  // comparing against Cost.
+  if (NumVectorInstructions <= NumInstructions / 10)
+    Threshold -= FiftyPercentVectorBonus;
+  else if (NumVectorInstructions <= NumInstructions / 2)
+    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
  
    return Cost < Threshold;
  }
@@ -1195,19 +1294,20 @@ void CallAnalyzer::dump() {
    DEBUG_PRINT_STAT(NumConstantPtrCmps);
    DEBUG_PRINT_STAT(NumConstantPtrDiffs);
    DEBUG_PRINT_STAT(NumInstructionsSimplified);
+  DEBUG_PRINT_STAT(NumInstructions);
    DEBUG_PRINT_STAT(SROACostSavings);
    DEBUG_PRINT_STAT(SROACostSavingsLost);
    DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
    DEBUG_PRINT_STAT(Cost);
    DEBUG_PRINT_STAT(Threshold);
-  DEBUG_PRINT_STAT(VectorBonus);
  #undef DEBUG_PRINT_STAT
  }
  #endif
  
  INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                        true, true)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
  
@@ -1219,12 +1319,14 @@ InlineCostAnalysis::~InlineCostAnalysis() {}
  
  void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.setPreservesAll();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }
  
  bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
-  TTI = &getAnalysis<TargetTransformInfo>();
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  ACT = &getAnalysis<AssumptionCacheTracker>();
    return false;
  }
  
@@ -1234,16 +1336,18 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
  
  /// \brief Test that two functions either have or have not the given attribute
  ///        at the same time.
-static bool attributeMatches(Function *F1, Function *F2,
-                             Attribute::AttrKind Attr) {
-  return F1->hasFnAttribute(Attr) == F2->hasFnAttribute(Attr);
+template<typename AttrKind>
+static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
+  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
  }
  
  /// \brief Test that there are no attribute conflicts between Caller and Callee
  ///        that prevent inlining.
  static bool functionsHaveCompatibleAttributes(Function *Caller,
                                                Function *Callee) {
-  return attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+  return attributeMatches(Caller, Callee, "target-cpu") &&
+         attributeMatches(Caller, Callee, "target-features") &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
           attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
           attributeMatches(Caller, Callee, Attribute::SanitizeThread);
  }
@@ -1256,7 +1360,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
  
    // Calls to functions with always-inline attributes should be inlined
    // whenever possible.
-  if (Callee->hasFnAttribute(Attribute::AlwaysInline)) {
+  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
      if (isInlineViable(*Callee))
        return llvm::InlineCost::getAlways();
      return llvm::InlineCost::getNever();
@@ -1281,7 +1385,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
    DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
          << "...\n");
  
-  CallAnalyzer CA(Callee->getDataLayout(), *TTI, *Callee, Threshold);
+  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
    bool ShouldInline = CA.analyzeCall(CS);
  
    DEBUG(CA.dump());
@@ -1296,18 +1400,11 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
  }
  
  bool InlineCostAnalysis::isInlineViable(Function &F) {
-  bool ReturnsTwice =
-    F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::ReturnsTwice);
+  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
    for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
-    // Disallow inlining of functions which contain an indirect branch,
-    // unless the always_inline attribute is set.
-    // The attribute serves as a assertion that no local address
-    // like a block label can escpape the function.
-    // Revisit enabling inlining for functions with indirect branches
-    // when a more sophisticated espape/points-to analysis becomes available.
-    if (isa<IndirectBrInst>(BI->getTerminator()) &&
-        !F.hasFnAttribute(Attribute::AlwaysInline))
+    // Disallow inlining of functions which contain indirect branches or
+    // blockaddresses.
+    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
        return false;
  
      for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
@@ -1325,6 +1422,13 @@ bool InlineCostAnalysis::isInlineViable(Function &F) {
        if (!ReturnsTwice && CS.isCall() &&
            cast<CallInst>(CS.getInstruction())->canReturnTwice())
          return false;
+
+      // Disallow inlining functions that call @llvm.frameescape. Doing this
+      // correctly would require major changes to the inliner.
+      if (CS.getCalledFunction() &&
+          CS.getCalledFunction()->getIntrinsicID() ==
+              llvm::Intrinsic::frameescape)
+        return false;
      }
    }