Inliner: Do zero-cost inlines even if above a negative threshold (PR24851)

[oota-llvm.git] / lib / Analysis / InlineCost.cpp
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp

index 5c4e7026d1ef6e21fd462659d0a2cbe1d058351b..26f2e7ff504a8a6ad8f0b013ea7daad7643e1a77 100644 (file)
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -11,29 +11,32 @@
  //
  //===----------------------------------------------------------------------===//
  
-#define DEBUG_TYPE "inline-cost"
  #include "llvm/Analysis/InlineCost.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/Analysis/InstructionSimplify.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
  #include "llvm/IR/CallingConv.h"
  #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
  #include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/InstVisitor.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/Operator.h"
-#include "llvm/InstVisitor.h"
-#include "llvm/Support/CallSite.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
  #include "llvm/Support/raw_ostream.h"
  
  using namespace llvm;
  
+#define DEBUG_TYPE "inline-cost"
+
  STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
  
  namespace {
@@ -42,15 +45,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    typedef InstVisitor<CallAnalyzer, bool> Base;
    friend class InstVisitor<CallAnalyzer, bool>;
  
-  // DataLayout if available, or null.
-  const DataLayout *const TD;
-
    /// The TargetTransformInfo available for this compilation.
    const TargetTransformInfo &TTI;
  
+  /// The cache of @llvm.assume intrinsics.
+  AssumptionCacheTracker *ACT;
+
    // The called function.
    Function &F;
  
+  // The candidate callsite being analyzed. Please do not use this to do
+  // analysis in the caller function; we want the inline cost query to be
+  // easily cacheable. Instead, use the cover function paramHasAttr.
+  CallSite CandidateCS;
+
    int Threshold;
    int Cost;
  
@@ -59,6 +67,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool ExposesReturnsTwice;
    bool HasDynamicAlloca;
    bool ContainsNoDuplicateCall;
+  bool HasReturn;
+  bool HasIndirectBr;
+  bool HasFrameEscape;
  
    /// Number of bytes allocated statically by the callee.
    uint64_t AllocatedSize;
@@ -95,16 +106,24 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    void disableSROA(Value *V);
    void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                            int InstructionCost);
-  bool handleSROACandidate(bool IsSROAValid,
-                           DenseMap<Value *, int>::iterator CostIt,
-                           int InstructionCost);
    bool isGEPOffsetConstant(GetElementPtrInst &GEP);
    bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
    bool simplifyCallSite(Function *F, CallSite CS);
    ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  
+  /// Return true if the given argument to the function being considered for
+  /// inlining has the given attribute set either at the call site or the
+  /// function declaration.  Primarily used to inspect call site specific
+  /// attributes since these can be more precise than the ones on the callee
+  /// itself. 
+  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
+  
+  /// Return true if the given value is known non null within the callee if
+  /// inlined through this particular callsite. 
+  bool isKnownNonNullInCallee(Value *V);
+
    // Custom analysis routines.
-  bool analyzeBlock(BasicBlock *BB);
+  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
  
    // Disable several entry points to the visitor so we don't accidentally use
    // them by declaring but not defining them here.
@@ -124,7 +143,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool visitIntToPtr(IntToPtrInst &I);
    bool visitCastInst(CastInst &I);
    bool visitUnaryInstruction(UnaryInstruction &I);
-  bool visitICmp(ICmpInst &I);
+  bool visitCmpInst(CmpInst &I);
    bool visitSub(BinaryOperator &I);
    bool visitBinaryOperator(BinaryOperator &I);
    bool visitLoad(LoadInst &I);
@@ -132,14 +151,23 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool visitExtractValue(ExtractValueInst &I);
    bool visitInsertValue(InsertValueInst &I);
    bool visitCallSite(CallSite CS);
+  bool visitReturnInst(ReturnInst &RI);
+  bool visitBranchInst(BranchInst &BI);
+  bool visitSwitchInst(SwitchInst &SI);
+  bool visitIndirectBrInst(IndirectBrInst &IBI);
+  bool visitResumeInst(ResumeInst &RI);
+  bool visitCleanupReturnInst(CleanupReturnInst &RI);
+  bool visitCatchReturnInst(CatchReturnInst &RI);
+  bool visitUnreachableInst(UnreachableInst &I);
  
  public:
-  CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI,
-               Function &Callee, int Threshold)
-      : TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
-        IsCallerRecursive(false), IsRecursiveCall(false),
+  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
+               Function &Callee, int Threshold, CallSite CSArg)
+    : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
+        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
          ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0),
+        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
+        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
          NumVectorInstructions(0), FiftyPercentVectorBonus(0),
          TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
          NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
@@ -216,21 +244,6 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
    SROACostSavings += InstructionCost;
  }
  
-/// \brief Helper for the common pattern of handling a SROA candidate.
-/// Either accumulates the cost savings if the SROA remains valid, or disables
-/// SROA for the candidate.
-bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
-                                       DenseMap<Value *, int>::iterator CostIt,
-                                       int InstructionCost) {
-  if (IsSROAValid) {
-    accumulateSROACost(CostIt, InstructionCost);
-    return true;
-  }
-
-  disableSROA(CostIt);
-  return false;
-}
-
  /// \brief Check whether a GEP's indices are all constant.
  ///
  /// Respects any simplified values known during the analysis of this callsite.
@@ -247,10 +260,8 @@ bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  /// Returns false if unable to compute the offset for any reason. Respects any
  /// simplified values known during the analysis of this callsite.
  bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
-  if (!TD)
-    return false;
-
-  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    assert(IntPtrWidth == Offset.getBitWidth());
  
    for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
@@ -266,26 +277,35 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
      // Handle a struct index, which adds its field offset to the pointer.
      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
        unsigned ElementIdx = OpC->getZExtValue();
-      const StructLayout *SL = TD->getStructLayout(STy);
+      const StructLayout *SL = DL.getStructLayout(STy);
        Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
        continue;
      }
  
-    APInt TypeSize(IntPtrWidth, TD->getTypeAllocSize(GTI.getIndexedType()));
+    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
      Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
    }
    return true;
  }
  
  bool CallAnalyzer::visitAlloca(AllocaInst &I) {
-  // FIXME: Check whether inlining will turn a dynamic alloca into a static
+  // Check whether inlining will turn a dynamic alloca into a static
    // alloca, and handle that case.
+  if (I.isArrayAllocation()) {
+    // Only account for the array alloca here when its size operand has been
+    // simplified to an integer constant for this call site. A missing entry,
+    // or a simplified value that is some other kind of constant, falls
+    // through to the handling below instead of being dereferenced as null.
+    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
+    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+      // AllocatedSize is tracked in bytes, so use the DataLayout alloc size
+      // of the element type, exactly as the static alloca path below does.
+      // getPrimitiveSizeInBits() would add a bit count here, and yields zero
+      // for non-primitive element types.
+      const DataLayout &DL = F.getParent()->getDataLayout();
+      Type *Ty = I.getAllocatedType();
+      AllocatedSize += DL.getTypeAllocSize(Ty) * AllocSize->getZExtValue();
+      return Base::visitAlloca(I);
+    }
+  }
  
    // Accumulate the allocated size.
    if (I.isStaticAlloca()) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
      Type *Ty = I.getAllocatedType();
-    AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
-                      Ty->getPrimitiveSizeInBits());
+    AllocatedSize += DL.getTypeAllocSize(Ty);
    }
  
    // We will happily inline static alloca instructions.
@@ -321,7 +341,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  
    // Try to fold GEPs of constant-offset call site argument pointers. This
    // requires target data and inbounds GEPs.
-  if (TD && I.isInBounds()) {
+  if (I.isInBounds()) {
      // Check if we have a base + offset for the pointer.
      Value *Ptr = I.getPointerOperand();
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
@@ -403,7 +423,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    // Track base/offset pairs when converted to a plain integer provided the
    // integer is large enough to represent the pointer.
    unsigned IntegerSize = I.getType()->getScalarSizeInBits();
-  if (TD && IntegerSize >= TD->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize >= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset
        = ConstantOffsetPtrs.lookup(I.getOperand(0));
      if (BaseAndOffset.first)
@@ -422,7 +443,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
@@ -440,7 +461,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    // modifications provided the integer is not too large.
    Value *Op = I.getOperand(0);
    unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
-  if (TD && IntegerSize <= TD->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize <= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
      if (BaseAndOffset.first)
        ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -452,7 +474,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitCastInst(CastInst &I) {
@@ -469,18 +491,22 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
    // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
    disableSROA(I.getOperand(0));
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    Value *Operand = I.getOperand(0);
-  Constant *Ops[1] = { dyn_cast<Constant>(Operand) };
-  if (Ops[0] || (Ops[0] = SimplifiedValues.lookup(Operand)))
+  Constant *COp = dyn_cast<Constant>(Operand);
+  if (!COp)
+    COp = SimplifiedValues.lookup(Operand);
+  if (COp) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
      if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
-                                               Ops, TD)) {
+                                               COp, DL)) {
        SimplifiedValues[&I] = C;
        return true;
      }
+  }
  
    // Disable any SROA on the argument to arbitrary unary operators.
    disableSROA(Operand);
@@ -488,7 +514,34 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    return false;
  }
  
-bool CallAnalyzer::visitICmp(ICmpInst &I) {
+bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
+  // Forward the query to the candidate call site. The call site is asked
+  // about the argument's number plus one.
+  return CandidateCS.paramHasAttr(A->getArgNo() + 1, Attr);
+}
+
+bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
+  // First case: V is a callee argument that carries the NonNull attribute.
+  // The query goes through paramHasAttr, so an attribute placed on the call
+  // site counts; that is how analysis done in the caller gets memoized. A
+  // NonNull attribute on the callee's own parameter also satisfies this, but
+  // that is the less interesting case since the callee has hopefully been
+  // simplified with that knowledge already.
+  Argument *Arg = dyn_cast<Argument>(V);
+  if (Arg && paramHasAttr(Arg, Attribute::NonNull))
+    return true;
+
+  // Second case: V is derived from an alloca in the caller. This is checked
+  // separately from the attribute because attributes are not updated within
+  // the inliner itself and the alloca-derived case should always be caught.
+  // Comparisons between an alloca-derived value and null can be predicted,
+  // and this holds regardless of SROA firing.
+  return isAllocaDerivedArg(V);
+}
+
+bool CallAnalyzer::visitCmpInst(CmpInst &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    // First try to handle simplified comparisons.
    if (!isa<Constant>(LHS))
@@ -497,20 +550,24 @@ bool CallAnalyzer::visitICmp(ICmpInst &I) {
    if (!isa<Constant>(RHS))
      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
        RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
      if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
          SimplifiedValues[&I] = C;
          return true;
        }
+  }
+
+  if (I.getOpcode() == Instruction::FCmp)
+    return false;
  
    // Otherwise look for a comparison between constant offset pointers with
    // a common base.
    Value *LHSBase, *RHSBase;
    APInt LHSOffset, RHSOffset;
-  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    if (LHSBase) {
-    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
      if (RHSBase && LHSBase == RHSBase) {
        // We have common bases, fold the icmp to a constant based on the
        // offsets.
@@ -525,18 +582,14 @@ bool CallAnalyzer::visitICmp(ICmpInst &I) {
    }
  
    // If the comparison is an equality comparison with null, we can simplify it
-  // for any alloca-derived argument.
-  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
-    if (isAllocaDerivedArg(I.getOperand(0))) {
-      // We can actually predict the result of comparisons between an
-      // alloca-derived value and null. Note that this fires regardless of
-      // SROA firing.
-      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
-      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
-                                        : ConstantInt::getFalse(I.getType());
-      return true;
-    }
-
+  // if we know the value (argument) can't be null
+  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
+      isKnownNonNullInCallee(I.getOperand(0))) {
+    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+                                      : ConstantInt::getFalse(I.getType());
+    return true;
+  }
    // Finally check for SROA candidates in comparisons.
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
@@ -558,9 +611,9 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    Value *LHSBase, *RHSBase;
    APInt LHSOffset, RHSOffset;
-  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    if (LHSBase) {
-    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
      if (RHSBase && LHSBase == RHSBase) {
        // We have common bases, fold the subtract to a constant based on the
        // offsets.
@@ -581,13 +634,20 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
  
  bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  const DataLayout &DL = F.getParent()->getDataLayout();
    if (!isa<Constant>(LHS))
      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
        LHS = SimpleLHS;
    if (!isa<Constant>(RHS))
      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
        RHS = SimpleRHS;
-  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, TD);
+  Value *SimpleV = nullptr;
+  if (auto FI = dyn_cast<FPMathOperator>(&I))
+    SimpleV =
+        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+  else
+    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
      SimplifiedValues[&I] = C;
      return true;
@@ -603,7 +663,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  bool CallAnalyzer::visitLoad(LoadInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -618,7 +678,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
  bool CallAnalyzer::visitStore(StoreInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -698,15 +758,14 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  }
  
  bool CallAnalyzer::visitCallSite(CallSite CS) {
-  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
-      !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::ReturnsTwice)) {
+  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
+      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
      // This aborts the entire analysis.
      ExposesReturnsTwice = true;
      return false;
    }
    if (CS.isCall() &&
-      cast<CallInst>(CS.getInstruction())->hasFnAttr(Attribute::NoDuplicate))
+      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
      ContainsNoDuplicateCall = true;
  
    if (Function *F = CS.getCalledFunction()) {
@@ -726,6 +785,9 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
        case Intrinsic::memmove:
          // SROA can usually chew through these intrinsics, but they aren't free.
          return false;
+      case Intrinsic::localescape:
+        HasFrameEscape = true;
+        return false;
        }
      }
  
@@ -736,7 +798,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
        return false;
      }
  
-    if (!callIsSmall(CS)) {
+    if (TTI.isLoweredToCall(F)) {
        // We account for the average 1 instruction per call argument setup
        // here.
        Cost += CS.arg_size() * InlineConstants::InstrCost;
@@ -769,7 +831,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan
    // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(TD, TTI, *F, InlineConstants::IndirectCallThreshold);
+  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
    if (CA.analyzeCall(CS)) {
      // We were able to inline the indirect call! Subtract the cost from the
      // bonus we want to apply, but don't go below zero.
@@ -779,10 +841,93 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
    return Base::visitCallSite(CS);
  }
  
+bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
+  // Only the first return visited is reported as free, since at least one
+  // return instruction will be free after inlining. Later ones are charged.
+  if (HasReturn)
+    return false;
+
+  HasReturn = true;
+  return true;
+}
+
+bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
+  // Unconditional branches are modeled as essentially free. They really
+  // shouldn't exist at all, but handling them keeps the inliner's behavior
+  // regular and predictable.
+  if (BI.isUnconditional())
+    return true;
+
+  // A conditional branch that will fold away is free as well: either its
+  // condition is a constant integer already, or it has been simplified to
+  // one during the analysis of this call site.
+  Value *Cond = BI.getCondition();
+  if (isa<ConstantInt>(Cond))
+    return true;
+  return dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond)) !=
+         nullptr;
+}
+
+bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
+  // We model unconditional switches as free, see the comments on handling
+  // branches.
+  if (isa<ConstantInt>(SI.getCondition()))
+    return true;
+  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
+    if (isa<ConstantInt>(V))
+      return true;
+
+  // Otherwise, we need to accumulate a cost proportional to the number of
+  // distinct successor blocks. This fan-out in the CFG cannot be represented
+  // for free even if we can represent the core switch as a jumptable that
+  // takes a single instruction.
+  //
+  // NB: We convert large switches which are just used to initialize large phi
+  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
+  // inlining those. It will prevent inlining in cases where the optimization
+  // does not (yet) fire.
+  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
+  SuccessorBlocks.insert(SI.getDefaultDest());
+  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
+    SuccessorBlocks.insert(I.getCaseSuccessor());
+  // Add cost corresponding to the number of distinct destinations. The first
+  // we model as free because of fallthrough.
+  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+  return false;
+}
+
+bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
+  // Functions containing an indirectbr are never inlined. Doing so would be
+  // incorrect: every blockaddress (in static global initializers, for
+  // example) would still refer to the original function, so the indirect
+  // jump in the inlined copy would land in the original function, which is
+  // extremely undefined behavior.
+  //
+  // FIXME: This logic isn't really right. Inlining such a function is safe
+  // as long as no other function or global references the blockaddress of a
+  // block within the current function.
+  HasIndirectBr = true;
+  return false;
+}
+
+bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
+  // Not free: returning false lets the caller charge the base instruction
+  // cost for a resume.
+  // FIXME: A single instruction may not be an accurate model for the inline
+  // cost of a resume instruction.
+  return false;
+}
+
+bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
+  // Not free: returning false lets the caller charge the base instruction
+  // cost for a cleanupret.
+  // FIXME: A single instruction may not be an accurate model for the inline
+  // cost of a cleanupret instruction.
+  return false;
+}
+
+bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
+  // Not free: returning false lets the caller charge the base instruction
+  // cost for a catchret.
+  // FIXME: A single instruction may not be an accurate model for the inline
+  // cost of a catchret instruction.
+  return false;
+}
+
+bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
+  // No actual code is needed for unreachable, so it is reported as free.
+  // FIXME: It might be reasonable to also discount the cost of instructions
+  // leading to unreachable, as they have the lowest possible impact on both
+  // runtime and code size.
+  return true;
+}
+
  bool CallAnalyzer::visitInstruction(Instruction &I) {
    // Some instructions are free. All of the free intrinsics can also be
    // handled by SROA, etc.
-  if (isInstructionFree(&I, TD))
+  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
      return true;
  
    // We found something we don't understand or can't handle. Mark any SROA-able
@@ -801,25 +946,60 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
  /// aborts early if the threshold has been exceeded or an impossible to inline
  /// construct has been detected. It returns false if inlining is no longer
  /// viable, and true if inlining remains viable.
-bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
-  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
-       I != E; ++I) {
+bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
+                                SmallPtrSetImpl<const Value *> &EphValues) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // FIXME: Currently, the number of instructions in a function regardless of
+    // our ability to simplify them during inline to constants or dead code,
+    // are actually used by the vector bonus heuristic. As long as that's true,
+    // we have to special case debug intrinsics here to prevent differences in
+    // inlining due to debug symbols. Eventually, the number of unsimplified
+    // instructions shouldn't factor into the cost computation, but until then,
+    // hack around it here.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Skip ephemeral values.
+    if (EphValues.count(&*I))
+      continue;
+
      ++NumInstructions;
      if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
        ++NumVectorInstructions;
  
+    // If the instruction is floating point, and the target says this operation
+    // is expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call. Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as
+      // expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
      // If the instruction simplified to a constant, there is no cost to this
      // instruction. Visit the instructions using our InstVisitor to account for
      // all of the per-instruction logic. The visit tree returns true if we
      // consumed the instruction in any way, and false if the instruction's base
      // cost should count against inlining.
-    if (Base::visit(I))
+    if (Base::visit(&*I))
        ++NumInstructionsSimplified;
      else
        Cost += InlineConstants::InstrCost;
  
      // If the visit this instruction detected an uninlinable pattern, abort.
-    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+        HasIndirectBr || HasFrameEscape)
        return false;
  
      // If the caller is a recursive function then we don't want to inline
@@ -829,16 +1009,9 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;
  
-    if (NumVectorInstructions > NumInstructions/2)
-      VectorBonus = FiftyPercentVectorBonus;
-    else if (NumVectorInstructions > NumInstructions/10)
-      VectorBonus = TenPercentVectorBonus;
-    else
-      VectorBonus = 0;
-
-    // Check if we've past the threshold so we don't spin in huge basic
-    // blocks that will never inline.
-    if (Cost > (Threshold + VectorBonus))
+    // Check if we've passed the maximum possible threshold so we don't spin
+    // in huge basic blocks that will never inline.
+    if (Cost > Threshold)
        return false;
    }
  
@@ -852,10 +1025,11 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  /// returns 0 if V is not a pointer, and returns the constant '0' if there are
  /// no constant offsets applied.
  ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
-  if (!TD || !V->getType()->isPointerTy())
-    return 0;
+  if (!V->getType()->isPointerTy())
+    return nullptr;
  
-  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    APInt Offset = APInt::getNullValue(IntPtrWidth);
  
    // Even though we don't look through PHI nodes, we could be called on an
@@ -865,7 +1039,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
    do {
      if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
        if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
-        return 0;
+        return nullptr;
        V = GEP->getPointerOperand();
      } else if (Operator::getOpcode(V) == Instruction::BitCast) {
        V = cast<Operator>(V)->getOperand(0);
@@ -877,9 +1051,9 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
        break;
      }
      assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
  
-  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
+  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
    return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
  }
  
@@ -893,33 +1067,42 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  bool CallAnalyzer::analyzeCall(CallSite CS) {
    ++NumCallsAnalyzed;
  
-  // Track whether the post-inlining function would have more than one basic
-  // block. A single basic block is often intended for inlining. Balloon the
-  // threshold by 50% until we pass the single-BB phase.
-  bool SingleBB = true;
-  int SingleBBBonus = Threshold / 2;
-  Threshold += SingleBBBonus;
-
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.
  
    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
-  // low.
+  // low. Note that these bonuses are somewhat arbitrary and evolved over time
+  // by accident as much as because they are principled bonuses.
+  //
+  // FIXME: It would be nice to remove all such bonuses. At least it would be
+  // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);
-  FiftyPercentVectorBonus = Threshold;
-  TenPercentVectorBonus = Threshold / 2;
+  FiftyPercentVectorBonus = 3 * Threshold / 2;
+  TenPercentVectorBonus = 3 * Threshold / 4;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Track whether the post-inlining function would have more than one basic
+  // block. A single basic block is often intended for inlining. Balloon the
+  // threshold by 50% until we pass the single-BB phase.
+  bool SingleBB = true;
+  int SingleBBBonus = Threshold / 2;
+
+  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+  // this Threshold any time, and cost cannot decrease, we can stop processing
+  // the rest of the function body.
+  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
  
    // Give out bonuses per argument, as the instructions setting them up will
    // be gone after inlining.
    for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-    if (TD && CS.isByValArgument(I)) {
+    if (CS.isByValArgument(I)) {
        // We approximate the number of loads and stores needed by dividing the
        // size of the byval type by the target's pointer size.
        PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-      unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
-      unsigned PointerSize = TD->getPointerSizeInBits();
+      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = DL.getPointerSizeInBits();
        // Ceiling division.
        unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
  
@@ -953,9 +1136,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    Instruction *Instr = CS.getInstruction();
    if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
-      Threshold = 1;
+      Threshold = 0;
    } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
-    Threshold = 1;
+    Threshold = 0;
  
    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
@@ -971,9 +1154,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
  
    Function *Caller = CS.getInstruction()->getParent()->getParent();
    // Check if the caller function is recursive itself.
-  for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
-       U != E; ++U) {
-    CallSite Site(cast<Value>(*U));
+  for (User *U : Caller->users()) {
+    CallSite Site(U);
      if (!Site)
        continue;
      Instruction *I = Site.getInstruction();
@@ -983,10 +1165,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
      }
    }
  
-  // Track whether we've seen a return instruction. The first return
-  // instruction is free, as at least one will usually disappear in inlining.
-  bool HasReturn = false;
-
    // Populate our simplified values by mapping from function arguments to call
    // arguments with known important simplifications.
    CallSite::arg_iterator CAI = CS.arg_begin();
@@ -994,15 +1172,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
         FAI != FAE; ++FAI, ++CAI) {
      assert(CAI != CS.arg_end());
      if (Constant *C = dyn_cast<Constant>(CAI))
-      SimplifiedValues[FAI] = C;
+      SimplifiedValues[&*FAI] = C;
  
      Value *PtrArg = *CAI;
      if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
-      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());
+      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
  
        // We can SROA any pointer arguments derived from alloca instructions.
        if (isa<AllocaInst>(PtrArg)) {
-        SROAArgValues[FAI] = PtrArg;
+        SROAArgValues[&*FAI] = PtrArg;
          SROAArgCosts[PtrArg] = 0;
        }
      }
@@ -1011,6 +1189,12 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
    NumAllocaArgs = SROAArgValues.size();
  
+  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
+  // the ephemeral values multiple times (and they're completely determined by
+  // the callee, so this is purely duplicate work).
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);
+
    // The worklist of live basic blocks in the callee *after* inlining. We avoid
    // adding basic blocks of the callee which can be proven to be dead for this
    // particular call site in order to get more accurate cost estimates. This
@@ -1026,40 +1210,27 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      // Bail out the moment we cross the threshold. This means we'll under-count
      // the cost, but only when undercounting doesn't matter.
-    if (Cost > (Threshold + VectorBonus))
+    if (Cost > Threshold)
        break;
  
      BasicBlock *BB = BBWorklist[Idx];
      if (BB->empty())
        continue;
  
-    // Handle the terminator cost here where we can track returns and other
-    // function-wide constructs.
-    TerminatorInst *TI = BB->getTerminator();
-
-    // We never want to inline functions that contain an indirectbr.  This is
-    // incorrect because all the blockaddress's (in static global initializers
-    // for example) would be referring to the original function, and this
-    // indirect jump would jump from the inlined copy of the function into the 
-    // original function which is extremely undefined behavior.
-    // FIXME: This logic isn't really right; we can safely inline functions
-    // with indirectbr's as long as no other function or global references the
-    // blockaddress of a block within the current function.  And as a QOI issue,
-    // if someone is using a blockaddress without an indirectbr, and that
-    // reference somehow ends up in another function or global, we probably
-    // don't want to inline this function.
-    if (isa<IndirectBrInst>(TI))
+    // Disallow inlining a blockaddress. A blockaddress only has defined
+    // behavior for an indirect branch in the same function, and we do not
+    // currently support inlining indirect branches. But, the inliner may not
+    // see an indirect branch that ends up being dead code at a particular call
+    // site. If the blockaddress escapes the function, e.g., via a global
+    // variable, inlining may lead to an invalid cross-function reference.
+    if (BB->hasAddressTaken())
        return false;
  
-    if (!HasReturn && isa<ReturnInst>(TI))
-      HasReturn = true;
-    else
-      Cost += InlineConstants::InstrCost;
-
      // Analyze the cost of this block. If we blow through the threshold, this
      // returns false, and we can bail on out.
-    if (!analyzeBlock(BB)) {
-      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+    if (!analyzeBlock(BB, EphValues)) {
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+          HasIndirectBr || HasFrameEscape)
          return false;
  
        // If the caller is a recursive function then we don't want to inline
@@ -1072,6 +1243,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
        break;
      }
  
+    TerminatorInst *TI = BB->getTerminator();
+
      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -1109,55 +1282,66 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
      }
    }
  
-  // If this is a noduplicate call, we can still inline as long as 
+  // If this is a noduplicate call, we can still inline as long as
    // inlining this would cause the removal of the caller (so the instruction
    // is not actually duplicated, just moved).
    if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
      return false;
  
-  Threshold += VectorBonus;
+  // We applied the maximum possible vector bonus at the beginning. Now,
+  // subtract the excess bonus, if any, from the Threshold before
+  // comparing against Cost.
+  if (NumVectorInstructions <= NumInstructions / 10)
+    Threshold -= FiftyPercentVectorBonus;
+  else if (NumVectorInstructions <= NumInstructions / 2)
+    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
  
-  return Cost < Threshold;
+  return Cost <= std::max(0, Threshold);
  }
  
  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// \brief Dump stats about this call's analysis.
  void CallAnalyzer::dump() {
-#define DEBUG_PRINT_STAT(x) llvm::dbgs() << "      " #x ": " << x << "\n"
+#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
    DEBUG_PRINT_STAT(NumConstantArgs);
    DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
    DEBUG_PRINT_STAT(NumAllocaArgs);
    DEBUG_PRINT_STAT(NumConstantPtrCmps);
    DEBUG_PRINT_STAT(NumConstantPtrDiffs);
    DEBUG_PRINT_STAT(NumInstructionsSimplified);
+  DEBUG_PRINT_STAT(NumInstructions);
    DEBUG_PRINT_STAT(SROACostSavings);
    DEBUG_PRINT_STAT(SROACostSavingsLost);
    DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
+  DEBUG_PRINT_STAT(Cost);
+  DEBUG_PRINT_STAT(Threshold);
  #undef DEBUG_PRINT_STAT
  }
  #endif
  
  INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                        true, true)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
  
  char InlineCostAnalysis::ID = 0;
  
-InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID), TD(0) {}
+InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}
  
  InlineCostAnalysis::~InlineCostAnalysis() {}
  
  void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.setPreservesAll();
-  AU.addRequired<TargetTransformInfo>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }
  
  bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
-  TD = getAnalysisIfAvailable<DataLayout>();
-  TTI = &getAnalysis<TargetTransformInfo>();
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  ACT = &getAnalysis<AssumptionCacheTracker>();
    return false;
  }
  
@@ -1165,6 +1349,24 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
    return getInlineCost(CS, CS.getCalledFunction(), Threshold);
  }
  
+/// \brief Test that two functions either both have or both lack the given
+///        attribute.
+template<typename AttrKind>
+static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
+  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
+}
+
+/// \brief Test that there are no attribute conflicts between Caller and Callee
+///        that prevent inlining.
+static bool functionsHaveCompatibleAttributes(Function *Caller,
+                                              Function *Callee,
+                                              TargetTransformInfo &TTI) {
+  return TTI.areInlineCompatible(Caller, Callee) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
+}
+
  InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                               int Threshold) {
    // Cannot inline indirect calls.
@@ -1173,26 +1375,33 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
  
    // Calls to functions with always-inline attributes should be inlined
    // whenever possible.
-  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::AlwaysInline)) {
+  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
      if (isInlineViable(*Callee))
        return llvm::InlineCost::getAlways();
      return llvm::InlineCost::getNever();
    }
  
+  // Never inline functions with conflicting attributes (unless callee has
+  // always-inline attribute).
+  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
+                                         TTIWP->getTTI(*Callee)))
+    return llvm::InlineCost::getNever();
+
+  // Don't inline this call if the caller has the optnone attribute.
+  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+    return llvm::InlineCost::getNever();
+
    // Don't inline functions which can be redefined at link-time to mean
    // something else.  Don't inline functions marked noinline or call sites
    // marked noinline.
    if (Callee->mayBeOverridden() ||
-      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::NoInline) ||
-      CS.isNoInline())
+      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
      return llvm::InlineCost::getNever();
  
    DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
          << "...\n");
  
-  CallAnalyzer CA(TD, *TTI, *Callee, Threshold);
+  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
    bool ShouldInline = CA.analyzeCall(CS);
  
    DEBUG(CA.dump());
@@ -1207,17 +1416,15 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
  }
  
  bool InlineCostAnalysis::isInlineViable(Function &F) {
-  bool ReturnsTwice =
-    F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::ReturnsTwice);
+  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
    for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
-    // Disallow inlining of functions which contain an indirect branch.
-    if (isa<IndirectBrInst>(BI->getTerminator()))
+    // Disallow inlining of functions which contain indirect branches or
+    // blockaddresses.
+    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
        return false;
  
-    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
-         ++II) {
-      CallSite CS(II);
+    for (auto &II : *BI) {
+      CallSite CS(&II);
        if (!CS)
          continue;
  
@@ -1230,6 +1437,13 @@ bool InlineCostAnalysis::isInlineViable(Function &F) {
        if (!ReturnsTwice && CS.isCall() &&
            cast<CallInst>(CS.getInstruction())->canReturnTwice())
          return false;
+
+      // Disallow inlining functions that call @llvm.localescape. Doing this
+      // correctly would require major changes to the inliner.
+      if (CS.getCalledFunction() &&
+          CS.getCalledFunction()->getIntrinsicID() ==
+              llvm::Intrinsic::localescape)
+        return false;
      }
    }