Inliner: Do zero-cost inlines even if above a negative threshold (PR24851)

[oota-llvm.git] / lib / Analysis / InlineCost.cpp
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp

index 12be7fdc14adffb5bbd14be55fb5fef36d36f3e6..26f2e7ff504a8a6ad8f0b013ea7daad7643e1a77 100644 (file)
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -11,28 +11,32 @@
  //
  //===----------------------------------------------------------------------===//
  
-#define DEBUG_TYPE "inline-cost"
  #include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/InstVisitor.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/CallingConv.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Operator.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/Target/TargetData.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
  
  using namespace llvm;
  
+#define DEBUG_TYPE "inline-cost"
+
  STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
  
  namespace {
@@ -41,19 +45,34 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    typedef InstVisitor<CallAnalyzer, bool> Base;
    friend class InstVisitor<CallAnalyzer, bool>;
  
-  // TargetData if available, or null.
-  const TargetData *const TD;
+  /// The TargetTransformInfo available for this compilation.
+  const TargetTransformInfo &TTI;
+
+  /// The cache of @llvm.assume intrinsics.
+  AssumptionCacheTracker *ACT;
  
    // The called function.
    Function &F;
  
+  // The candidate callsite being analyzed. Please do not use this to do
+  // analysis in the caller function; we want the inline cost query to be
+  // easily cacheable. Instead, use the cover function paramHasAttr.
+  CallSite CandidateCS;
+
    int Threshold;
    int Cost;
-  const bool AlwaysInline;
  
-  bool IsRecursive;
+  bool IsCallerRecursive;
+  bool IsRecursiveCall;
    bool ExposesReturnsTwice;
    bool HasDynamicAlloca;
+  bool ContainsNoDuplicateCall;
+  bool HasReturn;
+  bool HasIndirectBr;
+  bool HasFrameEscape;
+
+  /// Number of bytes allocated statically by the callee.
+  uint64_t AllocatedSize;
    unsigned NumInstructions, NumVectorInstructions;
    int FiftyPercentVectorBonus, TenPercentVectorBonus;
    int VectorBonus;
@@ -87,15 +106,24 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    void disableSROA(Value *V);
    void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                            int InstructionCost);
-  bool handleSROACandidate(bool IsSROAValid,
-                           DenseMap<Value *, int>::iterator CostIt,
-                           int InstructionCost);
    bool isGEPOffsetConstant(GetElementPtrInst &GEP);
    bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
+  bool simplifyCallSite(Function *F, CallSite CS);
    ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  
+  /// Return true if the given argument to the function being considered for
+  /// inlining has the given attribute set either at the call site or the
+  /// function declaration.  Primarily used to inspect call site specific
+  /// attributes since these can be more precise than the ones on the callee
+  /// itself. 
+  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
+  
+  /// Return true if the given value is known non null within the callee if
+  /// inlined through this particular callsite. 
+  bool isKnownNonNullInCallee(Value *V);
+
    // Custom analysis routines.
-  bool analyzeBlock(BasicBlock *BB);
+  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
  
    // Disable several entry points to the visitor so we don't accidentally use
    // them by declaring but not defining them here.
@@ -115,24 +143,36 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
    bool visitIntToPtr(IntToPtrInst &I);
    bool visitCastInst(CastInst &I);
    bool visitUnaryInstruction(UnaryInstruction &I);
-  bool visitICmp(ICmpInst &I);
+  bool visitCmpInst(CmpInst &I);
    bool visitSub(BinaryOperator &I);
    bool visitBinaryOperator(BinaryOperator &I);
    bool visitLoad(LoadInst &I);
    bool visitStore(StoreInst &I);
+  bool visitExtractValue(ExtractValueInst &I);
+  bool visitInsertValue(InsertValueInst &I);
    bool visitCallSite(CallSite CS);
+  bool visitReturnInst(ReturnInst &RI);
+  bool visitBranchInst(BranchInst &BI);
+  bool visitSwitchInst(SwitchInst &SI);
+  bool visitIndirectBrInst(IndirectBrInst &IBI);
+  bool visitResumeInst(ResumeInst &RI);
+  bool visitCleanupReturnInst(CleanupReturnInst &RI);
+  bool visitCatchReturnInst(CatchReturnInst &RI);
+  bool visitUnreachableInst(UnreachableInst &I);
  
  public:
-  CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
-    : TD(TD), F(Callee), Threshold(Threshold), Cost(0),
-      AlwaysInline(F.hasFnAttr(Attribute::AlwaysInline)),
-      IsRecursive(false), ExposesReturnsTwice(false), HasDynamicAlloca(false),
-      NumInstructions(0), NumVectorInstructions(0),
-      FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
-      NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-      NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-      NumInstructionsSimplified(0), SROACostSavings(0), SROACostSavingsLost(0) {
-  }
+  /// Build an analyzer for inlining \p Callee through the call site \p CSArg,
+  /// budgeted by \p Threshold. Every cost counter starts at zero and every
+  /// flag recording a detected construct starts cleared.
+  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
+               Function &Callee, int Threshold, CallSite CSArg)
+    : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
+        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
+        ExposesReturnsTwice(false), HasDynamicAlloca(false),
+        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
+        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
+        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
  
    bool analyzeCall(CallSite CS);
  
@@ -204,21 +244,6 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
    SROACostSavings += InstructionCost;
  }
  
-/// \brief Helper for the common pattern of handling a SROA candidate.
-/// Either accumulates the cost savings if the SROA remains valid, or disables
-/// SROA for the candidate.
-bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
-                                       DenseMap<Value *, int>::iterator CostIt,
-                                       int InstructionCost) {
-  if (IsSROAValid) {
-    accumulateSROACost(CostIt, InstructionCost);
-    return true;
-  }
-
-  disableSROA(CostIt);
-  return false;
-}
-
  /// \brief Check whether a GEP's indices are all constant.
  ///
  /// Respects any simplified values known during the analysis of this callsite.
@@ -235,10 +260,8 @@ bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  /// Returns false if unable to compute the offset for any reason. Respects any
  /// simplified values known during the analysis of this callsite.
  bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
-  if (!TD)
-    return false;
-
-  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    assert(IntPtrWidth == Offset.getBitWidth());
  
    for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
@@ -254,24 +277,39 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
      // Handle a struct index, which adds its field offset to the pointer.
      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
        unsigned ElementIdx = OpC->getZExtValue();
-      const StructLayout *SL = TD->getStructLayout(STy);
+      const StructLayout *SL = DL.getStructLayout(STy);
        Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
        continue;
      }
  
-    APInt TypeSize(IntPtrWidth, TD->getTypeAllocSize(GTI.getIndexedType()));
+    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
      Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
    }
    return true;
  }
  
  bool CallAnalyzer::visitAlloca(AllocaInst &I) {
-  // FIXME: Check whether inlining will turn a dynamic alloca into a static
+  // Check whether inlining will turn a dynamic alloca into a static
    // alloca, and handle that case.
+  if (I.isArrayAllocation()) {
+    // The array size may have been simplified to a constant through this call
+    // site's arguments. Only a ConstantInt gives a usable element count; any
+    // other constant falls through to the handling below instead of being
+    // dereferenced as a null ConstantInt in builds without assertions.
+    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
+    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+      // AllocatedSize is tracked in bytes, so scale the element count by the
+      // DataLayout allocation size of the element type (the same measure the
+      // static-alloca path uses). getPrimitiveSizeInBits() would yield bits,
+      // and zero for non-primitive types such as aggregates.
+      const DataLayout &DL = F.getParent()->getDataLayout();
+      Type *Ty = I.getAllocatedType();
+      AllocatedSize += DL.getTypeAllocSize(Ty) * AllocSize->getZExtValue();
+      return Base::visitAlloca(I);
+    }
+  }
+
+  // Accumulate the allocated size.
+  if (I.isStaticAlloca()) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    Type *Ty = I.getAllocatedType();
+    AllocatedSize += DL.getTypeAllocSize(Ty);
+  }
  
-  // We will happily inline static alloca instructions or dynamic alloca
-  // instructions in always-inline situations.
-  if (AlwaysInline || I.isStaticAlloca())
+  // We will happily inline static alloca instructions.
+  if (I.isStaticAlloca())
      return Base::visitAlloca(I);
  
    // FIXME: This is overly conservative. Dynamic allocas are inefficient for
@@ -303,7 +341,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  
    // Try to fold GEPs of constant-offset call site argument pointers. This
    // requires inbounds GEPs.
-  if (TD && I.isInBounds()) {
+  if (I.isInBounds()) {
      // Check if we have a base + offset for the pointer.
      Value *Ptr = I.getPointerOperand();
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
@@ -345,7 +383,10 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  
  bool CallAnalyzer::visitBitCast(BitCastInst &I) {
    // Propagate constants through bitcasts.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -370,7 +411,10 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  
  bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    // Propagate constants through ptrtoint.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -379,7 +423,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    // Track base/offset pairs when converted to a plain integer provided the
    // integer is large enough to represent the pointer.
    unsigned IntegerSize = I.getType()->getScalarSizeInBits();
-  if (TD && IntegerSize >= TD->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize >= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset
        = ConstantOffsetPtrs.lookup(I.getOperand(0));
      if (BaseAndOffset.first)
@@ -398,12 +443,15 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    // Propagate constants through inttoptr.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -413,7 +461,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    // modifications provided the integer is not too large.
    Value *Op = I.getOperand(0);
    unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
-  if (TD && IntegerSize <= TD->getPointerSizeInBits()) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  if (IntegerSize <= DL.getPointerSizeInBits()) {
      std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
      if (BaseAndOffset.first)
        ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -425,12 +474,15 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
    if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitCastInst(CastInst &I) {
    // Propagate constants through casts.
-  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
      if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
        SimplifiedValues[&I] = C;
        return true;
@@ -439,18 +491,22 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
    // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
    disableSROA(I.getOperand(0));
  
-  return isInstructionFree(&I, TD);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
  }
  
  bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    Value *Operand = I.getOperand(0);
-  Constant *Ops[1] = { dyn_cast<Constant>(Operand) };
-  if (Ops[0] || (Ops[0] = SimplifiedValues.lookup(Operand)))
+  Constant *COp = dyn_cast<Constant>(Operand);
+  if (!COp)
+    COp = SimplifiedValues.lookup(Operand);
+  if (COp) {
+    const DataLayout &DL = F.getParent()->getDataLayout();
      if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
-                                               Ops, TD)) {
+                                               COp, DL)) {
        SimplifiedValues[&I] = C;
        return true;
      }
+  }
  
    // Disable any SROA on the argument to arbitrary unary operators.
    disableSROA(Operand);
@@ -458,7 +514,34 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    return false;
  }
  
-bool CallAnalyzer::visitICmp(ICmpInst &I) {
+bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
+  // Ask the candidate call site rather than the callee declaration. The
+  // call-site parameter index is the argument number shifted up by one.
+  return CandidateCS.paramHasAttr(A->getArgNo() + 1, Attr);
+}
+
+bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
+  // First source of knowledge: a NonNull attribute visible through the
+  // candidate call site. The call-site attribute memoizes analysis already
+  // done in the caller. A non-null attribute on the callee's own parameter
+  // also trips this, but that case is less interesting since the callee
+  // would hopefully have been simplified on that basis already.
+  Argument *Arg = dyn_cast<Argument>(V);
+  if (Arg && paramHasAttr(Arg, Attribute::NonNull))
+    return true;
+
+  // Second source: the value derives from an alloca in the caller. This is
+  // checked separately because attributes are not updated inside the inliner
+  // itself, and the alloca-derived case should always be caught. Comparisons
+  // of such a value against null are predictable whether or not SROA fires.
+  return isAllocaDerivedArg(V);
+}
+
+bool CallAnalyzer::visitCmpInst(CmpInst &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    // First try to handle simplified comparisons.
    if (!isa<Constant>(LHS))
@@ -467,20 +550,24 @@ bool CallAnalyzer::visitICmp(ICmpInst &I) {
    if (!isa<Constant>(RHS))
      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
        RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
      if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
          SimplifiedValues[&I] = C;
          return true;
        }
+  }
+
+  if (I.getOpcode() == Instruction::FCmp)
+    return false;
  
    // Otherwise look for a comparison between constant offset pointers with
    // a common base.
    Value *LHSBase, *RHSBase;
    APInt LHSOffset, RHSOffset;
-  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    if (LHSBase) {
-    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
      if (RHSBase && LHSBase == RHSBase) {
        // We have common bases, fold the icmp to a constant based on the
        // offsets.
@@ -495,18 +582,14 @@ bool CallAnalyzer::visitICmp(ICmpInst &I) {
    }
  
    // If the comparison is an equality comparison with null, we can simplify it
-  // for any alloca-derived argument.
-  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
-    if (isAllocaDerivedArg(I.getOperand(0))) {
-      // We can actually predict the result of comparisons between an
-      // alloca-derived value and null. Note that this fires regardless of
-      // SROA firing.
-      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
-      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
-                                        : ConstantInt::getFalse(I.getType());
-      return true;
-    }
-
+  // if we know the value (argument) can't be null
+  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
+      isKnownNonNullInCallee(I.getOperand(0))) {
+    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+                                      : ConstantInt::getFalse(I.getType());
+    return true;
+  }
    // Finally check for SROA candidates in comparisons.
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
@@ -528,9 +611,9 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    Value *LHSBase, *RHSBase;
    APInt LHSOffset, RHSOffset;
-  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    if (LHSBase) {
-    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
      if (RHSBase && LHSBase == RHSBase) {
        // We have common bases, fold the subtract to a constant based on the
        // offsets.
@@ -551,13 +634,20 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
  
  bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  const DataLayout &DL = F.getParent()->getDataLayout();
    if (!isa<Constant>(LHS))
      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
        LHS = SimpleLHS;
    if (!isa<Constant>(RHS))
      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
        RHS = SimpleRHS;
-  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, TD);
+  Value *SimpleV = nullptr;
+  if (auto FI = dyn_cast<FPMathOperator>(&I))
+    SimpleV =
+        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+  else
+    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
      SimplifiedValues[&I] = C;
      return true;
@@ -573,7 +663,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  bool CallAnalyzer::visitLoad(LoadInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -588,7 +678,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
  bool CallAnalyzer::visitStore(StoreInst &I) {
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
      if (I.isSimple()) {
        accumulateSROACost(CostIt, InlineConstants::InstrCost);
        return true;
@@ -600,36 +690,115 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
    return false;
  }
  
+bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
+  // Resolve the aggregate to a constant, either because it already is one or
+  // because it was simplified to one for this call site.
+  Value *Agg = I.getAggregateOperand();
+  Constant *AggC = dyn_cast<Constant>(Agg);
+  if (!AggC)
+    AggC = SimplifiedValues.lookup(Agg);
+
+  // With no constant aggregate the instruction keeps its cost. SROA can still
+  // look through it.
+  if (!AggC)
+    return false;
+
+  // Folding an extractvalue of a constant is trivial.
+  SimplifiedValues[&I] = ConstantExpr::getExtractValue(AggC, I.getIndices());
+  return true;
+}
+
+bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
+  // Constant folding for insert value is trivial.
+  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
+  if (!AggC)
+    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
+  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
+  if (!InsertedC)
+    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
+  if (AggC && InsertedC) {
+    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
+                                                        I.getIndices());
+    return true;
+  }
+
+  // SROA can look through these but give them a cost.
+  return false;
+}
+
+/// \brief Try to simplify a call site.
+///
+/// Takes a concrete function and callsite and tries to actually simplify it by
+/// analyzing the arguments and call itself with instsimplify. Returns true if
+/// it has simplified the callsite to some other entity (a constant), making it
+/// free.
+///
+/// \param F  The function called at \p CS. Note that this parameter shadows
+///           the member F (the callee under analysis) inside this method.
+/// \param CS The call site whose arguments are mapped to constants.
+/// \returns true if the call was folded and the result recorded in
+///          SimplifiedValues; false if the callee cannot be constant folded,
+///          an argument has no constant mapping, or folding failed.
+bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
+  // FIXME: Using the instsimplify logic directly for this is inefficient
+  // because we have to continually rebuild the argument list even when no
+  // simplifications can be performed. Until that is fixed with remapping
+  // inside of instsimplify, directly constant fold calls here.
+  if (!canConstantFoldCallTo(F))
+    return false;
+
+  // Try to re-map the arguments to constants.
+  SmallVector<Constant *, 4> ConstantArgs;
+  ConstantArgs.reserve(CS.arg_size());
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I) {
+    // Prefer a literal constant argument; otherwise fall back to a value
+    // already simplified during this analysis.
+    Constant *C = dyn_cast<Constant>(*I);
+    if (!C)
+      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+    if (!C)
+      return false; // This argument doesn't map to a constant.
+
+    ConstantArgs.push_back(C);
+  }
+  // Every argument is constant: attempt the fold and remember the result for
+  // the call instruction.
+  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+    SimplifiedValues[CS.getInstruction()] = C;
+    return true;
+  }
+
+  return false;
+}
+
  bool CallAnalyzer::visitCallSite(CallSite CS) {
-  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
-      !F.hasFnAttr(Attribute::ReturnsTwice)) {
+  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
+      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
      // This aborts the entire analysis.
      ExposesReturnsTwice = true;
      return false;
    }
+  if (CS.isCall() &&
+      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
+    ContainsNoDuplicateCall = true;
  
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
-    switch (II->getIntrinsicID()) {
-    default:
-      return Base::visitCallSite(CS);
+  if (Function *F = CS.getCalledFunction()) {
+    // When we have a concrete function, first try to simplify it directly.
+    if (simplifyCallSite(F, CS))
+      return true;
  
-    case Intrinsic::memset:
-    case Intrinsic::memcpy:
-    case Intrinsic::memmove:
-      // SROA can usually chew through these intrinsics, but they aren't free.
-      return false;
+    // Next check if it is an intrinsic we know about.
+    // FIXME: Lift this into part of the InstVisitor.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+      switch (II->getIntrinsicID()) {
+      default:
+        return Base::visitCallSite(CS);
+
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+      case Intrinsic::memmove:
+        // SROA can usually chew through these intrinsics, but they aren't free.
+        return false;
+      case Intrinsic::localescape:
+        HasFrameEscape = true;
+        return false;
+      }
      }
-  }
  
-  if (Function *F = CS.getCalledFunction()) {
      if (F == CS.getInstruction()->getParent()->getParent()) {
        // This flag will fully abort the analysis, so don't bother with anything
        // else.
-      IsRecursive = true;
+      IsRecursiveCall = true;
        return false;
      }
  
-    if (!callIsSmall(CS)) {
+    if (TTI.isLoweredToCall(F)) {
        // We account for the average 1 instruction per call argument setup
        // here.
        Cost += CS.arg_size() * InlineConstants::InstrCost;
@@ -662,7 +831,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan
    // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(TD, *F, InlineConstants::IndirectCallThreshold);
+  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
    if (CA.analyzeCall(CS)) {
      // We were able to inline the indirect call! Subtract the cost from the
      // bonus we want to apply, but don't go below zero.
@@ -672,10 +841,93 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
    return Base::visitCallSite(CS);
  }
  
+bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
+  // Only the first return seen is modeled as free: at least one return
+  // instruction disappears after inlining. Later ones are not free.
+  if (HasReturn)
+    return false;
+
+  HasReturn = true;
+  return true;
+}
+
+bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
+  // Unconditional branches are modeled as essentially free. They really
+  // shouldn't exist at all, but handling them keeps the inliner's behavior
+  // regular and predictable.
+  if (BI.isUnconditional())
+    return true;
+
+  // A conditional branch that will fold away is free as well: either its
+  // condition is already a constant integer, or it was simplified to one.
+  Value *Cond = BI.getCondition();
+  if (isa<ConstantInt>(Cond))
+    return true;
+  return dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond)) !=
+         nullptr;
+}
+
+bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
+  // We model unconditional switches as free, see the comments on handling
+  // branches.
+  if (isa<ConstantInt>(SI.getCondition()))
+    return true;
+  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
+    if (isa<ConstantInt>(V))
+      return true;
+
+  // Otherwise, we need to accumulate a cost proportional to the number of
+  // distinct successor blocks. This fan-out in the CFG cannot be represented
+  // for free even if we can represent the core switch as a jumptable that
+  // takes a single instruction.
+  //
+  // NB: We convert large switches which are just used to initialize large phi
+  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
+  // inlining those. It will prevent inlining in cases where the optimization
+  // does not (yet) fire.
+  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
+  SuccessorBlocks.insert(SI.getDefaultDest());
+  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
+    SuccessorBlocks.insert(I.getCaseSuccessor());
+  // Add cost corresponding to the number of distinct destinations. The first
+  // we model as free because of fallthrough.
+  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+  return false;
+}
+
+/// Record that the callee contains an indirectbr by setting HasIndirectBr.
+/// The instruction is never reported as free.
+bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
+  // We never want to inline functions that contain an indirectbr.  This is
+  // incorrect because all the blockaddress's (in static global initializers
+  // for example) would be referring to the original function, and this
+  // indirect jump would jump from the inlined copy of the function into the
+  // original function which is extremely undefined behavior.
+  // FIXME: This logic isn't really right; we can safely inline functions with
+  // indirectbr's as long as no other function or global references the
+  // blockaddress of a block within the current function.
+  HasIndirectBr = true;
+  return false;
+}
+
+/// A resume instruction is never reported as free; no other state is updated.
+bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
+  // FIXME: It's not clear that a single instruction is an accurate model for
+  // the inline cost of a resume instruction.
+  return false;
+}
+
+/// A cleanupret instruction is never reported as free; no other state is
+/// updated.
+bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
+  // FIXME: It's not clear that a single instruction is an accurate model for
+  // the inline cost of a cleanupret instruction.
+  return false;
+}
+
+/// A catchret instruction is never reported as free; no other state is
+/// updated.
+bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
+  // FIXME: It's not clear that a single instruction is an accurate model for
+  // the inline cost of a catchret instruction.
+  return false;
+}
+
+/// An unreachable instruction is always reported as free.
+bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
+  // FIXME: It might be reasonable to discount the cost of instructions leading
+  // to unreachable as they have the lowest possible impact on both runtime and
+  // code size.
+  return true; // No actual code is needed for unreachable.
+}
+
  bool CallAnalyzer::visitInstruction(Instruction &I) {
    // Some instructions are free. All of the free intrinsics can also be
    // handled by SROA, etc.
-  if (isInstructionFree(&I, TD))
+  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
      return true;
  
    // We found something we don't understand or can't handle. Mark any SROA-able
@@ -694,37 +946,72 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
  /// aborts early if the threshold has been exceeded or an impossible to inline
  /// construct has been detected. It returns false if inlining is no longer
  /// viable, and true if inlining remains viable.
-bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
-  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
-       I != E; ++I) {
+bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
+                                SmallPtrSetImpl<const Value *> &EphValues) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // FIXME: Currently, the number of instructions in a function, regardless of
+    // our ability to simplify them during inlining to constants or dead code,
+    // is actually used by the vector bonus heuristic. As long as that's true,
+    // we have to special case debug intrinsics here to prevent differences in
+    // inlining due to debug symbols. Eventually, the number of unsimplified
+    // instructions shouldn't factor into the cost computation, but until then,
+    // hack around it here.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Skip ephemeral values.
+    if (EphValues.count(&*I))
+      continue;
+
      ++NumInstructions;
      if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
        ++NumVectorInstructions;
  
+    // If the instruction is floating point, and the target says this operation
+    // is expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call. Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as
+      // expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
      // If the instruction simplified to a constant, there is no cost to this
      // instruction. Visit the instructions using our InstVisitor to account for
      // all of the per-instruction logic. The visit tree returns true if we
      // consumed the instruction in any way, and false if the instruction's base
      // cost should count against inlining.
-    if (Base::visit(I))
+    if (Base::visit(&*I))
        ++NumInstructionsSimplified;
      else
        Cost += InlineConstants::InstrCost;
  
      // If the visit this instruction detected an uninlinable pattern, abort.
-    if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+        HasIndirectBr || HasFrameEscape)
        return false;
  
-    if (NumVectorInstructions > NumInstructions/2)
-      VectorBonus = FiftyPercentVectorBonus;
-    else if (NumVectorInstructions > NumInstructions/10)
-      VectorBonus = TenPercentVectorBonus;
-    else
-      VectorBonus = 0;
+    // If the caller is a recursive function then we don't want to inline
+    // functions which allocate a lot of stack space because it would increase
+    // the caller stack usage dramatically.
+    if (IsCallerRecursive &&
+        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+      return false;
  
-    // Check if we've past the threshold so we don't spin in huge basic
-    // blocks that will never inline.
-    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
+    // Check if we've passed the maximum possible threshold so we don't spin in
+    // huge basic blocks that will never inline.
+    if (Cost > Threshold)
        return false;
    }
  
@@ -738,10 +1025,11 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  /// returns 0 if V is not a pointer, and returns the constant '0' if there are
  /// no constant offsets applied.
  ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
-  if (!TD || !V->getType()->isPointerTy())
-    return 0;
+  if (!V->getType()->isPointerTy())
+    return nullptr;
  
-  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned IntPtrWidth = DL.getPointerSizeInBits();
    APInt Offset = APInt::getNullValue(IntPtrWidth);
  
    // Even though we don't look through PHI nodes, we could be called on an
@@ -751,7 +1039,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
    do {
      if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
        if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
-        return 0;
+        return nullptr;
        V = GEP->getPointerOperand();
      } else if (Operator::getOpcode(V) == Instruction::BitCast) {
        V = cast<Operator>(V)->getOperand(0);
@@ -763,9 +1051,9 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
        break;
      }
      assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
  
-  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
+  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
    return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
  }
  
@@ -775,86 +1063,107 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  /// viable. It computes the cost and adjusts the threshold based on numerous
  /// factors and heuristics. If this method returns false but the computed cost
  /// is below the computed threshold, then inlining was forcibly disabled by
-/// some artifact of the rountine.
+/// some artifact of the routine.
  bool CallAnalyzer::analyzeCall(CallSite CS) {
    ++NumCallsAnalyzed;
  
+  // Perform some tweaks to the cost and threshold based on the direct
+  // callsite information.
+
+  // We want to more aggressively inline vector-dense kernels, so up the
+  // threshold, and we'll lower it if the % of vector instructions gets too
+  // low. Note that these bonuses are somewhat arbitrary and evolved over time
+  // by accident as much as because they are principled bonuses.
+  //
+  // FIXME: It would be nice to remove all such bonuses. At least it would be
+  // nice to base the bonus values on something more scientific.
+  assert(NumInstructions == 0);
+  assert(NumVectorInstructions == 0);
+  FiftyPercentVectorBonus = 3 * Threshold / 2;
+  TenPercentVectorBonus = 3 * Threshold / 4;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
    // Track whether the post-inlining function would have more than one basic
    // block. A single basic block is often intended for inlining. Balloon the
    // threshold by 50% until we pass the single-BB phase.
    bool SingleBB = true;
    int SingleBBBonus = Threshold / 2;
-  Threshold += SingleBBBonus;
-
-  // Unless we are always-inlining, perform some tweaks to the cost and
-  // threshold based on the direct callsite information.
-  if (!AlwaysInline) {
-    // We want to more aggressively inline vector-dense kernels, so up the
-    // threshold, and we'll lower it if the % of vector instructions gets too
-    // low.
-    assert(NumInstructions == 0);
-    assert(NumVectorInstructions == 0);
-    FiftyPercentVectorBonus = Threshold;
-    TenPercentVectorBonus = Threshold / 2;
-
-    // Give out bonuses per argument, as the instructions setting them up will
-    // be gone after inlining.
-    for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-      if (TD && CS.isByValArgument(I)) {
-        // We approximate the number of loads and stores needed by dividing the
-        // size of the byval type by the target's pointer size.
-        PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-        unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
-        unsigned PointerSize = TD->getPointerSizeInBits();
-        // Ceiling division.
-        unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
-
-        // If it generates more than 8 stores it is likely to be expanded as an
-        // inline memcpy so we take that as an upper bound. Otherwise we assume
-        // one load and one store per word copied.
-        // FIXME: The maxStoresPerMemcpy setting from the target should be used
-        // here instead of a magic number of 8, but it's not available via
-        // TargetData.
-        NumStores = std::min(NumStores, 8U);
-
-        Cost -= 2 * NumStores * InlineConstants::InstrCost;
-      } else {
-        // For non-byval arguments subtract off one instruction per call
-        // argument.
-        Cost -= InlineConstants::InstrCost;
-      }
-    }
  
-    // If there is only one call of the function, and it has internal linkage,
-    // the cost of inlining it drops dramatically.
-    if (F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction())
-      Cost += InlineConstants::LastCallToStaticBonus;
-
-    // If the instruction after the call, or if the normal destination of the
-    // invoke is an unreachable instruction, the function is noreturn.  As such,
-    // there is little point in inlining this unless there is literally zero cost.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
-      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
-        Threshold = 1;
-    } else if (isa<UnreachableInst>(++BasicBlock::iterator(CS.getInstruction())))
-      Threshold = 1;
-
-    // If this function uses the coldcc calling convention, prefer not to inline
-    // it.
-    if (F.getCallingConv() == CallingConv::Cold)
-      Cost += InlineConstants::ColdccPenalty;
-
-    // Check if we're done. This can happen due to bonuses and penalties.
-    if (Cost > Threshold)
-      return false;
+  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+  // this Threshold any time, and cost cannot decrease, we can stop processing
+  // the rest of the function body.
+  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
+
+  // Give out bonuses per argument, as the instructions setting them up will
+  // be gone after inlining.
+  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+    if (CS.isByValArgument(I)) {
+      // We approximate the number of loads and stores needed by dividing the
+      // size of the byval type by the target's pointer size.
+      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = DL.getPointerSizeInBits();
+      // Ceiling division.
+      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+      // If it generates more than 8 stores it is likely to be expanded as an
+      // inline memcpy so we take that as an upper bound. Otherwise we assume
+      // one load and one store per word copied.
+      // FIXME: The maxStoresPerMemcpy setting from the target should be used
+      // here instead of a magic number of 8, but it's not available via
+      // DataLayout.
+      NumStores = std::min(NumStores, 8U);
+
+      Cost -= 2 * NumStores * InlineConstants::InstrCost;
+    } else {
+      // For non-byval arguments subtract off one instruction per call
+      // argument.
+      Cost -= InlineConstants::InstrCost;
+    }
    }
  
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically.
+  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
+    &F == CS.getCalledFunction();
+  if (OnlyOneCallAndLocalLinkage)
+    Cost += InlineConstants::LastCallToStaticBonus;
+
+  // If the instruction after the call, or if the normal destination of the
+  // invoke is an unreachable instruction, the function is noreturn. As such,
+  // there is little point in inlining this unless there is literally zero
+  // cost.
+  Instruction *Instr = CS.getInstruction();
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
+    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
+      Threshold = 0;
+  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
+    Threshold = 0;
+
+  // If this function uses the coldcc calling convention, prefer not to inline
+  // it.
+  if (F.getCallingConv() == CallingConv::Cold)
+    Cost += InlineConstants::ColdccPenalty;
+
+  // Check if we're done. This can happen due to bonuses and penalties.
+  if (Cost > Threshold)
+    return false;
+
    if (F.empty())
      return true;
  
-  // Track whether we've seen a return instruction. The first return
-  // instruction is free, as at least one will usually disappear in inlining.
-  bool HasReturn = false;
+  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  // Check if the caller function is recursive itself.
+  for (User *U : Caller->users()) {
+    CallSite Site(U);
+    if (!Site)
+      continue;
+    Instruction *I = Site.getInstruction();
+    if (I->getParent()->getParent() == Caller) {
+      IsCallerRecursive = true;
+      break;
+    }
+  }
  
    // Populate our simplified values by mapping from function arguments to call
    // arguments with known important simplifications.
@@ -863,15 +1172,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
         FAI != FAE; ++FAI, ++CAI) {
      assert(CAI != CS.arg_end());
      if (Constant *C = dyn_cast<Constant>(CAI))
-      SimplifiedValues[FAI] = C;
+      SimplifiedValues[&*FAI] = C;
  
      Value *PtrArg = *CAI;
      if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
-      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());
+      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
  
        // We can SROA any pointer arguments derived from alloca instructions.
        if (isa<AllocaInst>(PtrArg)) {
-        SROAArgValues[FAI] = PtrArg;
+        SROAArgValues[&*FAI] = PtrArg;
          SROAArgCosts[PtrArg] = 0;
        }
      }
@@ -880,6 +1189,12 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
    NumAllocaArgs = SROAArgValues.size();
  
+  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
+  // the ephemeral values multiple times (and they're completely determined by
+  // the callee, so this is purely duplicate work).
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);
+
    // The worklist of live basic blocks in the callee *after* inlining. We avoid
    // adding basic blocks of the callee which can be proven to be dead for this
    // particular call site in order to get more accurate cost estimates. This
@@ -895,44 +1210,41 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      // Bail out the moment we cross the threshold. This means we'll under-count
      // the cost, but only when undercounting doesn't matter.
-    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
+    if (Cost > Threshold)
        break;
  
      BasicBlock *BB = BBWorklist[Idx];
      if (BB->empty())
        continue;
  
-    // Handle the terminator cost here where we can track returns and other
-    // function-wide constructs.
-    TerminatorInst *TI = BB->getTerminator();
-
-    // We never want to inline functions that contain an indirectbr.  This is
-    // incorrect because all the blockaddress's (in static global initializers
-    // for example) would be referring to the original function, and this indirect
-    // jump would jump from the inlined copy of the function into the original
-    // function which is extremely undefined behavior.
-    // FIXME: This logic isn't really right; we can safely inline functions
-    // with indirectbr's as long as no other function or global references the
-    // blockaddress of a block within the current function.  And as a QOI issue,
-    // if someone is using a blockaddress without an indirectbr, and that
-    // reference somehow ends up in another function or global, we probably
-    // don't want to inline this function.
-    if (isa<IndirectBrInst>(TI))
+    // Disallow inlining a blockaddress. A blockaddress only has defined
+    // behavior for an indirect branch in the same function, and we do not
+    // currently support inlining indirect branches. But, the inliner may not
+    // see an indirect branch that ends up being dead code at a particular call
+    // site. If the blockaddress escapes the function, e.g., via a global
+    // variable, inlining may lead to an invalid cross-function reference.
+    if (BB->hasAddressTaken())
        return false;
  
-    if (!HasReturn && isa<ReturnInst>(TI))
-      HasReturn = true;
-    else
-      Cost += InlineConstants::InstrCost;
-
      // Analyze the cost of this block. If we blow through the threshold, this
      // returns false, and we can bail on out.
-    if (!analyzeBlock(BB)) {
-      if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+    if (!analyzeBlock(BB, EphValues)) {
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+          HasIndirectBr || HasFrameEscape)
+        return false;
+
+      // If the caller is a recursive function then we don't want to inline
+      // functions which allocate a lot of stack space because it would increase
+      // the caller stack usage dramatically.
+      if (IsCallerRecursive &&
+          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
          return false;
+
        break;
      }
  
+    TerminatorInst *TI = BB->getTerminator();
+
      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -955,7 +1267,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
  
      // If we're unable to select a particular successor, just count all of
      // them.
-    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; ++TIdx)
+    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
+         ++TIdx)
        BBWorklist.insert(TI->getSuccessor(TIdx));
  
      // If we had any successors at this point, than post-inlining is likely to
@@ -969,43 +1282,126 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
      }
    }
  
-  Threshold += VectorBonus;
+  // If this is a noduplicate call, we can still inline as long as
+  // inlining this would cause the removal of the callee (so the instruction
+  // is not actually duplicated, just moved).
+  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
+    return false;
+
+  // We applied the maximum possible vector bonus at the beginning. Now,
+  // subtract the excess bonus, if any, from the Threshold before
+  // comparing against Cost.
+  if (NumVectorInstructions <= NumInstructions / 10)
+    Threshold -= FiftyPercentVectorBonus;
+  else if (NumVectorInstructions <= NumInstructions / 2)
+    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
  
-  return AlwaysInline || Cost < Threshold;
+  return Cost <= std::max(0, Threshold);
  }
  
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// \brief Dump stats about this call's analysis.
  void CallAnalyzer::dump() {
-#define DEBUG_PRINT_STAT(x) llvm::dbgs() << "      " #x ": " << x << "\n"
+#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
    DEBUG_PRINT_STAT(NumConstantArgs);
    DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
    DEBUG_PRINT_STAT(NumAllocaArgs);
    DEBUG_PRINT_STAT(NumConstantPtrCmps);
    DEBUG_PRINT_STAT(NumConstantPtrDiffs);
    DEBUG_PRINT_STAT(NumInstructionsSimplified);
+  DEBUG_PRINT_STAT(NumInstructions);
    DEBUG_PRINT_STAT(SROACostSavings);
    DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
+  DEBUG_PRINT_STAT(Cost);
+  DEBUG_PRINT_STAT(Threshold);
  #undef DEBUG_PRINT_STAT
  }
  #endif
  
-InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
+INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
+                      true, true)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
+                    true, true)
+
+char InlineCostAnalysis::ID = 0;
+
+InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}
+
+InlineCostAnalysis::~InlineCostAnalysis() {}
+
+void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+  CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  ACT = &getAnalysis<AssumptionCacheTracker>();
+  return false;
+}
+
+InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
    return getInlineCost(CS, CS.getCalledFunction(), Threshold);
  }
  
-InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
+/// \brief Test that two functions either both have or both lack the given
+///        attribute.
+template<typename AttrKind>
+static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
+  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
+}
+
+/// \brief Test that there are no attribute conflicts between Caller and Callee
+///        that prevent inlining.
+static bool functionsHaveCompatibleAttributes(Function *Caller,
+                                              Function *Callee,
+                                              TargetTransformInfo &TTI) {
+  return TTI.areInlineCompatible(Caller, Callee) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
+}
+
+InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                               int Threshold) {
+  // Cannot inline indirect calls.
+  if (!Callee)
+    return llvm::InlineCost::getNever();
+
+  // Calls to functions with always-inline attributes should be inlined
+  // whenever possible.
+  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
+    if (isInlineViable(*Callee))
+      return llvm::InlineCost::getAlways();
+    return llvm::InlineCost::getNever();
+  }
+
+  // Never inline functions with conflicting attributes (unless callee has
+  // always-inline attribute).
+  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
+                                         TTIWP->getTTI(*Callee)))
+    return llvm::InlineCost::getNever();
+
+  // Don't inline this call if the caller has the optnone attribute.
+  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+    return llvm::InlineCost::getNever();
+
    // Don't inline functions which can be redefined at link-time to mean
    // something else.  Don't inline functions marked noinline or call sites
    // marked noinline.
-  if (!Callee || Callee->mayBeOverridden() ||
-      Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline())
+  if (Callee->mayBeOverridden() ||
+      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
      return llvm::InlineCost::getNever();
  
-  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName() << "...\n");
+  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
+        << "...\n");
  
-  CallAnalyzer CA(TD, *Callee, Threshold);
+  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
    bool ShouldInline = CA.analyzeCall(CS);
  
    DEBUG(CA.dump());
@@ -1018,3 +1414,38 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
  
    return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
  }
+
+bool InlineCostAnalysis::isInlineViable(Function &F) {
+  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Disallow inlining of functions which contain indirect branches or
+    // blockaddresses.
+    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
+      return false;
+
+    for (auto &II : *BI) {
+      CallSite CS(&II);
+      if (!CS)
+        continue;
+
+      // Disallow recursive calls.
+      if (&F == CS.getCalledFunction())
+        return false;
+
+      // Disallow calls which expose returns-twice to a function not previously
+      // attributed as such.
+      if (!ReturnsTwice && CS.isCall() &&
+          cast<CallInst>(CS.getInstruction())->canReturnTwice())
+        return false;
+
+      // Disallow inlining functions that call @llvm.localescape. Doing this
+      // correctly would require major changes to the inliner.
+      if (CS.getCalledFunction() &&
+          CS.getCalledFunction()->getIntrinsicID() ==
+              llvm::Intrinsic::localescape)
+        return false;
+    }
+  }
+
+  return true;
+}