//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;
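
  // Flags tracking properties of the callee (and of this call) discovered
  // during the walk; several of them, once set, abort the analysis outright.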
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;
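  // For example, a GEP four bytes into a pointer argument %base would map
  // here to the pair (%base, 4).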

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *); void visit(Module &);
  void visit(Function *); void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;
  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }
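
    // Otherwise this is an array or vector index; scale the index by the
    // allocation size of the element type (e.g. index 3 into [8 x i32] adds
    // 3 * 4 = 12 bytes).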
    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
                                               COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
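  // Call site argument attributes are addressed with a 1-based index here;
  // attribute index 0 refers to the return value.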
  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
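  // (For example, two GEPs at constant offsets 4 and 8 off of the same
  // argument compare exactly as the constants 4 and 8 would.)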
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;
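
  // Past this point every argument must be, or simplify to, a constant; e.g.
  // a call to @llvm.ctpop.i32 whose operand simplified to 255 folds to 8.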
  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::frameescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
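  // (E.g. if the simulated inline of the discovered target comes back with a
  // cost of 100, we credit IndirectCallThreshold - 100 to this call site.)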
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
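  // (For example, a switch with a default and four distinct case targets
  // costs 4 * InstrCost here.)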
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return true;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // compile time.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this
    // may eventually become a library call. Treat the cost as such.
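    // (On a soft-float target an 'fadd float', for instance, typically
    // lowers to a library routine such as __addsf3.)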
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);
  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);
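
      // For example, a 32-byte (256-bit) byval struct with 64-bit pointers
      // yields ceil(256 / 64) = 4 words, crediting 2 * 4 * InstrCost below.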
      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If the instruction after the call, or if the normal destination of the
  // invoke is an unreachable instruction, the function is noreturn. As such,
  // there is little point in inlining this unless there is literally zero
  // cost.
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
      Threshold = 1;
  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
    Threshold = 1;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
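  // Ephemeral values are those used only by @llvm.assume intrinsics; they
  // contribute nothing to the final code, so blocks are costed without them.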
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
          HasIndirectBr || HasFrameEscape)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would
      // increase the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
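  // For instance, at the default threshold of 225 the speculative vector
  // bonus was 3 * 225 / 2 = 337; a callee with at most 10% vector
  // instructions gives all of it back here.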
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < Threshold;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                    true, true)

char InlineCostAnalysis::ID = 0;

InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}

InlineCostAnalysis::~InlineCostAnalysis() {}

void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  CallGraphSCCPass::getAnalysisUsage(AU);
}

bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  ACT = &getAnalysis<AssumptionCacheTracker>();
  return false;
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}

/// \brief Test that two functions either both have or both lack the given
/// attribute.
template<typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.hasCompatibleFunctionAttributes(Caller, Callee) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                             int Threshold) {
  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
                                         TTIWP->getTTI(*Callee)))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (Callee->mayBeOverridden() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool InlineCostAnalysis::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
         ++II) {
      CallSite CS(II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.frameescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::frameescape)
        return false;
    }
  }

  return true;
}