typedef InstVisitor<CallAnalyzer, bool> Base;
friend class InstVisitor<CallAnalyzer, bool>;
- // DataLayout if available, or null.
- const DataLayout *const DL;
-
/// The TargetTransformInfo available for this compilation.
const TargetTransformInfo &TTI;
// The called function.
Function &F;
+ // The candidate callsite being analyzed. Please do not use this to do
+ // analysis in the caller function; we want the inline cost query to be
+ // easily cacheable. Instead, use the cover function paramHasAttr.
+ CallSite CandidateCS;
+
int Threshold;
int Cost;
bool ContainsNoDuplicateCall;
bool HasReturn;
bool HasIndirectBr;
+ bool HasFrameEscape;
/// Number of bytes allocated statically by the callee.
uint64_t AllocatedSize;
bool simplifyCallSite(Function *F, CallSite CS);
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
+ /// Return true if the given argument to the function being considered for
+ /// inlining has the given attribute set either at the call site or the
+ /// function declaration. Primarily used to inspect call site specific
+ /// attributes since these can be more precise than the ones on the callee
+ /// itself.
+ bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
+
+ /// Return true if the given value is known non-null within the callee if
+ /// inlined through this particular callsite.
+ bool isKnownNonNullInCallee(Value *V);
+
// Custom analysis routines.
bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
bool visitUnreachableInst(UnreachableInst &I);
public:
- CallAnalyzer(const DataLayout *DL, const TargetTransformInfo &TTI,
- AssumptionCacheTracker *ACT, Function &Callee, int Threshold)
- : DL(DL), TTI(TTI), ACT(ACT), F(Callee), Threshold(Threshold), Cost(0),
- IsCallerRecursive(false), IsRecursiveCall(false),
+ CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
+ Function &Callee, int Threshold, CallSite CSArg)
+ : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
+ Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
- AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
- FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
- NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
- NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
- NumInstructionsSimplified(0), SROACostSavings(0),
- SROACostSavingsLost(0) {}
+ HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+ NumVectorInstructions(0), FiftyPercentVectorBonus(0),
+ TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
+ NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+ NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+ SROACostSavings(0), SROACostSavingsLost(0) {}
bool analyzeCall(CallSite CS);
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
- if (!DL)
- return false;
-
- unsigned IntPtrWidth = DL->getPointerSizeInBits();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntPtrWidth = DL.getPointerSizeInBits();
assert(IntPtrWidth == Offset.getBitWidth());
for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
// Handle a struct index, which adds its field offset to the pointer.
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = DL->getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
continue;
}
- APInt TypeSize(IntPtrWidth, DL->getTypeAllocSize(GTI.getIndexedType()));
+ APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
}
return true;
// Accumulate the allocated size.
if (I.isStaticAlloca()) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
- AllocatedSize += (DL ? DL->getTypeAllocSize(Ty) :
- Ty->getPrimitiveSizeInBits());
+ AllocatedSize += DL.getTypeAllocSize(Ty);
}
// We will happily inline static alloca instructions.
// Try to fold GEPs of constant-offset call site argument pointers. This
// requires inbounds GEPs.
- if (DL && I.isInBounds()) {
+ if (I.isInBounds()) {
// Check if we have a base + offset for the pointer.
Value *Ptr = I.getPointerOperand();
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
}
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
- const DataLayout *DL = I.getModule()->getDataLayout();
// Propagate constants through ptrtoint.
Constant *COp = dyn_cast<Constant>(I.getOperand(0));
if (!COp)
// Track base/offset pairs when converted to a plain integer provided the
// integer is large enough to represent the pointer.
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
- if (DL && IntegerSize >= DL->getPointerSizeInBits()) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (IntegerSize >= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset
= ConstantOffsetPtrs.lookup(I.getOperand(0));
if (BaseAndOffset.first)
}
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
- const DataLayout *DL = I.getModule()->getDataLayout();
// Propagate constants through inttoptr.
Constant *COp = dyn_cast<Constant>(I.getOperand(0));
if (!COp)
// modifications provided the integer is not too large.
Value *Op = I.getOperand(0);
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
- if (DL && IntegerSize <= DL->getPointerSizeInBits()) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (IntegerSize <= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
if (BaseAndOffset.first)
ConstantOffsetPtrs[&I] = BaseAndOffset;
Constant *COp = dyn_cast<Constant>(Operand);
if (!COp)
COp = SimplifiedValues.lookup(Operand);
- if (COp)
+ if (COp) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
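+   // Attempt target-aware constant folding of this operation on its
+   // (possibly simplified) constant operand.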
if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
COp, DL)) {
SimplifiedValues[&I] = C;
return true;
}
+ }
// Disable any SROA on the argument to arbitrary unary operators.
disableSROA(Operand);
return false;
}
+bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
+ unsigned ArgNo = A->getArgNo();
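+  // Call site attribute indices are 1-based; index 0 refers to the return
+  // value, so argument ArgNo maps to attribute index ArgNo + 1.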
+ return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+}
+
+bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
+ // Does the *call site* have the NonNull attribute set on an argument? We
+ // use the attribute on the call site to memoize any analysis done in the
+ // caller. This will also trip if the callee function has a non-null
+ // parameter attribute, but that's a less interesting case because hopefully
+ // the callee would already have been simplified based on that.
+ if (Argument *A = dyn_cast<Argument>(V))
+ if (paramHasAttr(A, Attribute::NonNull))
+ return true;
+
+ // Is this an alloca in the caller? This is distinct from the attribute case
+ // above because attributes aren't updated within the inliner itself and we
+ // always want to catch the alloca derived case.
+ if (isAllocaDerivedArg(V))
+ // We can actually predict the result of comparisons between an
+ // alloca-derived value and null. Note that this fires regardless of
+ // SROA firing.
+ return true;
+
+ return false;
+}
+
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// First try to handle simplified comparisons.
}
// If the comparison is an equality comparison with null, we can simplify it
- // for any alloca-derived argument.
- if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
- if (isAllocaDerivedArg(I.getOperand(0))) {
- // We can actually predict the result of comparisons between an
- // alloca-derived value and null. Note that this fires regardless of
- // SROA firing.
- bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
- SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
- : ConstantInt::getFalse(I.getType());
- return true;
- }
-
+ // if we know the value (argument) can't be null.
+ if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
+ isKnownNonNullInCallee(I.getOperand(0))) {
+ bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+ SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+ : ConstantInt::getFalse(I.getType());
+ return true;
+ }
// Finally check for SROA candidates in comparisons.
Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ const DataLayout &DL = F.getParent()->getDataLayout();
if (!isa<Constant>(LHS))
if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
LHS = SimpleLHS;
bool CallAnalyzer::visitLoad(LoadInst &I) {
Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+ if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
if (I.isSimple()) {
accumulateSROACost(CostIt, InlineConstants::InstrCost);
return true;
bool CallAnalyzer::visitStore(StoreInst &I) {
Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+ if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
if (I.isSimple()) {
accumulateSROACost(CostIt, InlineConstants::InstrCost);
return true;
case Intrinsic::memmove:
// SROA can usually chew through these intrinsics, but they aren't free.
return false;
+ case Intrinsic::frameescape:
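+   // The inliner cannot correctly handle a callee that uses frameescape;
+   // setting HasFrameEscape makes analyzeBlock reject this candidate.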
+ HasFrameEscape = true;
+ return false;
}
}
// during devirtualization and so we want to give it a hefty bonus for
// inlining, but cap that bonus in the event that inlining wouldn't pan
// out. Pretend to inline the function, with a custom threshold.
- CallAnalyzer CA(DL, TTI, ACT, *F, InlineConstants::IndirectCallThreshold);
+ CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// bonus we want to apply, but don't go below zero.
// If visiting this instruction detected an uninlinable pattern, abort.
if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
- HasIndirectBr)
+ HasIndirectBr || HasFrameEscape)
return false;
// If the caller is a recursive function then we don't want to inline
AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
return false;
- if (NumVectorInstructions > NumInstructions/2)
- VectorBonus = FiftyPercentVectorBonus;
- else if (NumVectorInstructions > NumInstructions/10)
- VectorBonus = TenPercentVectorBonus;
- else
- VectorBonus = 0;
-
- // Check if we've past the threshold so we don't spin in huge basic
- // blocks that will never inline.
- if (Cost > (Threshold + VectorBonus))
+ // Check if we've passed the maximum possible threshold so we don't spin in
+ // huge basic blocks that will never inline.
+ if (Cost > Threshold)
return false;
}
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
- if (!DL || !V->getType()->isPointerTy())
+ if (!V->getType()->isPointerTy())
return nullptr;
- unsigned IntPtrWidth = DL->getPointerSizeInBits();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntPtrWidth = DL.getPointerSizeInBits();
APInt Offset = APInt::getNullValue(IntPtrWidth);
// Even though we don't look through PHI nodes, we could be called on an
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
} while (Visited.insert(V).second);
- Type *IntPtrTy = DL->getIntPtrType(V->getContext());
+ Type *IntPtrTy = DL.getIntPtrType(V->getContext());
return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
bool CallAnalyzer::analyzeCall(CallSite CS) {
++NumCallsAnalyzed;
- // Track whether the post-inlining function would have more than one basic
- // block. A single basic block is often intended for inlining. Balloon the
- // threshold by 50% until we pass the single-BB phase.
- bool SingleBB = true;
- int SingleBBBonus = Threshold / 2;
- Threshold += SingleBBBonus;
-
// Perform some tweaks to the cost and threshold based on the direct
// callsite information.
// We want to more aggressively inline vector-dense kernels, so up the
// threshold, and we'll lower it if the % of vector instructions gets too
- // low.
+ // low. Note that these bonuses are somewhat arbitrary and evolved over time
+ // by accident as much as because they are principled bonuses.
+ //
+ // FIXME: It would be nice to remove all such bonuses. At least it would be
+ // nice to base the bonus values on something more scientific.
assert(NumInstructions == 0);
assert(NumVectorInstructions == 0);
- FiftyPercentVectorBonus = Threshold;
- TenPercentVectorBonus = Threshold / 2;
+ FiftyPercentVectorBonus = 3 * Threshold / 2;
+ TenPercentVectorBonus = 3 * Threshold / 4;
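+ // That is, 150% and 75% of the base threshold. The larger bonus is applied
+ // speculatively below, and the excess is subtracted once the actual vector
+ // density of the callee is known.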
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Track whether the post-inlining function would have more than one basic
+ // block. A single basic block is often intended for inlining. Balloon the
+ // threshold by 50% until we pass the single-BB phase.
+ bool SingleBB = true;
+ int SingleBBBonus = Threshold / 2;
+
+ // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+ // this Threshold any time, and cost cannot decrease, we can stop processing
+ // the rest of the function body.
+ Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
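+ // With a base threshold T, block analysis therefore starts against
+ // T + T/2 + 3T/2 = 3T.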
// Give out bonuses per argument, as the instructions setting them up will
// be gone after inlining.
for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
- if (DL && CS.isByValArgument(I)) {
+ if (CS.isByValArgument(I)) {
// We approximate the number of loads and stores needed by dividing the
// size of the byval type by the target's pointer size.
PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
- unsigned TypeSize = DL->getTypeSizeInBits(PTy->getElementType());
- unsigned PointerSize = DL->getPointerSizeInBits();
+ unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+ unsigned PointerSize = DL.getPointerSizeInBits();
// Ceiling division.
unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
Instruction *Instr = CS.getInstruction();
if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
if (isa<UnreachableInst>(II->getNormalDest()->begin()))
- Threshold = 1;
+ Threshold = 0;
} else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
- Threshold = 1;
+ Threshold = 0;
// If this function uses the coldcc calling convention, prefer not to inline
// it.
for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
// Bail out the moment we cross the threshold. This means we'll under-count
// the cost, but only when undercounting doesn't matter.
- if (Cost > (Threshold + VectorBonus))
+ if (Cost > Threshold)
break;
BasicBlock *BB = BBWorklist[Idx];
// returns false, and we can bail out.
if (!analyzeBlock(BB, EphValues)) {
if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
- HasIndirectBr)
+ HasIndirectBr || HasFrameEscape)
return false;
// If the caller is a recursive function then we don't want to inline
if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
return false;
- Threshold += VectorBonus;
+ // We applied the maximum possible vector bonus at the beginning. Now,
+ // subtract the excess bonus, if any, from the Threshold before
+ // comparing against Cost.
+ if (NumVectorInstructions <= NumInstructions / 10)
+ Threshold -= FiftyPercentVectorBonus;
+ else if (NumVectorInstructions <= NumInstructions / 2)
+ Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
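+ // E.g. a callee with vector density in (10%, 50%] keeps a net bonus of
+ // TenPercentVectorBonus, and one at or below 10% keeps no vector bonus.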
return Cost < Threshold;
}
DEBUG_PRINT_STAT(NumConstantPtrCmps);
DEBUG_PRINT_STAT(NumConstantPtrDiffs);
DEBUG_PRINT_STAT(NumInstructionsSimplified);
+ DEBUG_PRINT_STAT(NumInstructions);
DEBUG_PRINT_STAT(SROACostSavings);
DEBUG_PRINT_STAT(SROACostSavingsLost);
DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
DEBUG_PRINT_STAT(Cost);
DEBUG_PRINT_STAT(Threshold);
- DEBUG_PRINT_STAT(VectorBonus);
#undef DEBUG_PRINT_STAT
}
#endif
/// \brief Test that two functions either both lack the given attribute or
/// both have it with matching values.
-static bool attributeMatches(Function *F1, Function *F2,
- Attribute::AttrKind Attr) {
- return F1->hasFnAttribute(Attr) == F2->hasFnAttribute(Attr);
+template<typename AttrKind>
+static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
+ return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}
/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
- Function *Callee) {
- return attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+ Function *Callee,
+ TargetTransformInfo &TTI) {
+ return TTI.hasCompatibleFunctionAttributes(Caller, Callee) &&
+ attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
attributeMatches(Caller, Callee, Attribute::SanitizeThread);
}
// Never inline functions with conflicting attributes (unless callee has
// always-inline attribute).
- if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee))
+ if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
+ TTIWP->getTTI(*Callee)))
return llvm::InlineCost::getNever();
// Don't inline this call if the caller has the optnone attribute.
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "...\n");
- CallAnalyzer CA(Callee->getParent()->getDataLayout(), TTIWP->getTTI(*Callee),
- ACT, *Callee, Threshold);
+ CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
bool ShouldInline = CA.analyzeCall(CS);
DEBUG(CA.dump());
if (!ReturnsTwice && CS.isCall() &&
cast<CallInst>(CS.getInstruction())->canReturnTwice())
return false;
+
+ // Disallow inlining functions that call @llvm.frameescape. Doing this
+ // correctly would require major changes to the inliner.
+ if (CS.getCalledFunction() &&
+ CS.getCalledFunction()->getIntrinsicID() ==
+ llvm::Intrinsic::frameescape)
+ return false;
}
}