#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <memory>
using namespace llvm;
#define DEBUG_TYPE "jump-threading"
STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
static cl::opt<unsigned>
-Threshold("jump-threading-threshold",
+BBDuplicateThreshold("jump-threading-threshold",
cl::desc("Max block size to duplicate for jump threading"),
cl::init(6), cl::Hidden);
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
namespace {
// These are at global scope so static functions can use them too.
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
/// revectored to the false side of the second if.
///
class JumpThreading : public FunctionPass {
- const DataLayout *DL;
TargetLibraryInfo *TLI;
LazyValueInfo *LVI;
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ bool HasProfileData;
#ifdef NDEBUG
SmallPtrSet<BasicBlock*, 16> LoopHeaders;
#else
#endif
DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
+ unsigned BBDupThreshold;
+
// RAII helper for updating the recursion stack.
struct RecursionSetRemover {
DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
};
public:
static char ID; // Pass identification
- JumpThreading() : FunctionPass(ID) {
+ JumpThreading(int T = -1) : FunctionPass(ID) {
+ BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LazyValueInfo>();
AU.addPreserved<LazyValueInfo>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ void releaseMemory() override {
+ BFI.reset();
+ BPI.reset();
}
void FindLoopHeaders(Function &F);
bool ProcessBranchOnPHI(PHINode *PN);
bool ProcessBranchOnXOR(BinaryOperator *BO);
+ bool ProcessImpliedCondition(BasicBlock *BB);
bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
+
+ private:
+ BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
+ const char *Suffix);
+ void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB,
+ BasicBlock *NewBB, BasicBlock *SuccBB);
};
}
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
-FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
/// runOnFunction - Top level algorithm.
///
return false;
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
LVI = &getAnalysis<LazyValueInfo>();
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+ // successful jump threading, which requires both BPI and BFI being available.
+ HasProfileData = F.getEntryCount().hasValue();
+ if (HasProfileData) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
// Remove unreachable blocks from function as they may result in infinite
// loop. We do threading if we found something profitable. Jump threading a
// branch can create other opportunities. If these opportunities form a cycle
- // i.e. if any jump treading is undoing previous threading in the path, then
+ // i.e. if any jump threading is undoing previous threading in the path, then
// we will loop forever. We take care of this issue by not jump threading for
// back edges. This works for normal cases but not for unreachable blocks as
// they may have cycle with no back edge.
do {
Changed = false;
for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
- BasicBlock *BB = I;
+ BasicBlock *BB = &*I;
// Thread all of the branches we can over this block.
while (ProcessBlock(BB))
Changed = true;
// If the block is trivially dead, zap it. This eliminates the successor
// edges which simplifies the CFG.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock()) {
DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
<< "' with terminator: " << *BB->getTerminator() << '\n');
static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
unsigned Threshold) {
/// Ignore PHI nodes, these will be flattened when duplication happens.
- BasicBlock::const_iterator I = BB->getFirstNonPHI();
+ BasicBlock::const_iterator I(BB->getFirstNonPHI());
// FIXME: THREADING will delete values that are just used to compute the
// branch, so they shouldn't count against the duplication cost.
+ unsigned Bonus = 0;
+ const TerminatorInst *BBTerm = BB->getTerminator();
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to happen.
+ if (isa<SwitchInst>(BBTerm))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(BBTerm))
+ Bonus = 8;
+
+ // Bump the threshold up so the early exit from the loop doesn't skip the
+ // terminator-based Size adjustment at the end.
+ Threshold += Bonus;
+
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
continue;
+ // Bail out if this instruction gives back a token type, it is not possible
+ // to duplicate it if it is used outside this BB.
+ if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+ return ~0U;
+
// All other instructions count for at least one unit.
++Size;
// as having cost of 2 total, and if they are a vector intrinsic, we model
// them as having cost 1.
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (CI->cannotDuplicate())
+ if (CI->cannotDuplicate() || CI->isConvergent())
// Blocks with NoDuplicate are modelled as having infinite cost, so they
// are never duplicated.
return ~0U;
}
}
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to happen.
- if (isa<SwitchInst>(I))
- Size = Size > 6 ? Size-6 : 0;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(I))
- Size = Size > 8 ? Size-8 : 0;
-
- return Size;
+ return Size > Bonus ? Size - Bonus : 0;
}
/// FindLoopHeaders - We do not want jump threading to turn proper loop
assert(Preference == WantInteger && "Compares only produce integers");
PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock())
return false;
// because now the condition in this block can be threaded through
// predecessors of our predecessor block.
if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
- if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
+ const TerminatorInst *TI = SinglePred->getTerminator();
+ if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
// If SinglePred was a loop header, BB becomes one.
if (LoopHeaders.erase(SinglePred))
// Run constant folding to see if we can reduce the condition to a simple
// constant.
if (Instruction *I = dyn_cast<Instruction>(Condition)) {
- Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI);
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
if (SimpleVal) {
I->replaceAllUsesWith(SimpleVal);
I->eraseFromParent();
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
- // For a comparison where the LHS is outside this block, it's possible
- // that we've branched on it before. Used LVI to see if we can simplify
- // the branch based on that.
+ // If we're branching on a conditional, LVI might be able to determine
+ // it's value at the branch instruction. We only handle comparisons
+ // against a constant at this time.
+ // TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
- if (CondBr && CondConst && CondBr->isConditional() && PI != PE &&
- (!isa<Instruction>(CondCmp->getOperand(0)) ||
- cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) {
- // For predecessor edge, determine if the comparison is true or false
- // on that edge. If they're all true or all false, we can simplify the
- // branch.
- // FIXME: We could handle mixed true/false by duplicating code.
- LazyValueInfo::Tristate Baseline =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0),
- CondConst, *PI, BB, CondCmp);
- if (Baseline != LazyValueInfo::Unknown) {
- // Check that all remaining incoming values match the first one.
- while (++PI != PE) {
- LazyValueInfo::Tristate Ret =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(),
- CondCmp->getOperand(0), CondConst, *PI, BB,
- CondCmp);
- if (Ret != Baseline) break;
- }
-
- // If we terminated early, then one of the values didn't match.
- if (PI == PE) {
- unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0;
- unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1;
- CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
- BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
- CondBr->eraseFromParent();
- return true;
- }
- }
-
- } else if (CondBr && CondConst && CondBr->isConditional()) {
- // There might be an invairant in the same block with the conditional
- // that can determine the predicate.
-
+ if (CondBr && CondConst && CondBr->isConditional()) {
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
- CondConst, CondCmp);
+ CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
CondBr->eraseFromParent();
+ if (CondCmp->use_empty())
+ CondCmp->eraseFromParent();
+ else if (CondCmp->getParent() == BB) {
+ // If the fact we just learned is true for all uses of the
+ // condition, replace it with a constant value
+ auto *CI = Ret == LazyValueInfo::True ?
+ ConstantInt::getTrue(CondCmp->getType()) :
+ ConstantInt::getFalse(CondCmp->getType());
+ CondCmp->replaceAllUsesWith(CI);
+ CondCmp->eraseFromParent();
+ }
return true;
}
}
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+ // Search for a stronger dominating condition that can be used to simplify a
+ // conditional branch leaving BB.
+ if (ProcessImpliedCondition(BB))
+ return true;
- // TODO: If we have: "br (X > 0)" and we have a predecessor where we know
- // "(X == 4)", thread through this block.
+ return false;
+}
+
+bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ BasicBlock *CurrentBB = BB;
+ BasicBlock *CurrentPred = BB->getSinglePredecessor();
+ unsigned Iter = 0;
+
+ auto &DL = BB->getModule()->getDataLayout();
+
+ while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+ auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+ if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB)
+ return false;
+
+ if (isImpliedCondition(PBI->getCondition(), Cond, DL)) {
+ BI->getSuccessor(1)->removePredecessor(BB);
+ BranchInst::Create(BI->getSuccessor(0), BI);
+ BI->eraseFromParent();
+ return true;
+ }
+ CurrentBB = CurrentPred;
+ CurrentPred = CurrentBB->getSinglePredecessor();
+ }
return false;
}
if (LoadBB->getSinglePredecessor())
return false;
- // If the load is defined in a landing pad, it can't be partially redundant,
- // because the edges between the invoke and the landing pad cannot have other
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
// instructions between them.
- if (LoadBB->isLandingPad())
+ if (LoadBB->isEHPad())
return false;
Value *LoadedPtr = LI->getOperand(0);
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
- BasicBlock::iterator BBIt = LI;
+ BasicBlock::iterator BBIt(LI);
if (Value *AvailableVal =
- FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
- // If the value if the load is locally available within the block, just use
+ FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) {
+ // If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
//cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+ if (AvailableVal->getType() != LI->getType())
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
LI->replaceAllUsesWith(AvailableVal);
LI->eraseFromParent();
return true;
BasicBlock *PredBB = *PI;
// If we already scanned this predecessor, skip it.
- if (!PredsScanned.insert(PredBB))
+ if (!PredsScanned.insert(PredBB).second)
continue;
// Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
AAMDNodes ThisAATags;
- Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6,
+ Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
+ DefMaxInstsToScan,
nullptr, &ThisAATags);
if (!PredAvailable) {
OneUnavailablePred = PredBB;
}
// Split them out to their own block.
- UnavailablePred =
- SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this);
+ UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
}
// If the value isn't available in all predecessors, then there will be
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
- LoadBB->begin());
+ &LoadBB->front());
PN->takeName(LI);
PN->setDebugLoc(LI->getDebugLoc());
assert(I != AvailablePreds.end() && I->first == P &&
"Didn't find entry for predecessor!");
- PN->addIncoming(I->second, I->first);
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
}
//cerr << "PRE: " << *LI << *PN << "\n";
for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
BasicBlock *Pred = PredValues[i].second;
- if (!SeenPreds.insert(Pred))
+ if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
// If the predecessor ends with an indirect goto, we can't change its
// Into:
// BB':
// %Y = icmp ne i32 %A, %B
- // br i1 %Z, ...
+ // br i1 %Y, ...
PredValueInfoTy XorOpValues;
bool isLHS = true;
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
- if (JumpThreadCost > Threshold) {
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
}
- // And finally, do it! Start by factoring the predecessors is needed.
+ // And finally, do it! Start by factoring the predecessors if needed.
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
BB->getParent(), BB);
NewBB->moveAfter(PredBB);
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
Instruction *New = BI->clone();
New->setName(BI->getName());
NewBB->getInstList().push_back(New);
- ValueMapping[BI] = New;
+ ValueMapping[&*BI] = New;
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
- BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB);
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
// Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
// with the two values we know.
SSAUpdate.Initialize(I->getType(), I->getName());
- SSAUpdate.AddAvailableValue(BB, I);
- SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]);
+ SSAUpdate.AddAvailableValue(BB, &*I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&*I]);
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, DL, TLI);
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
+ UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
// Threaded an edge!
++NumThreads;
return true;
}
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is availble, update the frequency of
+/// this new block.
+BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
+ // Collect the frequencies of all predecessors of BB, which will be used to
+ // update the edge weight on BB->SuccBB.
+ BlockFrequency PredBBFreq(0);
+ if (HasProfileData)
+ for (auto Pred : Preds)
+ PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+
+ BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+
+ // Set the block frequency of the newly created PredBB, which is the sum of
+ // frequencies of Preds.
+ if (HasProfileData)
+ BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
+ return PredBB;
+}
+
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
+void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
+ if (!HasProfileData)
+ return;
+
+ assert(BFI && BPI && "BFI & BPI should have been created here");
+
+ // As the edge from PredBB to BB is deleted, we have to update the block
+ // frequency of BB.
+ auto BBOrigFreq = BFI->getBlockFreq(BB);
+ auto NewBBFreq = BFI->getBlockFreq(NewBB);
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+ auto BBNewFreq = BBOrigFreq - NewBBFreq;
+ BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+ // Collect updated outgoing edges' frequencies from BB and use them to update
+ // edge probabilities.
+ SmallVector<uint64_t, 4> BBSuccFreq;
+ for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ auto SuccFreq = (*I == SuccBB)
+ ? BB2SuccBBFreq - NewBBFreq
+ : BBOrigFreq * BPI->getEdgeProbability(BB, *I);
+ BBSuccFreq.push_back(SuccFreq.getFrequency());
+ }
+
+ uint64_t MaxBBSuccFreq =
+ *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+ SmallVector<BranchProbability, 4> BBSuccProbs;
+ if (MaxBBSuccFreq == 0)
+ BBSuccProbs.assign(BBSuccFreq.size(),
+ {1, static_cast<unsigned>(BBSuccFreq.size())});
+ else {
+ for (uint64_t Freq : BBSuccFreq)
+ BBSuccProbs.push_back(
+ BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+ // Normalize edge probabilities so that they sum up to one.
+ BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+ BBSuccProbs.end());
+ }
+
+ // Update edge probabilities in BPI.
+ for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
+ BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+
+ if (BBSuccProbs.size() >= 2) {
+ SmallVector<uint32_t, 4> Weights;
+ for (auto Prob : BBSuccProbs)
+ Weights.push_back(Prob.getNumerator());
+
+ auto TI = BB->getTerminator();
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+ }
+}
+
/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
/// If we can duplicate the contents of BB up into PredBB do so now, this
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
- if (DuplicationCost > Threshold) {
+ unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
return false;
}
- // And finally, do it! Start by factoring the predecessors is needed.
+ // And finally, do it! Start by factoring the predecessors if needed.
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+ PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
}
// Okay, we decided to do this! Clone all the instructions in BB onto the end
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- PredBB = SplitEdge(PredBB, BB, this);
+ PredBB = SplitEdge(PredBB, BB);
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
-
// Clone the non-phi instructions of BB into PredBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
for (; BI != BB->end(); ++BI) {
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
- if (Value *IV = SimplifyInstruction(New, DL)) {
+ if (Value *IV =
+ SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
delete New;
- ValueMapping[BI] = IV;
+ ValueMapping[&*BI] = IV;
} else {
// Otherwise, insert the new instruction into the block.
New->setName(BI->getName());
- PredBB->getInstList().insert(OldPredBranch, New);
- ValueMapping[BI] = New;
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ ValueMapping[&*BI] = New;
}
}
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
// with the two values we know.
SSAUpdate.Initialize(I->getType(), I->getName());
- SSAUpdate.AddAvailableValue(BB, I);
- SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]);
+ SSAUpdate.AddAvailableValue(BB, &*I);
+ SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&*I]);
while (!UsesToRename.empty())
SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());