X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FCodeGen%2FMachineBlockPlacement.cpp;fp=lib%2FCodeGen%2FMachineBlockPlacement.cpp;h=0383d2c56dd9f73d8e67329d0fa6181e14ffeebf;hp=84cc9bcdb60db08d95aefeec12a993589b45d61d;hb=f2558c2db4a9820e578a212f5e6faf36c9e46f1d;hpb=e61c7b5220cd118c005cba94fc14e2598b68f59e diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 84cc9bcdb60..0383d2c56dd 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,23 @@ static cl::opt OutlineOptionalThreshold( "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt + PreciseRotationCost("precise-rotation-cost", + cl::desc("Model the cost of loop rotation more " + "precisely by using profile data."), + cl::init(false), cl::Hidden); + +static cl::opt MisfetchCost( + "misfetch-cost", + cl::desc("Cost that models the probablistic risk of an instruction " + "misfetch due to a jump comparing to falling through, whose cost " + "is zero."), + cl::init(1), cl::Hidden); + +static cl::opt JumpInstCost("jump-inst-cost", + cl::desc("Cost of jump instructions."), + cl::init(1), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -249,6 +266,8 @@ class MachineBlockPlacement : public MachineFunctionPass { void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); public: @@ -791,6 +810,156 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we can determine the cost in terms of missed fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of cost to consider for each rotation: +/// 1. The possibly missed fall through edge (if it exists) from BB out of +/// the loop to the loop header. +/// 2. The possibly missed fall through edges (if they exist) from the loop +/// exits to BB out of the loop. +/// 3. The missed fall through edge (if it exists) from the last BB to the +/// first BB in the loop chain. +/// Therefore, the cost for a given rotation is the sum of costs listed above. +/// We select the best rotation with the smallest cost. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + auto RotationPos = LoopChain.end(); + + BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency(); + + // A utility lambda that scales up a block frequency by dividing it by a + // branch probability which is the reciprocal of the scale. + auto ScaleBlockFrequency = [](BlockFrequency Freq, + unsigned Scale) -> BlockFrequency { + if (Scale == 0) + return 0; + // Use operator / between BlockFrequency and BranchProbability to implement + // saturating multiplication. + return Freq / BranchProbability(1, Scale); + }; + + // Compute the cost of the missed fall-through edge to the loop header if the + // chain head is not the loop header. As we only consider natural loops with + // single header, this computation can be done only once. + BlockFrequency HeaderFallThroughCost(0); + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + auto EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); + auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); + HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); + } + } + + // Here we collect all exit blocks in the loop, and for each exit we find out + // its hottest exit edge. For each loop rotation, we define the loop exit cost + // as the sum of frequencies of exit edges we collect here, excluding the exit + // edge from the tail of the loop chain. + SmallVector, 4> ExitsWithFreq; + for (auto BB : LoopChain) { + uint32_t LargestExitEdgeWeight = 0; + for (auto *Succ : BB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); + LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + } + } + if (LargestExitEdgeWeight > 0) { + uint32_t WeightScale = 0; + uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); + auto ExitFreq = + MBFI->getBlockFreq(BB) * + BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + ExitsWithFreq.emplace_back(BB, ExitFreq); + } + } + + // In this loop we iterate every block in the loop chain and calculate the + // cost assuming the block is the head of the loop chain. When the loop ends, + // we should have found the best candidate as the loop chain's head. + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + // TailIter is used to track the tail of the loop chain if the block we are + // checking (pointed by Iter) is the head of the chain. + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB to the top. + BlockFrequency Cost = 0; + + // If the current BB is the loop header, we need to take into account the + // cost of the missed fall through edge from outside of the loop to the + // header. + if (Iter != HeaderIter) + Cost += HeaderFallThroughCost; + + // Collect the loop exit cost by summing up frequencies of all exit edges + // except the one from the chain tail. + for (auto &ExitWithFreq : ExitsWithFreq) + if (TailBB != ExitWithFreq.first) + Cost += ExitWithFreq.second; + + // The cost of breaking the once fall-through edge from the tail to the top + // of the loop chain. Here we need to consider three cases: + // 1. If the tail node has only one successor, then we will get an + // additional jmp instruction. So the cost here is (MisfetchCost + + // JumpInstCost) * tail node frequency. + // 2. If the tail node has two successors, then we may still get an + // additional jmp instruction if the layout successor after the loop + // chain is not its CFG successor. Note that the more frequently executed + // jmp instruction will be put ahead of the other one. Assume the + // frequency of those two branches are x and y, where x is the frequency + // of the edge to the chain head, then the cost will be + // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency. + // 3. If the tail node has more than two successors (this rarely happens), + // we won't consider any additional cost. + if (TailBB->isSuccessor(*Iter)) { + auto TailBBFreq = MBFI->getBlockFreq(TailBB); + if (TailBB->succ_size() == 1) + Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(), + MisfetchCost + JumpInstCost); + else if (TailBB->succ_size() == 2) { + auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); + auto TailToHeadFreq = TailBBFreq * TailToHeadProb; + auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) + ? TailBBFreq * TailToHeadProb.getCompl() + : TailToHeadFreq; + Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + + ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); + } + } + + DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) + << " to the top: " << Cost.getFrequency() << "\n"); + + if (Cost < SmallestRotationCost) { + SmallestRotationCost = Cost; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -807,17 +976,25 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, SmallVector BlockWorkList; BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + // Check if we have profile data for this function. If yes, we will rotate + // this loop by modeling costs more precisely which requires the profile data + // for better layout. + bool RotateLoopWithProfile = + PreciseRotationCost && F.getFunction()->getEntryCount(); + // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the // predecessors to the header if there is one which will result in strictly // fewer branches in the loop body. - MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); + // When we use profile data to rotate the loop, this is unnecessary. + MachineBasicBlock *LoopTop = + RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet); // If we selected just the header for the loop top, look for a potentially // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (!RotateLoopWithProfile && LoopTop == L.getHeader()) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -848,7 +1025,11 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + if (RotateLoopWithProfile) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first.