From b18412ca5b154559691c5d875fd382e4865f6eab Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Mon, 2 Nov 2015 21:24:00 +0000 Subject: [PATCH] In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality. Consider a simple example shown below: A | B⇆C | D When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D. This patch filters those cold blocks off from the loop chain by comparing the ratio: LoopBBFreq / LoopFreq to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa. Differential revision: http://reviews.llvm.org/D11662 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251833 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineBlockPlacement.cpp | 48 ++++++- .../X86/code_placement_cold_loop_blocks.ll | 122 ++++++++++++++++++ 2 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/code_placement_cold_loop_blocks.ll diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 0383d2c56dd..b12fcf2940e 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,12 @@ static cl::opt OutlineOptionalThreshold( "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt LoopToColdBlockRatio( + "loop-to-cold-block-ratio", + cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " + "(frequency of block) is greater than this ratio"), + cl::init(5), cl::Hidden); + static cl::opt PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " @@ -263,6 +269,7 @@ class MachineBlockPlacement : public MachineFunctionPass { const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L); void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); @@ -960,6 +967,42 @@ void MachineBlockPlacement::rotateLoopWithProfile( } } +/// \brief Collect blocks in the given loop that are to be placed. +/// +/// When profile data is available, exclude cold blocks from the returned set; +/// otherwise, collect all blocks in the loop. +MachineBlockPlacement::BlockFilterSet +MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { + BlockFilterSet LoopBlockSet; + + // Filter cold blocks off from LoopBlockSet when profile data is available. + // Collect the sum of frequencies of incoming edges to the loop header from + // outside. If we treat the loop as a super block, this is the frequency of + // the loop. Then for each block in the loop, we calculate the ratio between + // its frequency and the frequency of the loop block. When it is too small, + // don't add it to the loop chain. If there are outer loops, then this block + // will be merged into the first outer loop chain for which this block is not + // cold anymore. This needs precise profile data and we only do this when + // profile data is available. + if (F.getFunction()->getEntryCount()) { + BlockFrequency LoopFreq(0); + for (auto LoopPred : L.getHeader()->predecessors()) + if (!L.contains(LoopPred)) + LoopFreq += MBFI->getBlockFreq(LoopPred) * + MBPI->getEdgeProbability(LoopPred, L.getHeader()); + + for (MachineBasicBlock *LoopBB : L.getBlocks()) { + auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); + if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) + continue; + LoopBlockSet.insert(LoopBB); + } + } else + LoopBlockSet.insert(L.block_begin(), L.block_end()); + + return LoopBlockSet; +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -974,7 +1017,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, buildLoopChains(F, *InnerLoop); SmallVector BlockWorkList; - BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); // Check if we have profile data for this function. If yes, we will rotate // this loop by modeling costs more precisely which requires the profile data @@ -1005,7 +1048,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, SmallPtrSet UpdatedPreds; assert(LoopChain.LoopPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : L.getBlocks()) { + + for (MachineBasicBlock *LoopBB : LoopBlockSet) { BlockChain &Chain = *BlockToChain[LoopBB]; if (!UpdatedPreds.insert(&Chain).second) continue; diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll new file mode 100644 index 00000000000..592d1ce45bb --- /dev/null +++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll @@ -0,0 +1,122 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK + +define void @foo() !prof !1 { +; Test if a cold block in a loop will be placed at the end of the function +; chain. +; +; CHECK-LABEL: foo: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq f +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !4 + +if.then: + call void @c() + br label %if.end + +if.else: + call void @d() + br label %if.end + +if.end: + call void @e() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end, !prof !5 + +end: + call void @f() + ret void +} + +define void @nested_loop_0() !prof !1 { +; Test if a block that is cold in the inner loop but not cold in the outer loop +; will merged to the outer loop chain. +; +; CHECK-LABEL: nested_loop_0: +; CHECK: callq c +; CHECK: callq d +; CHECK: callq e +; CHECK: callq b +; CHECK: callq f + +entry: + br label %header + +header: + call void @b() + %call4 = call zeroext i1 @a() + br i1 %call4, label %header2, label %end + +header2: + call void @c() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @d() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header2, label %header, !prof !3 + +if.else: + call void @e() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %header, !prof !3 + +end: + call void @f() + ret void +} + +define void @nested_loop_1() !prof !1 { +; Test if a cold block in an inner loop will be placed at the end of the +; function chain. +; +; CHECK-LABEL: nested_loop_1: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq e +; CHECK: callq d + +entry: + br label %header + +header: + call void @b() + br label %header2 + +header2: + call void @c() + %call = call zeroext i1 @a() + br i1 %call, label %end, label %if.else, !prof !4 + +if.else: + call void @d() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %header, !prof !5 + +end: + call void @e() + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 100, i32 1} +!3 = !{!"branch_weights", i32 1, i32 10} +!4 = !{!"branch_weights", i32 1000, i32 1} +!5 = !{!"branch_weights", i32 100, i32 1} -- 2.34.1