From f4230250a101979aa60e111bb1c7bf4a16d9985d Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Thu, 25 Sep 2014 23:14:26 +0000 Subject: [PATCH] [MachineSink+PGO] Teach MachineSink to use BlockFrequencyInfo Machine Sink uses loop depth information to select between successors BBs to sink machine instructions into, where BBs within smaller loop depths are preferable. This patch adds support for choosing between successors by using profile information from BlockFrequencyInfo instead, whenever the information is available. Tested it under SPEC2006 train (average of 30 runs for each program); ~1.5% execution speedup in average on x86-64 darwin. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218472 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineSink.cpp | 29 +++++++++++++++---- test/CodeGen/X86/sink-blockfreq.ll | 45 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 test/CodeGen/X86/sink-blockfreq.ll diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 261af546d72..3ac64d788f6 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -41,6 +42,12 @@ SplitEdges("machine-sink-split", cl::desc("Split critical edges during machine sinking"), cl::init(true), cl::Hidden); +static cl::opt +UseBlockFreqInfo("machine-sink-bfi", + cl::desc("Use block frequency info to find successors to sink"), + cl::init(true), cl::Hidden); + + STATISTIC(NumSunk, "Number of machine instructions sunk"); STATISTIC(NumSplit, "Number of critical edges split"); STATISTIC(NumCoalesces, "Number of copies coalesced"); @@ -53,6 +60,7 @@ namespace { MachineDominatorTree *DT; // Machine dominator tree MachinePostDominatorTree *PDT; // Machine post dominator tree MachineLoopInfo *LI; + const MachineBlockFrequencyInfo *MBFI; AliasAnalysis *AA; // Remember which edges have been considered for breaking. @@ -81,6 +89,8 @@ namespace { AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); + if (UseBlockFreqInfo) + AU.addRequired(); } void releaseMemory() override { @@ -247,6 +257,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { DT = &getAnalysis(); PDT = &getAnalysis(); LI = &getAnalysis(); + MBFI = UseBlockFreqInfo ? &getAnalysis() : nullptr; AA = &getAnalysis(); bool EverMadeChange = false; @@ -566,14 +577,20 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, } // Otherwise, we should look at all the successors and decide which one - // we should sink to. - // We give successors with smaller loop depth higher priority. - SmallVector Succs(MBB->succ_begin(), MBB->succ_end()); - // Sort Successors according to their loop depth. + // we should sink to. If we have reliable block frequency information + // (frequency != 0) available, give successors with smaller frequencies + // higher priority, otherwise prioritize smaller loop depths. + SmallVector Succs(MBB->succ_begin(), + MBB->succ_end()); + // Sort Successors according to their loop depth or block frequency info. std::stable_sort( Succs.begin(), Succs.end(), - [this](const MachineBasicBlock *LHS, const MachineBasicBlock *RHS) { - return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS); + [this](const MachineBasicBlock *L, const MachineBasicBlock *R) { + uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0; + uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0; + bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0; + return HasBlockFreq ? LHSFreq < RHSFreq + : LI->getLoopDepth(L) < LI->getLoopDepth(R); }); for (SmallVectorImpl::iterator SI = Succs.begin(), E = Succs.end(); SI != E; ++SI) { diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll new file mode 100644 index 00000000000..6e3a0031045 --- /dev/null +++ b/test/CodeGen/X86/sink-blockfreq.ll @@ -0,0 +1,45 @@ +; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI +; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI + +; Test that by changing BlockFrequencyInfo we change the order in which +; machine-sink looks for sucessor blocks. By not using BFI, both G and B +; have the same loop depth and no instructions is sinked - B is selected but +; can't be used as to avoid breaking a non profitable critical edge. By using +; BFI, "mul" is sinked into the less frequent block G. +define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp { +; MSINK_BFI-LABEL: sink_freqinfo +; MSINK_BFI: jl +; MSINK_BFI-NEXT: ## BB# +; MSINK_BFI-NEXT: imull + +; MSINK_NOBFI-LABEL: sink_freqinfo +; MSINK_NOBFI: imull +; MSINK_NOBFI: jl +entry: + br label %B + +B: + %ee = phi i32 [ 0, %entry ], [ %inc, %F ] + %xx = sub i32 %a, %ee + %cond0 = icmp slt i32 %xx, 0 + br i1 %cond0, label %F, label %exit, !prof !0 + +F: + %inc = add nsw i32 %xx, 2 + %aa = mul nsw i32 %b, %inc + %exitcond = icmp slt i32 %inc, %a + br i1 %exitcond, label %B, label %G, !prof !1 + +G: + %ii = add nsw i32 %aa, %a + %ll = add i32 %b, 45 + %exitcond2 = icmp sge i32 %ii, %b + br i1 %exitcond2, label %G, label %exit, !prof !2 + +exit: + ret i32 0 +} + +!0 = metadata !{metadata !"branch_weights", i32 4, i32 1} +!1 = metadata !{metadata !"branch_weights", i32 128, i32 1} +!2 = metadata !{metadata !"branch_weights", i32 1, i32 1} -- 2.34.1