From f0c1388dd1b5cd6c297a9555aab7d0ade0f33827 Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Thu, 12 Dec 2013 00:19:11 +0000 Subject: [PATCH] Improve instruction scheduling for the PPC POWER7 Aside from a few minor latency corrections, the major change here is a new hazard recognizer which focuses on better dispatch-group formation on the POWER7. As with the PPC970's hazard recognizer, the most important thing it does is avoid load-after-store hazards within the same dispatch group. It uses the POWER7's special dispatch-group-terminating nop instruction (instead of inserting multiple regular nop instructions). This new hazard recognizer makes use of the scheduling dependency graph itself, built using AA information, to robustly detect the possibility of load-after-store hazards. Significant test-suite performance changes (the error bars are 99.5% confidence intervals based on 5 test-suite runs both with and without the change -- speedups are negative): speedups: MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2 -0.55171% +/- 0.333168% MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl/CrossingThresholds-dbl -17.5576% +/- 14.598% MultiSource/Benchmarks/TSVC/Reductions-dbl/Reductions-dbl -29.5708% +/- 7.09058% MultiSource/Benchmarks/TSVC/Reductions-flt/Reductions-flt -34.9471% +/- 11.4391% SingleSource/Benchmarks/BenchmarkGame/puzzle -25.1347% +/- 11.0104% SingleSource/Benchmarks/Misc/flops-8 -17.7297% +/- 9.79061% SingleSource/Benchmarks/Shootout-C++/ary3 -35.5018% +/- 23.9458% SingleSource/Regression/C/uint64_to_float -56.3165% +/- 25.4234% SingleSource/UnitTests/Vectorizer/gcc-loops -18.5309% +/- 6.8496% regressions: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000 18.351% +/- 12.156% SingleSource/Benchmarks/Shootout-C++/methcall 27.3086% +/- 14.4733% git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@197099 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCHazardRecognizers.cpp | 209 ++++++++++++++++++++ 
lib/Target/PowerPC/PPCHazardRecognizers.h | 26 +++ lib/Target/PowerPC/PPCInstrFormats.td | 9 + lib/Target/PowerPC/PPCInstrInfo.cpp | 66 ++++++- lib/Target/PowerPC/PPCInstrInfo.h | 12 ++ lib/Target/PowerPC/PPCInstrInfo.td | 9 + lib/Target/PowerPC/PPCScheduleP7.td | 9 +- test/CodeGen/PowerPC/pwr7-gt-nop.ll | 31 +++ 8 files changed, 368 insertions(+), 3 deletions(-) create mode 100644 test/CodeGen/PowerPC/pwr7-gt-nop.ll diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index b2d7ca11a2d..0c07fd33405 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -15,12 +15,221 @@ #include "PPCHazardRecognizers.h" #include "PPC.h" #include "PPCInstrInfo.h" +#include "PPCTargetMachine.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) { + // FIXME: Move this. + if (isBCTRAfterSet(SU)) + return true; + + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->mayLoad()) + return false; + + // SU is a load; for any predecessors in this dispatch group, that are stores, + // and with which we have an ordering dependency, return true. 
+ for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || !PredMCID->mayStore()) + continue; + + if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->isBranch()) + return false; + + // SU is a branch; for any predecessors in this dispatch group, with which we + // have a data dependence and set the counter register, return true. + for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR) + continue; + + if (SU->Preds[i].isCtrl()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +// FIXME: Remove this when we don't need this: +namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } } + +// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific. + +bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID, + unsigned &NSlots) { + // FIXME: Indirectly, this information is contained in the itinerary, and + // we should derive it from there instead of separately specifying it + // here. 
+ unsigned IIC = MCID->getSchedClass(); + switch (IIC) { + default: + NSlots = 1; + break; + case PPC::Sched::IIC_IntDivW: + case PPC::Sched::IIC_IntDivD: + case PPC::Sched::IIC_LdStLoadUpd: + case PPC::Sched::IIC_LdStLDU: + case PPC::Sched::IIC_LdStLFDU: + case PPC::Sched::IIC_LdStLFDUX: + case PPC::Sched::IIC_LdStLHA: + case PPC::Sched::IIC_LdStLHAU: + case PPC::Sched::IIC_LdStLWA: + case PPC::Sched::IIC_LdStSTDU: + case PPC::Sched::IIC_LdStSTFDU: + NSlots = 2; + break; + case PPC::Sched::IIC_LdStLoadUpdX: + case PPC::Sched::IIC_LdStLDUX: + case PPC::Sched::IIC_LdStLHAUX: + case PPC::Sched::IIC_LdStLWARX: + case PPC::Sched::IIC_LdStLDARX: + case PPC::Sched::IIC_LdStSTDUX: + case PPC::Sched::IIC_LdStSTDCX: + case PPC::Sched::IIC_LdStSTWCX: + case PPC::Sched::IIC_BrMCRX: // mtcr + // FIXME: Add sync/isync (here and in the itinerary). + NSlots = 4; + break; + } + + // FIXME: record-form instructions need a different itinerary class. + if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1) + NSlots = 2; + + switch (IIC) { + default: + // All multi-slot instructions must come first. 
+ return NSlots > 1; + case PPC::Sched::IIC_SprMFCR: + case PPC::Sched::IIC_SprMFCRF: + case PPC::Sched::IIC_SprMTSPR: + return true; + } +} + +ScheduleHazardRecognizer::HazardType +PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + if (Stalls == 0 && isLoadAfterStore(SU)) + return NoopHazard; + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + unsigned NSlots; + if (MCID && mustComeFirst(MCID, NSlots) && CurSlots) + return true; + + return ScoreboardHazardRecognizer::ShouldPreferAnother(SU); +} + +unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { + // We only need to fill out a maximum of 5 slots here: The 6th slot could + // only be a second branch, and otherwise the next instruction will start a + // new group. + if (isLoadAfterStore(SU) && CurSlots < 6) { + unsigned Directive = + DAG->TM.getSubtarget().getDarwinDirective(); + // If we're using a special group-terminating nop, then we need only one. + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7) + return 1; + + return 5 - CurSlots; + } + + return ScoreboardHazardRecognizer::PreEmitNoops(SU); +} + +void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (MCID) { + if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) { + CurGroup.clear(); + CurSlots = CurBranches = 0; + } else { + DEBUG(dbgs() << "**** Adding to dispatch group: SU(" << + SU->NodeNum << "): "); + DEBUG(DAG->dumpNode(SU)); + + unsigned NSlots; + bool MustBeFirst = mustComeFirst(MCID, NSlots); + + // If this instruction must come first, but does not, then it starts a + // new group. 
+ if (MustBeFirst && CurSlots) { + CurSlots = CurBranches = 0; + CurGroup.clear(); + } + + CurSlots += NSlots; + CurGroup.push_back(SU); + + if (MCID->isBranch()) + ++CurBranches; + } + } + + return ScoreboardHazardRecognizer::EmitInstruction(SU); +} + +void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() { + return ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() { + llvm_unreachable("Bottom-up scheduling not supported"); +} + +void PPCDispatchGroupSBHazardRecognizer::Reset() { + CurGroup.clear(); + CurSlots = CurBranches = 0; + return ScoreboardHazardRecognizer::Reset(); +} + +void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { + unsigned Directive = + DAG->TM.getSubtarget().getDarwinDirective(); + // If the group has now filled all of its slots, or if we're using a special + // group-terminating nop, the group is complete. + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + CurSlots == 6) { + CurGroup.clear(); + CurSlots = CurBranches = 0; + } else { + CurGroup.push_back(0); + ++CurSlots; + } +} + //===----------------------------------------------------------------------===// // PowerPC 970 Hazard Recognizer // diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index 9ee042b63e1..6b7fe41e574 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -21,6 +21,32 @@ namespace llvm { +/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based +/// hazard recognizer for PPC ooo processors with dispatch-group hazards. 
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer { + const ScheduleDAG *DAG; + SmallVector CurGroup; + unsigned CurSlots, CurBranches; + + bool isLoadAfterStore(SUnit *SU); + bool isBCTRAfterSet(SUnit *SU); + bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots); +public: + PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData, + const ScheduleDAG *DAG_) : + ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_), + CurSlots(0), CurBranches(0) {} + + virtual HazardType getHazardType(SUnit *SU, int Stalls); + virtual bool ShouldPreferAnother(SUnit* SU); + virtual unsigned PreEmitNoops(SUnit *SU); + virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); + virtual void RecedeCycle(); + virtual void Reset(); + virtual void EmitNoop(); +}; + /// PPCHazardRecognizer970 - This class defines a finite state automata that /// models the dispatch logic on the PowerPC 970 (aka G5) processor. This /// promotes good dispatch group formation and implements noop insertion to diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 6b20a41ef22..c5e179fec25 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -258,6 +258,15 @@ class DForm_4_zero opcode, dag OOL, dag IOL, string asmstr, let Addr = 0; } +class DForm_4_fixedreg_zero opcode, bits<5> R, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list pattern> + : DForm_4 { + let A = R; + let B = R; + let C = 0; +} + class IForm_and_DForm_1 opcode1, bit aa, bit lk, bits<6> opcode2, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index c4582c540ba..bd3b4924ccc 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -74,6 +74,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( const ScheduleDAG *DAG) 
const { unsigned Directive = TM.getSubtarget().getDarwinDirective(); + if (Directive == PPC::DIR_PWR7) + return new PPCDispatchGroupSBHazardRecognizer(II, DAG); + // Most subtargets use a PPC970 recognizer. if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 && Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) { @@ -85,6 +88,56 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( return new ScoreboardHazardRecognizer(II, DAG); } + +int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { + int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx, + UseMI, UseIdx); + + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + unsigned Reg = DefMO.getReg(); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + bool IsRegCR; + if (TRI->isVirtualRegister(Reg)) { + const MachineRegisterInfo *MRI = + &DefMI->getParent()->getParent()->getRegInfo(); + IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) || + MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass); + } else { + IsRegCR = PPC::CRRCRegClass.contains(Reg) || + PPC::CRBITRCRegClass.contains(Reg); + } + + if (UseMI->isBranch() && IsRegCR) { + if (Latency < 0) + Latency = getInstrLatency(ItinData, DefMI); + + // On some cores, there is an additional delay between writing to a condition + // register, and using it from a branch. + unsigned Directive = TM.getSubtarget().getDarwinDirective(); + switch (Directive) { + default: break; + case PPC::DIR_7400: + case PPC::DIR_750: + case PPC::DIR_970: + case PPC::DIR_E5500: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + Latency += 2; + break; + } + } + + return Latency; +} + // Detect 32 -> 64-bit extensions where we may reuse the low sub-register. 
bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, @@ -218,11 +271,20 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { + // This function is used for scheduling, and the nop wanted here is the type + // that terminates dispatch groups on the POWER cores. + unsigned Directive = TM.getSubtarget().getDarwinDirective(); + unsigned Opcode; + switch (Directive) { + default: Opcode = PPC::NOP; break; + case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break; + case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break; + } + DebugLoc DL; - BuildMI(MBB, MI, DL, get(PPC::NOP)); + BuildMI(MBB, MI, DL, get(Opcode)); } - // Branch analysis. // Note: If the condition register is set to CTR or CTR8 then this is a // BDNZ (imm == 1) or BDZ (imm == 0) branch. diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index f140c41a2a8..7876703584d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -95,6 +95,18 @@ public: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const; + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx, + UseNode, UseIdx); + } + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 8754b40e09f..11c296dd336 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1616,8 +1616,17 @@ def 
XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), "xoris $dst, $src1, $src2", IIC_IntSimple, [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>; + def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple, []>; +let isCodeGenOnly = 1 in { +// The POWER6 and POWER7 have special group-terminating nops. +def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins), + "ori 1, 1, 0", IIC_IntSimple, []>; +def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins), + "ori 2, 2, 0", IIC_IntSimple, []>; +} + let isCompare = 1, neverHasSideEffects = 1 in { def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm), "cmpwi $crD, $rA, $imm", IIC_IntCompare>; diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index 95b5a8b2c65..958bc90f674 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -93,6 +93,7 @@ def P7Itineraries : ProcessorItineraries< P7_DU3, P7_DU4], 0>, InstrStage<1, [P7_FX1, P7_FX2]>], [1, 1, 1]>, + // FIXME: Add record-form itinerary data. 
InstrItinData, InstrStage<1, [P7_DU2], 0>, InstrStage<36, [P7_FX1, P7_FX2]>], @@ -290,7 +291,10 @@ def P7Itineraries : ProcessorItineraries< InstrStage<1, [P7_DU4], 0>, InstrStage<1, [P7_LS1, P7_LS2]>], [1, 1, 1]>, - InstrItinData, + InstrItinData, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, InstrStage<1, [P7_CRU]>, InstrStage<1, [P7_FX1, P7_FX2]>], [3, 1]>, // mtcr @@ -300,6 +304,9 @@ def P7Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [P7_CRU]>], [3, 1]>, + InstrItinData, + InstrStage<1, [P7_FX1]>], + [4, 1]>, // mtctr InstrItinData, InstrStage<1, [P7_VS1, P7_VS2]>], diff --git a/test/CodeGen/PowerPC/pwr7-gt-nop.ll b/test/CodeGen/PowerPC/pwr7-gt-nop.ll new file mode 100644 index 00000000000..8c8545d60df --- /dev/null +++ b/test/CodeGen/PowerPC/pwr7-gt-nop.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mcpu=pwr7 | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @foo(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c, float* nocapture %d) #0 { + +; CHECK-LABEL: @foo + +entry: + %0 = load float* %b, align 4 + store float %0, float* %a, align 4 + %1 = load float* %c, align 4 + store float %1, float* %b, align 4 + %2 = load float* %a, align 4 + store float %2, float* %d, align 4 + ret void + +; CHECK: lfs [[REG1:[0-9]+]], 0(4) +; CHECK: stfs [[REG1]], 0(3) +; CHECK: ori 2, 2, 0 +; CHECK: lfs [[REG2:[0-9]+]], 0(5) +; CHECK: stfs [[REG2]], 0(4) +; CHECK: ori 2, 2, 0 +; CHECK: lfs [[REG3:[0-9]+]], 0(3) +; CHECK: stfs [[REG3]], 0(6) +; CHECK: blr +} + +attributes #0 = { nounwind } + -- 2.34.1