AMDGPU/SI: Add SI Machine Scheduler
authorNicolai Haehnle <nhaehnle@gmail.com>
Wed, 13 Jan 2016 16:10:10 +0000 (16:10 +0000)
committerNicolai Haehnle <nhaehnle@gmail.com>
Wed, 13 Jan 2016 16:10:10 +0000 (16:10 +0000)
Summary:
It is off by default, but can be used
with --misched=si
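
An illustrative invocation (input/output file names are hypothetical):

  llc -march=amdgcn --misched=si shader.ll -o shader.s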

Patch by: Axel Davy

Reviewers: arsenm, tstellarAMD, nhaehnle

Subscribers: nhaehnle, solenskiner, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D11885

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257609 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/AMDGPU/AMDGPU.h
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
lib/Target/AMDGPU/CMakeLists.txt
lib/Target/AMDGPU/SIInstrInfo.cpp
lib/Target/AMDGPU/SIInstrInfo.h
lib/Target/AMDGPU/SIMachineScheduler.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIMachineScheduler.h [new file with mode: 0644]
lib/Target/AMDGPU/SIRegisterInfo.cpp
lib/Target/AMDGPU/SIRegisterInfo.h
test/CodeGen/AMDGPU/si-scheduler.ll [new file with mode: 0644]

index 8c3cb567fc7e5942d19cdfa380bc91feb9650c94..5d00e1cb3be5fad2abb5386618b60d9459326d59 100644 (file)
@@ -20,8 +20,10 @@ class AMDGPUInstrPrinter;
 class AMDGPUSubtarget;
 class AMDGPUTargetMachine;
 class FunctionPass;
+class MachineSchedContext;
 class MCAsmInfo;
 class raw_ostream;
+class ScheduleDAGInstrs;
 class Target;
 class TargetMachine;
 
@@ -49,6 +51,8 @@ FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
 
+ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
+
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
index 22f85b3e663c292d5d9baf3bc2933dc8e40a0204..b1be6197a6c6a0a01a8b55d876f045a8af5a7fc4 100644 (file)
@@ -66,8 +66,12 @@ static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
 }
 
 static MachineSchedRegistry
-SchedCustomRegistry("r600", "Run R600's custom scheduler",
-                    createR600MachineScheduler);
+R600SchedRegistry("r600", "Run R600's custom scheduler",
+                   createR600MachineScheduler);
+
+static MachineSchedRegistry
+SISchedRegistry("si", "Run SI's custom scheduler",
+                createSIMachineScheduler);
 
 static std::string computeDataLayout(const Triple &TT) {
   std::string Ret = "e-p:32:32";
index 30bb0e0adde8c06ca6ae6face05f6bad6270cee2..b9ef0e82176345add21eaabcb561b3e004a60d8e 100644 (file)
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
+  SIMachineScheduler.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
index bd80756d5c8fb790a5aec60f0f80563313ac676b..1e10d25e8fb7a39fbdfc9a1832620a00a20f64aa 100644 (file)
@@ -3079,3 +3079,15 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
 
   return Rsrc23;
 }
+
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
+  unsigned Opc = MI->getOpcode();
+
+  return isSMRD(Opc);
+}
+
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
+  unsigned Opc = MI->getOpcode();
+
+  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+}
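
These two predicates feed the scheduler's per-SUnit latency classification. A minimal sketch of the intended consumption (IsLowLatencySU and IsHighLatencySU are the arrays used by SIMachineScheduler.cpp below; the SITII variable and the surrounding loop are assumptions for illustration):

  // Illustrative sketch: classify each SUnit once, up front.
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    const MachineInstr *MI = SUnits[i].getInstr();
    if (SITII->isLowLatencyInstruction(MI))
      IsLowLatencySU[i] = 1;   // SMRD: scalar (constant) memory reads
    else if (SITII->isHighLatencyInstruction(MI))
      IsHighLatencySU[i] = 1;  // MUBUF/MTBUF/MIMG: vector memory operations
  }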
index 307ef67ed26342eb8be26c1875b408d513ebbc4d..cce1ae7256117c952f86d02244f28312d1ecd71a 100644 (file)
@@ -462,6 +462,9 @@ public:
 
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;
+
+  bool isLowLatencyInstruction(const MachineInstr *MI) const;
+  bool isHighLatencyInstruction(const MachineInstr *MI) const;
 };
 
 namespace AMDGPU {
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644 (file)
index 0000000..1cfa984
--- /dev/null
@@ -0,0 +1,1968 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// by GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+// for all the SGPR load instructions issued before your current instruction
+// to have finished.
+// . When accessing the result of a VGPR load instruction, you have to wait
+// for all the VGPR load instructions issued before the VGPR load instruction
+// you are interested in to finish.
+// . The lower the register pressure, the better the load latencies are
+// hidden.
+//
+// Moreover some specificities (like the fact that a lot of instructions in
+// the shader have few dependencies) make the generic scheduler behave
+// unpredictably. For example, when register pressure becomes high, it can
+// either manage to keep register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Other bad behaviours show up too, like loading a constant into a VGPR at
+// the beginning of the shader even though it won't be needed until the end.
+//
+// The scheduling problem for SI can be split into three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact that the HW can issue several instructions from
+// different wavefronts if they are of different types, etc.)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks also helps keep register
+// usage under control.
+//
+// First the instructions are put into blocks.
+//   We want the blocks to help control register usage and hide high
+//   latencies later. To help control register usage, we typically want all
+//   local computations (for example, when you create a result that can be
+//   consumed right away) to be contained in a block. Block inputs and
+//   outputs would typically be important results that are needed in several
+//   locations of the shader. Since we do want blocks to help hide high
+//   latencies, we want the instructions inside a block to have a minimal
+//   set of dependencies on high latencies. That makes it easy to pick
+//   blocks to hide specific high latencies.
+//   The block creation algorithm is divided into several steps, and several
+//   variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+//   At that step we take into account only register usage and hiding
+//   low latency instructions.
+//
+// Third the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency to get the cache (or global mem) data into the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, which is why even if the wavefront count is high,
+// we have to try to have as many instructions hiding high latencies as
+// possible.
+// The OpenCL doc gives for example a latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
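+// Worked example with those figures (illustrative arithmetic only): one
+// instruction issues per wavefront every 4 cycles, so hiding a 400 cycle
+// access takes 400 / 4 = 100 issue slots; spread over 10 wavefronts, that
+// is 100 / 10 = 10 independent instructions per wavefront between the load
+// and its first use.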
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64 bytes
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated with the 3 other
+// VALUs of the CU do texture sampling too. (Don't take these figures too
+// seriously, as I'm not 100% sure of the computation.)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared with 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
+// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as an s_buffer_loadx2, and potentially an s_buffer_loadx8 if on the
+// same cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling,
+// as we expect the driver to be able to preload the constants soon.
+
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  switch (Reason) {
+  case NoCand:         return "NOCAND";
+  case RegUsage:       return "REGUSAGE";
+  case Latency:        return "LATENCY";
+  case Successor:      return "SUCCESSOR";
+  case Depth:          return "DEPTH";
+  case NodeOrder:      return "ORDER";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  NodeNum2Index[SU->NodeNum] = SUnits.size();
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+                                          SISchedCandidate &TryCand) {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  if (Cand.SGPRUsage > 60 &&
+      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Schedule low latency instructions as close to the top as possible.
+  // Order of priority is:
+  // . Low latency instructions which do not depend on other low latency
+  //   instructions we haven't waited for
+  // . Other instructions which do not depend on low latency instructions
+  //   we haven't waited for
+  // . Low latencies
+  // . All other instructions
+  // Goal is to get: low latency instructions - independent instructions
+  //     - (possibly some more low latency instructions)
+  //     - instructions that depend on the first low latency instructions.
+  // If the block contains a lot of constant loads, the SGPR usage
+  // could go quite high, thus the arbitrary limit of 60 above encourages
+  // using the already loaded constants (in order to release some SGPRs)
+  // before loading more.
+  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+              Cand.HasLowLatencyNonWaitedParent,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+                 TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (TryCand.IsLowLatency &&
+      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+  SISchedCandidate TopCand;
+
+  for (SUnit* SU : TopReadySUs) {
+    SISchedCandidate TryCand;
+    std::vector<unsigned> pressure;
+    std::vector<unsigned> MaxPressure;
+    // Predict register usage after this instruction.
+    TryCand.SU = SU;
+    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+    TryCand.HasLowLatencyNonWaitedParent =
+      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+    tryCandidateTopDown(TopCand, TryCand);
+    if (TryCand.Reason != NoCand)
+      TopCand.setBest(TryCand);
+  }
+
+  return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = TopReadySUs[0];
+    ScheduledSUnits.push_back(SU);
+    nodeScheduled(SU);
+  }
+
+  Scheduled = true;
+}
+
+// Returns true if the register was defined between First and Last.
+static bool isDefBetween(unsigned Reg,
+                           SlotIndex First, SlotIndex Last,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::def_instr_iterator
+       UI = MRI->def_instr_begin(Reg),
+       UE = MRI->def_instr_end(); UI != UE; ++UI) {
+    const MachineInstr* MI = &*UI;
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= First && InstSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                                      MachineBasicBlock::iterator EndBlock) {
+  IntervalPressure Pressure, BotPressure;
+  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineRegisterInfo *MRI = DAG->getMRI();
+  DAG->initRPTracker(TopRPTracker);
+  DAG->initRPTracker(BotRPTracker);
+  DAG->initRPTracker(RPTracker);
+
+  // Go through all the SUs. RPTracker captures what had to be alive for the
+  // SUs to execute, and what is still alive at the end.
+  for (SUnit* SU : ScheduledSUnits) {
+    RPTracker.setPos(SU->getInstr());
+    RPTracker.advance();
+  }
+
+  // Close the RPTracker to finalize live ins/outs.
+  RPTracker.closeRegion();
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Do not track physical registers, because it messes things up.
+  for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      LiveInRegs.insert(Reg);
+  }
+  LiveOutRegs.clear();
+  // There are several cases to distinguish:
+  // 1) Reg is not input to any instruction in the block, but is output of one
+  // 2) 1) + read in the block and not needed after it
+  // 3) 1) + read in the block but needed in another block
+  // 4) Reg is input of an instruction but another block will read it too
+  // 5) Reg is input of an instruction and then rewritten in the block.
+  //    result is not read in the block (implies used in another block)
+  // 6) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block and not needed in another block
+  // 7) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block but also needed in another block
+  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
+  // We want LiveOutRegs to contain only regs whose content will be read
+  // afterwards in another block, and whose content was written in the
+  // current block, that is we want it to get 1, 3, 5, 7.
+  // Since we packed the MIs of a block all together before scheduling,
+  // the LiveIntervals were correct, and the RPTracker was able to correctly
+  // handle 5 vs 6, and 2 vs 3.
+  // (Note: This is not sufficient for the RPTracker to avoid mistakes for
+  // case 4.)
+  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
+  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
+  // The use of isDefBetween removes case 4.
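+  // Concrete illustration: a reg that is live-in and redefined in the block
+  // (cases 5 and 7) has a def between BeginBlock and EndBlock and is kept;
+  // a reg that is only read here but also read by another block (case 4)
+  // has no such def and is filtered out below.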
+  for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+        isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+                       LIS->getInstructionIndex(EndBlock).getRegSlot(),
+                       MRI, LIS)) {
+      LiveOutRegs.insert(Reg);
+    }
+  }
+
+  // Pressure is the sum of the sizes of the alive registers.
+  // Internally llvm will represent some registers as one big 128 bit register
+  // for example, even though it actually corresponds to 4 actual 32 bit
+  // registers. Thus Pressure is not equal to num_alive_registers * constant.
+  LiveInPressure = TopPressure.MaxSetPressure;
+  LiveOutPressure = BotPressure.MaxSetPressure;
+
+  // Prepares TopRPTracker for top down scheduling.
+  TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now.
+
+  TopReadySUs.clear();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = pickNode();
+    ScheduledSUnits.push_back(SU);
+    TopRPTracker.setPos(SU->getInstr());
+    TopRPTracker.advance();
+    nodeScheduled(SU);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check that everything is right.
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size() &&
+         TopReadySUs.empty());
+  for (SUnit* SU : SUnits) {
+    assert(SU->isScheduled &&
+           SU->NumPredsLeft == 0);
+  }
+#endif
+
+  Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+  for (SUnit* SU : SUnits) {
+    SU->isScheduled = false;
+    for (SDep& Succ : SU->Succs) {
+      if (BC->isSUInBlock(Succ.getSUnit(), ID))
+        undoReleaseSucc(SU, &Succ);
+    }
+  }
+  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    ++SuccSU->WeakPredsLeft;
+    return;
+  }
+  ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(DAG);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(nullptr);
+  }
+#endif
+
+  --SuccSU->NumPredsLeft;
+}
+
+/// Release the successors of SU that are inside the block (InOrOutBlock =
+/// true) or outside of it (InOrOutBlock = false).
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+  for (SDep& Succ : SU->Succs) {
+    SUnit *SuccSU = Succ.getSUnit();
+
+    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+      continue;
+
+    releaseSucc(SU, &Succ);
+    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+      TopReadySUs.push_back(SuccSU);
+  }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+  // SU must be in TopReadySUs and have no predecessors left.
+  assert(!SU->NumPredsLeft);
+  std::vector<SUnit*>::iterator I =
+    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
+  if (I == TopReadySUs.end()) {
+    dbgs() << "Data Structure Bug in SI Scheduler\n";
+    llvm_unreachable(nullptr);
+  }
+  TopReadySUs.erase(I);
+
+  releaseSuccessors(SU, true);
+  // Scheduling this node will trigger a wait,
+  // thus propagate to other instructions that they do not need to wait either.
+  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+  if (DAG->IsLowLatencySU[SU->NodeNum]) {
+    for (SDep& Succ : SU->Succs) {
+      std::map<unsigned, unsigned>::iterator I =
+        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+      if (I != NodeNum2Index.end())
+        HasLowLatencyNonWaitedParent[I->second] = 1;
+    }
+  }
+  SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+  // We remove links from outside blocks to enable scheduling inside the block.
+  for (SUnit* SU : SUnits) {
+    releaseSuccessors(SU, false);
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      HighLatencyBlock = true;
+  }
+  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// We maintain ascending order of IDs.
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+  unsigned PredID = Pred->getID();
+
+  // Check if not already predecessor.
+  for (SIScheduleBlock* P : Preds) {
+    if (PredID == P->getID())
+      return;
+  }
+  Preds.push_back(Pred);
+
+#ifndef NDEBUG
+  for (SIScheduleBlock* S : Succs) {
+    if (PredID == S->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+  unsigned SuccID = Succ->getID();
+
+  // Check if not already successor.
+  for (SIScheduleBlock* S : Succs) {
+    if (SuccID == S->getID())
+      return;
+  }
+  if (Succ->isHighLatencyBlock())
+    ++NumHighLatencySuccessors;
+  Succs.push_back(Succ);
+#ifndef NDEBUG
+  for (SIScheduleBlock* P : Preds) {
+    if (SuccID == P->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+  dbgs() << "Block (" << ID << ")\n";
+  if (!full)
+    return;
+
+  dbgs() << "\nContains High Latency Instruction: "
+         << HighLatencyBlock << '\n';
+  dbgs() << "\nDepends On:\n";
+  for (SIScheduleBlock* P : Preds) {
+    P->printDebug(false);
+  }
+
+  dbgs() << "\nSuccessors:\n";
+  for (SIScheduleBlock* S : Succs) {
+    S->printDebug(false);
+  }
+
+  if (Scheduled) {
+    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+    dbgs() << "LiveIns:\n";
+    for (unsigned Reg : LiveInRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+    dbgs() << "\nLiveOuts:\n";
+    for (unsigned Reg : LiveOutRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+  }
+
+  dbgs() << "\nInstructions:\n";
+  for (SUnit* SU : SUnits) {
+    SU->dump(DAG);
+  }
+
+  dbgs() << "///////////////////////\n";
+}
+
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() {
+}
+
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+    Blocks.find(BlockVariant);
+  if (B == Blocks.end()) {
+    SIScheduleBlocks Res;
+    createBlocksForVariant(BlockVariant);
+    topologicalSort();
+    scheduleInsideBlocks();
+    fillStats();
+    Res.Blocks = CurrentBlocks;
+    Res.TopDownIndex2Block = TopDownIndex2Block;
+    Res.TopDownBlock2Index = TopDownBlock2Index;
+    Blocks[BlockVariant] = Res;
+    return Res;
+  } else {
+    return B->second;
+  }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+  if (SU->NodeNum >= DAG->SUnits.size())
+    return false;
+  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      CurrentColoring[SU->NodeNum] = NextReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned NumHighLatencies = 0;
+  unsigned GroupSize;
+  unsigned Color = NextReservedID;
+  unsigned Count = 0;
+  std::set<unsigned> FormingGroup;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      ++NumHighLatencies;
+  }
+
+  if (NumHighLatencies == 0)
+    return;
+
+  if (NumHighLatencies <= 6)
+    GroupSize = 2;
+  else if (NumHighLatencies <= 12)
+    GroupSize = 3;
+  else
+    GroupSize = 4;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      bool CompatibleGroup = true;
+      unsigned ProposedColor = Color;
+      for (unsigned j : FormingGroup) {
+        // TODO: Currently CompatibleGroup will always be false,
+        // because the graph enforces the load order. This
+        // could be relaxed, but since keeping the load order is often
+        // good for performance, breaking it causes a performance hit
+        // (for both the default scheduler and this scheduler).
+        // Once this scheduler determines a good load order on its own,
+        // this can be fixed.
+        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+          CompatibleGroup = false;
+      }
+      if (!CompatibleGroup || ++Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        if (!CompatibleGroup) {
+          ProposedColor = Color;
+          FormingGroup.insert(SU->NodeNum);
+        }
+        Count = 0;
+      } else {
+        FormingGroup.insert(SU->NodeNum);
+      }
+      CurrentColoring[SU->NodeNum] = ProposedColor;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  CurrentTopDownReservedDependencyColoring.clear();
+  CurrentBottomUpReservedDependencyColoring.clear();
+
+  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+  // Traverse TopDown, and give different colors to SUs depending
+  // on which combination of High Latencies they depend on.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+    }
+    // Color 0 by default.
+    if (SUColors.empty())
+      continue;
+    // Same color as its parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+
+  ColorCombinations.clear();
+
+  // Same as before, but BottomUp.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+    }
+    // Keep color 0.
+    if (SUColors.empty())
+      continue;
+    // Same color as its successors.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+  // Every combination of colors given by the top down
+  // and bottom up reserved node dependency colorings
+  // gets assigned its own color.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    std::pair<unsigned, unsigned> SUColors;
+
+    // High latency instructions: already given.
+    if (CurrentColoring[SU->NodeNum])
+      continue;
+
+    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+      ColorCombinations.find(SUColors);
+    if (Pos != ColorCombinations.end()) {
+      CurrentColoring[SU->NodeNum] = Pos->second;
+    } else {
+      CurrentColoring[SU->NodeNum] = NextNonReservedID;
+      ColorCombinations[SUColors] = NextNonReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::vector<int> PendingColoring = CurrentColoring;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+    std::set<unsigned> SUColorsPending;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentColoring[Succ->NodeNum]);
+      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+      PendingColoring[SU->NodeNum] = *SUColors.begin();
+    else // TODO: Assign new colors depending on the color
+         // combination of children.
+      PendingColoring[SU->NodeNum] = NextNonReservedID++;
+  }
+  CurrentColoring = PendingColoring;
+}
+
+
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned PreviousColor;
+  std::set<unsigned> SeenColors;
+
+  if (DAGSize <= 1)
+    return;
+
+  PreviousColor = CurrentColoring[0];
+
+  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned CurrentColor = CurrentColoring[i];
+    unsigned PreviousColorSave = PreviousColor;
+    assert(i == SU->NodeNum);
+
+    if (CurrentColor != PreviousColor)
+      SeenColors.insert(PreviousColor);
+    PreviousColor = CurrentColor;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (SeenColors.find(CurrentColor) == SeenColors.end())
+      continue;
+
+    if (PreviousColorSave != CurrentColor)
+      CurrentColoring[i] = NextNonReservedID++;
+    else
+      CurrentColoring[i] = CurrentColoring[i-1];
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    // No predecessor: VGPR constant loading.
+    // Low latency instructions usually have a predecessor (the address).
+    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned, unsigned> ColorCount;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    unsigned color = CurrentColoring[SU->NodeNum];
+    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
+    if (Pos != ColorCount.end()) {
+      ++ColorCount[color];
+    } else {
+      ColorCount[color] = 1;
+    }
+  }
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    unsigned color = CurrentColoring[SU->NodeNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (ColorCount[color] > 1)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() != color) {
+      --ColorCount[color];
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+      ++ColorCount[*SUColors.begin()];
+    }
+  }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+  // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+  unsigned DAGSize = DAG->SUnits.size();
+  int GroupID = NextNonReservedID++;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    bool hasSuccessor = false;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      hasSuccessor = true;
+    }
+    if (!hasSuccessor)
+      CurrentColoring[SU->NodeNum] = GroupID;
+  }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned,unsigned> RealID;
+
+  CurrentBlocks.clear();
+  CurrentColoring.clear();
+  CurrentColoring.resize(DAGSize, 0);
+  Node2CurrentBlock.clear();
+
+  // Restore links that a previous scheduling variant may have overridden.
+  DAG->restoreSULinksLeft();
+
+  NextReservedID = 1;
+  NextNonReservedID = DAGSize + 1;
+
+  DEBUG(dbgs() << "Coloring the graph\n");
+
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+    colorHighLatenciesGroups();
+  else
+    colorHighLatenciesAlone();
+  colorComputeReservedDependencies();
+  colorAccordingToReservedDependencies();
+  colorEndsAccordingToDependencies();
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+    colorForceConsecutiveOrderInGroup();
+  regroupNoUserInstructions();
+  colorMergeConstantLoadsNextGroup();
+  colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Put SUs of same color into same block
+  Node2CurrentBlock.resize(DAGSize, -1);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned Color = CurrentColoring[SU->NodeNum];
+    if (RealID.find(Color) == RealID.end()) {
+      int ID = CurrentBlocks.size();
+      BlockPtrs.push_back(
+        make_unique<SIScheduleBlock>(DAG, this, ID));
+      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+      RealID[Color] = ID;
+    }
+    CurrentBlocks[RealID[Color]]->addUnit(SU);
+    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+  }
+
+  // Build dependencies between blocks.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    int SUID = Node2CurrentBlock[i];
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+    }
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+    }
+  }
+
+  // Free roots and leaves of all blocks to enable scheduling inside them.
+  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->finalizeUnits();
+  }
+  DEBUG(
+    dbgs() << "Blocks created:\n\n";
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// Two functions taken from Codegen/MachineScheduler.cpp
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+  unsigned DAGSize = CurrentBlocks.size();
+  std::vector<int> WorkList;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+
+  WorkList.reserve(DAGSize);
+  TopDownIndex2Block.resize(DAGSize);
+  TopDownBlock2Index.resize(DAGSize);
+  BottomUpIndex2Block.resize(DAGSize);
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    unsigned Degree = Block->getSuccs().size();
+    TopDownBlock2Index[i] = Degree;
+    if (Degree == 0) {
+      WorkList.push_back(i);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    int i = WorkList.back();
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    WorkList.pop_back();
+    TopDownBlock2Index[i] = --Id;
+    TopDownIndex2Block[Id] = i;
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      if (!--TopDownBlock2Index[Pred->getID()])
+        WorkList.push_back(Pred->getID());
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+      "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+                                         TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+  // We first schedule a valid ordering such that a Block corresponds
+  // to a contiguous range of instructions.
+  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->fastSchedule();
+  }
+
+  // Note: the following code, and the part restoring the previous positions,
+  // are by far the most expensive operations of the Scheduler.
+
+  // Do not update CurrentTop.
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs) {
+      MachineInstr *MI = SU->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        // Note: Moving all instructions and calling handleMove every time
+        // is the most CPU intensive operation of the scheduler.
+        // It would gain a lot if there was a way to recompute the
+        // LiveIntervals for the entire scheduling region.
+        DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now we have Block of SUs == Block of MIs.
+  // We do the final schedule for the instructions inside the block.
+  // The property that all the SUs of the Block are grouped together as MIs
+  // is used for correct reg usage tracking.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore old ordering (which prevents a LIS->handleMove bug).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+      DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+    }
+  }
+
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getPreds().size() == 0)
+      Block->Depth = 0;
+    else {
+      unsigned Depth = 0;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        if (Depth < Pred->Depth + 1)
+          Depth = Pred->Depth + 1;
+      }
+      Block->Depth = Depth;
+    }
+  }
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = BottomUpIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getSuccs().size() == 0)
+      Block->Height = 0;
+    else {
+      unsigned Height = 0;
+      for (SIScheduleBlock *Succ : Block->getSuccs()) {
+        if (Height < Succ->Height + 1)
+          Height = Succ->Height + 1;
+      }
+      Block->Height = Height;
+    }
+  }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                                                   SISchedulerBlockSchedulerVariant Variant,
+                                                   SIScheduleBlocks  BlocksStruct) :
+  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+  // Fill the usage of every output.
+  // Warning: while by construction we always have a link between two blocks
+  // when one needs a result from the other, the number of users of an output
+  // is not simply the number of child blocks having the same virtual
+  // register as an input.
+  // Here is an example: A produces x and y. B consumes x and produces x'.
+  // C consumes x' and y. The register coalescer may have assigned the same
+  // virtual register to x and x'.
+  // To count accurately, we do a topological sort. In case the register is
+  // found for several parents, we increment the usage of the one with the
+  // highest topological index.
+  LiveOutRegsNumUsages.resize(Blocks.size());
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      int topoInd = -1;
+      for (SIScheduleBlock* Pred: Block->getPreds()) {
+        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+        if (RegPos != PredOutRegs.end()) {
+          Found = true;
+          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
+            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
+          }
+        }
+      }
+
+      if (!Found)
+        continue;
+
+      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
+      std::map<unsigned, unsigned>::iterator RegPos =
+        LiveOutRegsNumUsages[PredID].find(Reg);
+      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
+        ++LiveOutRegsNumUsages[PredID][Reg];
+      } else {
+        LiveOutRegsNumUsages[PredID][Reg] = 1;
+      }
+    }
+  }
+
+  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
+  BlockNumPredsLeft.resize(Blocks.size());
+  BlockNumSuccsLeft.resize(Blocks.size());
+
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    BlockNumPredsLeft[i] = Block->getPreds().size();
+    BlockNumSuccsLeft[i] = Block->getSuccs().size();
+  }
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    assert(Block->getID() == i);
+  }
+#endif
+
+  std::set<unsigned> InRegs = DAG->getInRegs();
+  addLiveRegs(InRegs);
+
+  // Fill LiveRegsConsumers for regs that were already
+  // defined before scheduling.
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      for (SIScheduleBlock* Pred: Block->getPreds()) {
+        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+        if (RegPos != PredOutRegs.end()) {
+          Found = true;
+          break;
+        }
+      }
+
+      if (!Found) {
+        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
+          LiveRegsConsumers[Reg] = 1;
+        else
+          ++LiveRegsConsumers[Reg];
+      }
+    }
+  }
+
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    if (BlockNumPredsLeft[i] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+  }
+
+  while (SIScheduleBlock *Block = pickBlock()) {
+    BlocksScheduled.push_back(Block);
+    blockScheduled(Block);
+  }
+
+  DEBUG(
+    dbgs() << "Block Order:";
+    for (SIScheduleBlock* Block : BlocksScheduled) {
+      dbgs() << ' ' << Block->getID();
+    }
+  );
+}
+
+bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                                                   SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Try to hide high latencies.
+  if (tryLess(TryCand.LastPosHighLatParentScheduled,
+              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+    return true;
+  // Schedule high latencies early so you can hide them better.
+  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency))
+    return true;
+  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+                                          TryCand, Cand, Depth))
+    return true;
+  if (tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor))
+    return true;
+  return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage))
+    return true;
+  if (tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor))
+    return true;
+  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+    return true;
+  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage))
+    return true;
+  return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, else we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+      << (Cand.IsHighLatency ? "yes\n" : "no\n");
+    dbgs() << "Position of last high latency dependency: "
+           << Cand.LastPosHighLatParentScheduled << '\n';
+    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+    dbgs() << '\n';
+  );
+
+  Block = Cand.Block;
+  ReadyBlocks.erase(Best);
+  return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR Usage.
+
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+  for (unsigned Reg : Regs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    // If not already in the live set, then add it.
+    (void) LiveRegs.insert(Reg);
+  }
+}
+
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+                                       std::set<unsigned> &Regs) {
+  for (unsigned Reg : Regs) {
+    // For now only track virtual registers.
+    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
+    assert(Pos != LiveRegs.end() && // Reg must be live.
+           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
+           LiveRegsConsumers[Reg] >= 1);
+    --LiveRegsConsumers[Reg];
+    if (LiveRegsConsumers[Reg] == 0)
+      LiveRegs.erase(Pos);
+  }
+}
+
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+  for (SIScheduleBlock* Block : Parent->getSuccs()) {
+    --BlockNumPredsLeft[Block->getID()];
+    if (BlockNumPredsLeft[Block->getID()] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+    // TODO: Improve check. When the dependencies between the high
+    // latency instructions and the instructions of the other blocks
+    // are WAR or WAW, no wait will be triggered. We would like these
+    // cases not to update LastPosHighLatencyParentScheduled.
+    if (Parent->isHighLatencyBlock())
+      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+  }
+}
+
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+  decreaseLiveRegs(Block, Block->getInRegs());
+  addLiveRegs(Block->getOutRegs());
+  releaseBlockSuccs(Block);
+  for (std::pair<unsigned, unsigned> RegP :
+       LiveOutRegsNumUsages[Block->getID()]) {
+    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
+      LiveRegsConsumers[RegP.first] = RegP.second;
+    else {
+      assert(LiveRegsConsumers[RegP.first] == 0);
+      LiveRegsConsumers[RegP.first] += RegP.second;
+    }
+  }
+  if (LastPosHighLatencyParentScheduled[Block->getID()] >
+        (unsigned)LastPosWaitedHighLatency)
+    LastPosWaitedHighLatency =
+      LastPosHighLatencyParentScheduled[Block->getID()];
+  ++NumBlockScheduled;
+}
+
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                     std::set<unsigned> &OutRegs) {
+  std::vector<int> DiffSetPressure;
+  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
+
+  for (unsigned Reg : InRegs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
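+    // If other unscheduled blocks still consume this register,
+    // scheduling this one does not free it: no pressure change.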
+    if (LiveRegsConsumers[Reg] > 1)
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] -= PSetI.getWeight();
+    }
+  }
+
+  for (unsigned Reg : OutRegs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] += PSetI.getWeight();
+    }
+  }
+
+  return DiffSetPressure;
+}
+
+// SIScheduler //
+
+struct SIScheduleBlockResult
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
+  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
+  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+  std::vector<SIScheduleBlock*> ScheduledBlocks;
+  struct SIScheduleBlockResult Res;
+
+  ScheduledBlocks = Scheduler.getBlocks();
+
+  for (SIScheduleBlock *Block : ScheduledBlocks) {
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs)
+      Res.SUs.push_back(SU->NodeNum);
+  }
+
+  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
+  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
+  return Res;
+}
+
+// SIScheduleDAGMI //
+
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
+  SITII = static_cast<const SIInstrInfo*>(TII);
+  SITRI = static_cast<const SIRegisterInfo*>(TRI);
+
+  VGPRSetID = SITRI->getVGPR32PressureSet();
+  SGPRSetID = SITRI->getSGPR32PressureSet();
+}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() {
+}
+
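+// Factory for the SI scheduler, selected with --misched=si
+// (see test/CodeGen/AMDGPU/si-scheduler.ll).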
+ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
+  return new SIScheduleDAGMI(C);
+}
+
+// Code adapted from scheduleDAG.cpp.
+// Does a topological sort over the SUs, computing both the
+// top-down and the bottom-up orderings.
+void SIScheduleDAGMI::topologicalSort() {
+  std::vector<int> TopDownSU2Index;
+  unsigned DAGSize = SUnits.size();
+  std::vector<SUnit*> WorkList;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+  WorkList.reserve(DAGSize);
+
+  TopDownIndex2SU.resize(DAGSize);
+  TopDownSU2Index.resize(DAGSize);
+  BottomUpIndex2SU.resize(DAGSize);
+
+  WorkList.push_back(&getExitSU());
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    int NodeNum = SU->NodeNum;
+    unsigned Degree = SU->Succs.size();
+    TopDownSU2Index[NodeNum] = Degree;
+    if (Degree == 0) {
+      assert(SU->Succs.empty() && "SUnit should have no successors");
+      WorkList.push_back(SU);
+    }
+  }
+
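+  // Bottom-up variant of Kahn's algorithm: TopDownSU2Index currently
+  // holds, for each SU, the number of successors not yet numbered.
+  // SUs whose count reaches zero receive the next (highest remaining)
+  // index, so every SU ends up with a higher index than all of its
+  // predecessors.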
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    SUnit *SU = WorkList.back();
+    WorkList.pop_back();
+    if (SU->NodeNum < DAGSize) {
+      TopDownSU2Index[SU->NodeNum] = --Id;
+      TopDownIndex2SU[Id] = SU->NodeNum;
+    }
+    for (SDep& Pred : SU->Preds) {
+      SUnit *PredSU = Pred.getSUnit();
+      if (PredSU->NodeNum < DAGSize && !--TopDownSU2Index[PredSU->NodeNum])
+        WorkList.push_back(PredSU);
+    }
+  }
+
+  BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
+                                      TopDownIndex2SU.rend());
+
+#ifndef NDEBUG
+  // Check correctness of the ordering
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    for (SDep& Pred : SU->Preds) {
+      if (Pred.getSUnit()->NodeNum >= DAGSize)
+        continue;
+      assert(TopDownSU2Index[SU->NodeNum] >
+             TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
+             "Wrong Top Down topological sorting");
+    }
+  }
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    for (SDep& Succ : SU->Succs) {
+      if (Succ.getSUnit()->NodeNum >= DAGSize)
+        continue;
+      assert(TopDownSU2Index[SU->NodeNum] <
+             TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
+             "Wrong Bottom Up topological sorting");
+    }
+  }
+#endif
+}
+
+// Move low latencies further from their user without
+// increasing SGPR usage (in general).
+// This is to be replaced by a better pass that would take
+// SGPR usage into account (based on the VGPR usage and the
+// corresponding wavefront count), and that would try to merge
+// groups of loads when it makes sense, etc.
+void SIScheduleDAGMI::moveLowLatencies() {
+  unsigned DAGSize = SUnits.size();
+  int LastLowLatencyUser = -1;
+  int LastLowLatencyPos = -1;
+
+  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
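+    // Find whether SU consumes a low latency result, and compute
+    // MinPos: one past the position of SU's latest predecessor,
+    // i.e. the earliest position SU can legally be moved to.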
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+        IsLowLatencyUser = true;
+      }
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
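+    // Place a low latency instruction as early as legally possible:
+    // right after the last user of a previous low latency result,
+    // never before an earlier low latency instruction, and never
+    // before one of its own predecessors (MinPos).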
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    // Also move the COPY instructions that the low latency
+    // instructions depend on.
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      bool CopyForLowLat = false;
+      for (SDep& SuccDep : SU->Succs) {
+        SUnit *Succ = SuccDep.getSUnit();
+        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+          CopyForLowLat = true;
+        }
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnits[i].isScheduled = false;
+    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+  }
+}
+
+// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
+template<typename Iterator> void
+SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
+                                  unsigned &VgprUsage, unsigned &SgprUsage) {
+  VgprUsage = 0;
+  SgprUsage = 0;
+  for (Iterator RegI = First; RegI != End; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = MRI.getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      if (*PSetI == VGPRSetID)
+        VgprUsage += PSetI.getWeight();
+      else if (*PSetI == SGPRSetID)
+        SgprUsage += PSetI.getWeight();
+    }
+  }
+}
+
+void SIScheduleDAGMI::schedule() {
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  SIScheduleBlockResult Best, Temp;
+  DEBUG(dbgs() << "Preparing Scheduling\n");
+
+  buildDAGWithRegPressure();
+  DEBUG(
+    for (SUnit& SU : SUnits)
+      SU.dumpAll(this);
+  );
+
+  Topo.InitDAGTopologicalSorting();
+  topologicalSort();
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+  // functions, but to make them happy we must initialize
+  // the default Scheduler implementation (even if we do not
+  // run it)
+  SchedImpl->initialize(this);
+  initQueues(TopRoots, BotRoots);
+
+  // Fill some stats to help scheduling.
+
+  SUnitsLinksBackup = SUnits;
+  IsLowLatencySU.clear();
+  LowLatencyOffset.clear();
+  IsHighLatencySU.clear();
+
+  IsLowLatencySU.resize(SUnits.size(), 0);
+  LowLatencyOffset.resize(SUnits.size(), 0);
+  IsHighLatencySU.resize(SUnits.size(), 0);
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    unsigned BaseLatReg, OffLatReg;
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      IsLowLatencySU[i] = 1;
+      if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
+                                       OffLatReg, TRI))
+        LowLatencyOffset[i] = OffLatReg;
+    } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
+      IsHighLatencySU[i] = 1;
+  }
+
+  SIScheduler Scheduler(this);
+  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
+#if 0 // To be enabled when the handleMove fix lands.
+  // If VGPR usage is extremely high, try other well-performing
+  // variants which could lead to lower VGPR usage.
+  if (Best.MaxVGPRUsage > 180) {
+    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+      { LatenciesAlone, BlockRegUsageLatency },
+//      { LatenciesAlone, BlockRegUsage },
+      { LatenciesGrouped, BlockLatencyRegUsage },
+//      { LatenciesGrouped, BlockRegUsageLatency },
+//      { LatenciesGrouped, BlockRegUsage },
+      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
+    };
+    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+      Temp = Scheduler.scheduleVariant(v.first, v.second);
+      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+        Best = Temp;
+    }
+  }
+  // If VGPR usage is still extremely high, we may spill. Try other
+  // variants which perform worse, but that could lead to lower VGPR
+  // usage.
+  if (Best.MaxVGPRUsage > 200) {
+    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+//      { LatenciesAlone, BlockRegUsageLatency },
+      { LatenciesAlone, BlockRegUsage },
+//      { LatenciesGrouped, BlockLatencyRegUsage },
+      { LatenciesGrouped, BlockRegUsageLatency },
+      { LatenciesGrouped, BlockRegUsage },
+//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+      { LatenciesAlonePlusConsecutive, BlockRegUsage }
+    };
+    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+      Temp = Scheduler.scheduleVariant(v.first, v.second);
+      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+        Best = Temp;
+    }
+  }
+#endif
+  ScheduledSUnits = Best.SUs;
+  ScheduledSUnitsInv.resize(SUnits.size());
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+  }
+
+  moveLowLatencies();
+
+  // Tell the outside world about the result of the scheduling.
+
+  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+  TopRPTracker.setPos(CurrentTop);
+
+  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+       E = ScheduledSUnits.end(); I != E; ++I) {
+    SUnit *SU = &SUnits[*I];
+
+    scheduleMI(SU, true);
+
+    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+                 << *SU->getInstr());
+  }
+
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+
+  DEBUG({
+      unsigned BBNum = begin()->getParent()->getNumber();
+      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+}
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
new file mode 100644 (file)
index 0000000..b270136
--- /dev/null
@@ -0,0 +1,489 @@
+//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+namespace llvm {
+
+enum SIScheduleCandReason {
+  NoCand,
+  RegUsage,
+  Latency,
+  Successor,
+  Depth,
+  NodeOrder
+};
+
+struct SISchedulerCandidate {
+  // The reason for this candidate.
+  SIScheduleCandReason Reason;
+
+  // Set of reasons that apply to multiple candidates.
+  uint32_t RepeatReasonSet;
+
+  SISchedulerCandidate()
+    : Reason(NoCand), RepeatReasonSet(0) {}
+
+  bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
+  void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
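+// A SIScheduleBlock is a group of SUnits that the scheduler handles
+// as a unit: the SUs are first scheduled inside their block, then
+// the blocks themselves are ordered by SIScheduleBlockScheduler.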
+class SIScheduleBlock {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator *BC;
+
+  std::vector<SUnit*> SUnits;
+  std::map<unsigned, unsigned> NodeNum2Index;
+  std::vector<SUnit*> TopReadySUs;
+  std::vector<SUnit*> ScheduledSUnits;
+
+  /// The top of the unscheduled zone.
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  // Pressure: number of registers of a given class needed to
+  // store the live virtual and physical registers.
+  // We only care about SGPR32 and VGPR32, and we only track
+  // virtual registers.
+  std::vector<unsigned> InternalAdditionnalPressure;
+  // Pressure of input and output registers
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  // Registers required by the block, and its outputs.
+  // We only track virtual registers.
+  // Note that some registers are not 32 bits,
+  // and thus the pressure is not equal
+  // to the number of live registers.
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+  unsigned NumHighLatencySuccessors;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs(), NumHighLatencySuccessors(0) {}
+
+  ~SIScheduleBlock() {}
+
+  unsigned getID() const { return ID; }
+
+  /// Functions for Block construction.
+  void addUnit(SUnit *SU);
+
+  // When all SUs have been added.
+  void finalizeUnits();
+
+  // Add a predecessor block, i.e. a block containing an
+  // instruction predecessor of one of this block's SUs.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+  const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+  unsigned Height;  // Maximum topdown path length to block without outputs
+  unsigned Depth;   // Maximum bottomup path length to block without inputs
+
+  unsigned getNumHighLatencySuccessors() const {
+    return NumHighLatencySuccessors;
+  }
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is an approximation.
+  // Ideally it should take into account that some instructions
+  // (rcp, etc.) are 4 times slower.
+  int getCost() { return SUnits.size(); }
+
+  // All the block's predecessors and successors must be
+  // registered before calling fastSchedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize register
+  // pressure and handle low latencies, and will fill the
+  // liveins and liveouts.
+  // Needs all the MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling; the grouping is
+  // just used to allow correct tracking of the live registers.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+
+  // Needs the block to have been scheduled internally first.
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    unsigned SGPRUsage;
+    unsigned VGPRUsage;
+    bool IsLowLatency;
+    unsigned LowLatencyOffset;
+    bool HasLowLatencyNonWaitedParent;
+
+    SISchedCandidate()
+      : SU(nullptr) {}
+
+    bool isValid() const { return SU; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SISchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      SU = Best.SU;
+      Reason = Best.Reason;
+      SGPRUsage = Best.SGPRUsage;
+      VGPRUsage = Best.VGPRUsage;
+      IsLowLatency = Best.IsLowLatency;
+      LowLatencyOffset = Best.LowLatencyOffset;
+      HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+    }
+  };
+
+  void undoSchedule();
+
+  void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
+  void releaseSucc(SUnit *SU, SDep *SuccEdge);
+  // InOrOutBlock: restrict to links pointing inside the block (true),
+  // or restrict to links pointing outside the block (false).
+  void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+
+  void nodeScheduled(SUnit *SU);
+  void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  SUnit* pickNode();
+  void traceCandidate(const SISchedCandidate &Cand);
+  void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                       MachineBasicBlock::iterator EndBlock);
+};
+
+struct SIScheduleBlocks {
+  std::vector<SIScheduleBlock*> Blocks;
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+};
+
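+// Block creation strategies. The names mirror the coloring passes of
+// SIScheduleBlockCreator: LatenciesAlone presumably gives every high
+// latency instruction its own Reserved color, LatenciesGrouped groups
+// them, and LatenciesAlonePlusConsecutive additionally forces
+// consecutive order inside the groups.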
+enum SISchedulerBlockCreatorVariant {
+    LatenciesAlone,
+    LatenciesGrouped,
+    LatenciesAlonePlusConsecutive
+};
+
+class SIScheduleBlockCreator {
+  SIScheduleDAGMI *DAG;
+  // unique_ptr handles freeing memory for us.
+  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+  std::map<SISchedulerBlockCreatorVariant,
+           SIScheduleBlocks> Blocks;
+  std::vector<SIScheduleBlock*> CurrentBlocks;
+  std::vector<int> Node2CurrentBlock;
+
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+  std::vector<int> BottomUpIndex2Block;
+
+  // 0 -> Color not given.
+  // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
+  // Above -> Other groups.
+  int NextReservedID;
+  int NextNonReservedID;
+  std::vector<int> CurrentColoring;
+  std::vector<int> CurrentTopDownReservedDependencyColoring;
+  std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+  ~SIScheduleBlockCreator();
+
+  SIScheduleBlocks
+  getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+  bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+  // Give a Reserved color to every high latency.
+  void colorHighLatenciesAlone();
+
+  // Create groups of high latencies with a Reserved color.
+  void colorHighLatenciesGroups();
+
+  // Compute coloring for topdown and bottom traversals with
+  // different colors depending on dependencies on Reserved colors.
+  void colorComputeReservedDependencies();
+
+  // Give a color to all non-colored SUs according to their
+  // dependencies on Reserved groups.
+  void colorAccordingToReservedDependencies();
+
+  // Divide the blocks that have no bottom-up or top-down
+  // dependencies on Reserved groups. The new colors are computed
+  // from the dependencies on the other blocks formed by
+  // colorAccordingToReservedDependencies.
+  void colorEndsAccordingToDependencies();
+
+  // Cut groups into subgroups whose SUs are in consecutive
+  // order (except for Reserved groups).
+  void colorForceConsecutiveOrderInGroup();
+
+  // Merge the constant loads whose users are all in another
+  // group into that group.
+  // (TODO: else, if all their users depend on the same group,
+  // put the loads there.)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge the SUs whose users are all in another group into that
+  // group.
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge the SUs whose users are all in another group into that
+  // group, but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge the SUs whose users are all in another group into that
+  // group, but only if the group is no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divide the blocks that are too large.
+  // Implementation idea: assign new colors depending on the
+  // top-down and bottom-up links to other blocks.
+  void cutHugeBlocks();
+
+  // Put all the instructions with no users in this scheduling
+  // region into one group (we'd want these groups to be at the
+  // end).
+  void regroupNoUserInstructions();
+
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+  void topologicalSort();
+
+  void scheduleInsideBlocks();
+
+  void fillStats();
+};
+
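+// Block scheduling priorities (see pickBlock): latency first with
+// register usage as tie-break, register usage first with latency as
+// tie-break, or register usage only.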
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatencyRegUsage,
+  BlockRegUsageLatency,
+  BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
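+  // For each block ID, maps each live-out register to the number of
+  // blocks that will consume it (used to seed LiveRegsConsumers when
+  // the block is scheduled).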
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Num of schedulable unscheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
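+  // For each block ID, the position (in number of blocks scheduled)
+  // at which its last high latency predecessor was scheduled.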
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+  int LastPosWaitedHighLatency;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently only an approximation.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() {}
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
+
+  unsigned getVGPRUsage() { return maxVregUsage; }
+  unsigned getSGPRUsage() { return maxSregUsage; }
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best Block candidate.
+    SIScheduleBlock *Block;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumSuccessors;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+    unsigned Height;
+
+    SIBlockSchedCandidate()
+      : Block(nullptr) {}
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SIBlockSchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      Block = Best.Block;
+      Reason = Best.Reason;
+      IsHighLatency = Best.IsHighLatency;
+      VGPRUsageDiff = Best.VGPRUsageDiff;
+      NumSuccessors = Best.NumSuccessors;
+      NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+      LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+      Height = Best.Height;
+    }
+  };
+
+  bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                           SIBlockSchedCandidate &TryCand);
+  bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                            SIBlockSchedCandidate &TryCand);
+  SIScheduleBlock *pickBlock();
+
+  void addLiveRegs(std::set<unsigned> &Regs);
+  void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+  void releaseBlockSuccs(SIScheduleBlock *Parent);
+  void blockScheduled(SIScheduleBlock *Block);
+
+  // Check the register pressure change caused by scheduling
+  // a block with these LiveIn and LiveOut registers.
+  std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                       std::set<unsigned> &OutRegs);
+
+  void schedule();
+};
+
+struct SIScheduleBlockResult {
+  std::vector<unsigned> SUs;
+  unsigned MaxSGPRUsage;
+  unsigned MaxVGPRUsage;
+};
+
+class SIScheduler {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator BlockCreator;
+
+public:
+  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
+
+  ~SIScheduler() {}
+
+  struct SIScheduleBlockResult
+  scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                  SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+class SIScheduleDAGMI : public ScheduleDAGMILive {
+  const SIInstrInfo *SITII;
+  const SIRegisterInfo *SITRI;
+
+  std::vector<SUnit> SUnitsLinksBackup;
+
+  // For moveLowLatencies; filled once all the scheduling
+  // variants have been tested.
+  std::vector<unsigned> ScheduledSUnits;
+  std::vector<unsigned> ScheduledSUnitsInv;
+
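+  // Register pressure set IDs for VGPR_32 and SGPR_32, obtained from
+  // SIRegisterInfo in the constructor.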
+  unsigned VGPRSetID;
+  unsigned SGPRSetID;
+
+public:
+  SIScheduleDAGMI(MachineSchedContext *C);
+
+  ~SIScheduleDAGMI() override;
+
+  // Entry point for the schedule.
+  void schedule() override;
+
+  // To init Block's RPTracker.
+  void initRPTracker(RegPressureTracker &RPTracker) {
+    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  }
+
+  MachineBasicBlock *getBB() { return BB; }
+  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
+  LiveIntervals *getLIS() { return LIS; }
+  MachineRegisterInfo *getMRI() { return &MRI; }
+  const TargetRegisterInfo *getTRI() { return TRI; }
+  SUnit& getEntrySU() { return EntrySU; }
+  SUnit& getExitSU() { return ExitSU; }
+
+  void restoreSULinksLeft();
+
+  template<typename Iterator> void fillVgprSgprCost(Iterator First,
+                                                    Iterator End,
+                                                    unsigned &VgprUsage,
+                                                    unsigned &SgprUsage);
+  std::set<unsigned> getInRegs() {
+    std::set<unsigned> InRegs(RPTracker.getPressure().LiveInRegs.begin(),
+                              RPTracker.getPressure().LiveInRegs.end());
+    return InRegs;
+  }
+
+  unsigned getVGPRSetID() const { return VGPRSetID; }
+  unsigned getSGPRSetID() const { return SGPRSetID; }
+
+private:
+  void topologicalSort();
+  // After scheduling is done, improve low latency placements.
+  void moveLowLatencies();
+
+public:
+  // Some stats for scheduling inside blocks.
+  std::vector<unsigned> IsLowLatencySU;
+  std::vector<unsigned> LowLatencyOffset;
+  std::vector<unsigned> IsHighLatencySU;
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2SU;
+  std::vector<int> BottomUpIndex2SU;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
index 2d68b8b5df0e2e9c4fe2eb69e93ff234a58a6b96..609f5e7df5493f6f10646d0831c66261cc883c5a 100644 (file)
 
 using namespace llvm;
 
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {}
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
+  unsigned NumRegPressureSets = getNumRegPressureSets();
+
+  SGPR32SetID = NumRegPressureSets;
+  VGPR32SetID = NumRegPressureSets;
+  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
+    if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
+      SGPR32SetID = i;
+    else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
+      VGPR32SetID = i;
+  }
+  assert(SGPR32SetID < NumRegPressureSets &&
+         VGPR32SetID < NumRegPressureSets);
+}
 
 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
   MCRegAliasIterator R(Reg, this, true);
index 1795237c214016fff53d2cb781fcef1a20512122..9410e2049cba7d67733788f5a8b50a50fe1d5dd8 100644 (file)
@@ -25,6 +25,9 @@ namespace llvm {
 
 struct SIRegisterInfo : public AMDGPURegisterInfo {
 private:
+  unsigned SGPR32SetID;
+  unsigned VGPR32SetID;
+
   void reserveRegisterTuples(BitVector &, unsigned Reg) const;
 
 public:
@@ -146,6 +149,9 @@ public:
   unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC) const;
 
+  unsigned getSGPR32PressureSet() const { return SGPR32SetID; }
+  unsigned getVGPR32PressureSet() const { return VGPR32SetID; }
+
 private:
   void buildScratchLoadStore(MachineBasicBlock::iterator MI,
                              unsigned LoadStoreOp, unsigned Value,
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
new file mode 100644 (file)
index 0000000..66a9571
--- /dev/null
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
+
+; This test checks that the "si" machine scheduler pass works correctly.
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_wqm
+; CHECK: s_load_dwordx4
+; CHECK: s_load_dwordx8
+; CHECK: s_waitcnt lgkmcnt(0)
+; CHECK: image_sample
+; CHECK: s_waitcnt vmcnt(0)
+; CHECK: exp
+; CHECK: s_endpgm
+
+define void @main([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, 
+<2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+main_body:
+  %22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
+  %23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
+  %24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
+  %25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
+  %26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
+  %27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
+  %28 = bitcast float %26 to i32
+  %29 = bitcast float %27 to i32
+  %30 = insertelement <2 x i32> undef, i32 %28, i32 0
+  %31 = insertelement <2 x i32> %30, i32 %29, i32 1
+  %32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
+  %33 = extractelement <4 x float> %32, i32 0
+  %34 = extractelement <4 x float> %32, i32 1
+  %35 = extractelement <4 x float> %32, i32 2
+  %36 = extractelement <4 x float> %32, i32 3
+  %37 = call i32 @llvm.SI.packf16(float %33, float %34)
+  %38 = bitcast i32 %37 to float
+  %39 = call i32 @llvm.SI.packf16(float %35, float %36)
+  %40 = bitcast i32 %39 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %38, float %40, float %38, float %40)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}