MachineScheduler: Add a target hook for deciding which RegPressure sets to

[oota-llvm.git] / lib / CodeGen / MachineScheduler.cpp
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp

index cc8f3a075e9e3d425a8921308d49169c04971757..bcee15c7c75f76be2e8c368715283fae518e0a2d 100644 (file)
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -49,6 +49,11 @@ DumpCriticalPathLength("misched-dcpl", cl::Hidden,
  static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
    cl::desc("Pop up a window to show MISched dags after they are processed"));
  
+/// In some situations a few uninteresting nodes depend on nearly all other
+/// nodes in the graph, provide a cutoff to hide them.
+static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
+  cl::desc("Hide nodes with more predecessor/successor than cutoff"));
+
  static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
    cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
  
@@ -106,7 +111,7 @@ public:
    void print(raw_ostream &O, const Module* = nullptr) const override;
  
  protected:
-  void scheduleRegions(ScheduleDAGInstrs &Scheduler);
+  void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
  };
  
  /// MachineScheduler runs after coalescing and before register allocation.
@@ -144,12 +149,12 @@ char MachineScheduler::ID = 0;
  
  char &llvm::MachineSchedulerID = MachineScheduler::ID;
  
-INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
+INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
                        "Machine Instruction Scheduler", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
  INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(MachineScheduler, "misched",
+INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
                      "Machine Instruction Scheduler", false, false)
  
  MachineScheduler::MachineScheduler()
@@ -161,7 +166,7 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.setPreservesCFG();
    AU.addRequiredID(MachineDominatorsID);
    AU.addRequired<MachineLoopInfo>();
-  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<SlotIndexes>();
    AU.addPreserved<SlotIndexes>();
@@ -209,6 +214,11 @@ static MachineSchedRegistry
  DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
                       useDefaultMachineSched);
  
+static cl::opt<bool> EnableMachineSched(
+    "enable-misched",
+    cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
+    cl::Hidden);
+
  /// Forward declare the standard machine scheduler. This will be used as the
  /// default scheduler if the target does not set a default.
  static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C);
@@ -304,14 +314,20 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
  /// design would be to split blocks at scheduling boundaries, but LLVM has a
  /// general bias against block splitting purely for implementation simplicity.
  bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
-  DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs()));
+  if (EnableMachineSched.getNumOccurrences()) {
+    if (!EnableMachineSched)
+      return false;
+  } else if (!mf.getSubtarget().enableMachineScheduler())
+    return false;
+
+  DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
  
    // Initialize the context of the pass.
    MF = &mf;
    MLI = &getAnalysis<MachineLoopInfo>();
    MDT = &getAnalysis<MachineDominatorTree>();
    PassConfig = &getAnalysis<TargetPassConfig>();
-  AA = &getAnalysis<AliasAnalysis>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  
    LIS = &getAnalysis<LiveIntervals>();
  
@@ -324,7 +340,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
    // Instantiate the selected scheduler for this target, function, and
    // optimization level.
    std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
-  scheduleRegions(*Scheduler);
+  scheduleRegions(*Scheduler, false);
  
    DEBUG(LIS->dump());
    if (VerifyScheduling)
@@ -336,9 +352,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
    if (skipOptnoneFunction(*mf.getFunction()))
      return false;
  
-  const TargetSubtargetInfo &ST =
-    mf.getTarget().getSubtarget<TargetSubtargetInfo>();
-  if (!ST.enablePostMachineScheduler()) {
+  if (!mf.getSubtarget().enablePostRAScheduler()) {
      DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
      return false;
    }
@@ -354,7 +368,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
    // Instantiate the selected scheduler for this target, function, and
    // optimization level.
    std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
-  scheduleRegions(*Scheduler);
+  scheduleRegions(*Scheduler, true);
  
    if (VerifyScheduling)
      MF->verify(this, "After post machine scheduling.");
@@ -374,15 +388,14 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
  static bool isSchedBoundary(MachineBasicBlock::iterator MI,
                              MachineBasicBlock *MBB,
                              MachineFunction *MF,
-                            const TargetInstrInfo *TII,
-                            bool IsPostRA) {
+                            const TargetInstrInfo *TII) {
    return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF);
  }
  
  /// Main driver for both MachineScheduler and PostMachineScheduler.
-void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
+void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
+                                           bool FixKillFlags) {
    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-  bool IsPostRA = Scheduler.isPostRA();
  
    // Visit all machine basic blocks.
    //
@@ -391,7 +404,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
    for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
         MBB != MBBEnd; ++MBB) {
  
-    Scheduler.startBlock(MBB);
+    Scheduler.startBlock(&*MBB);
  
  #ifndef NDEBUG
      if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
@@ -420,7 +433,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
  
        // Avoid decrementing RegionEnd for blocks with no terminator.
        if (RegionEnd != MBB->end() ||
-          isSchedBoundary(std::prev(RegionEnd), MBB, MF, TII, IsPostRA)) {
+          isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
          --RegionEnd;
          // Count the boundary instruction.
          --RemainingInstrs;
@@ -431,14 +444,14 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
        unsigned NumRegionInstrs = 0;
        MachineBasicBlock::iterator I = RegionEnd;
        for(;I != MBB->begin(); --I, --RemainingInstrs) {
-        if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA))
+        if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII))
            break;
          if (!I->isDebugValue())
            ++NumRegionInstrs;
        }
        // Notify the scheduler of the region, even if we may skip scheduling
        // it. Perhaps it still needs to be bundled.
-      Scheduler.enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
+      Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
  
        // Skip empty scheduling regions (0 or 1 schedulable instructions).
        if (I == RegionEnd || I == std::prev(RegionEnd)) {
@@ -447,8 +460,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
          Scheduler.exitRegion();
          continue;
        }
-      DEBUG(dbgs() << "********** " << ((Scheduler.isPostRA()) ? "PostRA " : "")
-            << "MI Scheduling **********\n");
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
        DEBUG(dbgs() << MF->getName()
              << ":BB#" << MBB->getNumber() << " " << MBB->getName()
              << "\n  From: " << *I << "    To: ";
@@ -475,11 +487,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
      }
      assert(RemainingInstrs == 0 && "Instruction count mismatch!");
      Scheduler.finishBlock();
-    if (Scheduler.isPostRA()) {
-      // FIXME: Ideally, no further passes should rely on kill flags. However,
-      // thumb2 size reduction is currently an exception.
-      Scheduler.fixupKills(MBB);
-    }
+    // FIXME: Ideally, no further passes should rely on kill flags. However,
+    // thumb2 size reduction is currently an exception, so the PostMIScheduler
+    // needs to do this.
+    if (FixKillFlags)
+        Scheduler.fixupKills(&*MBB);
    }
    Scheduler.finalizeSchedule();
  }
@@ -490,7 +502,7 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
  
  LLVM_DUMP_METHOD
  void ReadyQueue::dump() {
-  dbgs() << Name << ": ";
+  dbgs() << "Queue " << Name << ": ";
    for (unsigned i = 0, e = Queue.size(); i < e; ++i)
      dbgs() << Queue[i]->NodeNum << " ";
    dbgs() << "\n";
@@ -651,6 +663,9 @@ bool ScheduleDAGMI::checkSchedLimit() {
  /// does not consider liveness or register pressure. It is useful for PostRA
  /// scheduling and potentially other custom schedulers.
  void ScheduleDAGMI::schedule() {
+  DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
+  DEBUG(SchedImpl->dumpPolicy());
+
    // Build the DAG.
    buildSchedGraph(AA);
  
@@ -673,7 +688,11 @@ void ScheduleDAGMI::schedule() {
    initQueues(TopRoots, BotRoots);
  
    bool IsTopNode = false;
-  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+  while (true) {
+    DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
+    SUnit *SU = SchedImpl->pickNode(IsTopNode);
+    if (!SU) break;
+
      assert(!SU->isScheduled && "Node already scheduled");
      if (!checkSchedLimit())
        break;
@@ -891,6 +910,13 @@ void ScheduleDAGMILive::initRegPressure() {
      updatePressureDiffs(LiveUses);
    }
  
+  DEBUG(
+    dbgs() << "Top Pressure:\n";
+    dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+    dbgs() << "Bottom Pressure:\n";
+    dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
+  );
+
    assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
  
    // Cache the list of excess pressure sets in this region. This will also track
@@ -934,8 +960,9 @@ updateScheduledPressure(const SUnit *SU,
      unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
      if (NewMaxPressure[ID] >= Limit - 2) {
        DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
-            << NewMaxPressure[ID] << " > " << Limit << "(+ "
-            << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+            << NewMaxPressure[ID]
+            << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ") << Limit
+            << "(+ " << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
      }
    }
  }
@@ -966,18 +993,24 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
      }
      // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
      assert(VNI && "No live value at use.");
-    for (VReg2UseMap::iterator
-           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
-      SUnit *SU = UI->SU;
-      DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
-            << *SU->getInstr());
+    for (const VReg2SUnit &V2SU
+         : make_range(VRegUses.find(Reg), VRegUses.end())) {
+      SUnit *SU = V2SU.SU;
        // If this use comes before the reaching def, it cannot be a last use, so
        // descrease its pressure change.
        if (!SU->isScheduled && SU != &ExitSU) {
          LiveQueryResult LRQ
            = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
-        if (LRQ.valueIn() == VNI)
-          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
+        if (LRQ.valueIn() == VNI) {
+          PressureDiff &PDiff = getPressureDiff(SU);
+          PDiff.addPressureChange(Reg, true, &MRI);
+          DEBUG(
+            dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
+                   << *SU->getInstr();
+            dbgs() << "              to ";
+            PDiff.dump(*TRI);
+          );
+        }
        }
      }
    }
@@ -988,12 +1021,14 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
  /// only includes instructions that have DAG nodes, not scheduling boundaries.
  ///
  /// This is a skeletal driver, with all the functionality pushed into helpers,
-/// so that it can be easilly extended by experimental schedulers. Generally,
+/// so that it can be easily extended by experimental schedulers. Generally,
  /// implementing MachineSchedStrategy should be sufficient to implement a new
  /// scheduling algorithm. However, if a scheduler further subclasses
  /// ScheduleDAGMILive then it will want to override this virtual method in order
  /// to update any specialized state.
  void ScheduleDAGMILive::schedule() {
+  DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
+  DEBUG(SchedImpl->dumpPolicy());
    buildDAGWithRegPressure();
  
    Topo.InitDAGTopologicalSorting();
@@ -1007,8 +1042,16 @@ void ScheduleDAGMILive::schedule() {
    // This may initialize a DFSResult to be used for queue priority.
    SchedImpl->initialize(this);
  
-  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
-          SUnits[su].dumpAll(this));
+  DEBUG(
+    for (const SUnit &SU : SUnits) {
+      SU.dumpAll(this);
+      if (ShouldTrackPressure) {
+        dbgs() << "  Pressure Diff      : ";
+        getPressureDiff(&SU).dump(*TRI);
+      }
+      dbgs() << '\n';
+    }
+  );
    if (ViewMISchedDAGs) viewGraph();
  
    // Initialize ready queues now that the DAG and priority data are finalized.
@@ -1020,15 +1063,17 @@ void ScheduleDAGMILive::schedule() {
    }
  
    bool IsTopNode = false;
-  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+  while (true) {
+    DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
+    SUnit *SU = SchedImpl->pickNode(IsTopNode);
+    if (!SU) break;
+
      assert(!SU->isScheduled && "Node already scheduled");
      if (!checkSchedLimit())
        break;
  
      scheduleMI(SU, IsTopNode);
  
-    updateQueues(SU, IsTopNode);
-
      if (DFSResult) {
        unsigned SubtreeID = DFSResult->getSubtreeID(SU);
        if (!ScheduledTrees.test(SubtreeID)) {
@@ -1040,6 +1085,8 @@ void ScheduleDAGMILive::schedule() {
  
      // Notify the scheduling strategy after updating the DAG.
      SchedImpl->schedNode(SU, IsTopNode);
+
+    updateQueues(SU, IsTopNode);
    }
    assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
  
@@ -1139,14 +1186,15 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
      unsigned LiveOutHeight = DefSU->getHeight();
      unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
      // Visit all local users of the vreg def.
-    for (VReg2UseMap::iterator
-           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
-      if (UI->SU == &ExitSU)
+    for (const VReg2SUnit &V2SU
+         : make_range(VRegUses.find(Reg), VRegUses.end())) {
+      SUnit *SU = V2SU.SU;
+      if (SU == &ExitSU)
          continue;
  
        // Only consider uses of the phi.
        LiveQueryResult LRQ =
-        LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
+        LI.Query(LIS->getInstructionIndex(SU->getInstr()));
        if (!LRQ.valueIn()->isPHIDef())
          continue;
  
@@ -1154,10 +1202,10 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
        // overestimate in strange cases. This allows cyclic latency to be
        // estimated as the minimum slack of the vreg's depth or height.
        unsigned CyclicLatency = 0;
-      if (LiveOutDepth > UI->SU->getDepth())
-        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+      if (LiveOutDepth > SU->getDepth())
+        CyclicLatency = LiveOutDepth - SU->getDepth();
  
-      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+      unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
        if (LiveInHeight > LiveOutHeight) {
          if (LiveInHeight - LiveOutHeight < CyclicLatency)
            CyclicLatency = LiveInHeight - LiveOutHeight;
@@ -1166,7 +1214,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
          CyclicLatency = 0;
  
        DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
-            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+            << SU->NodeNum << ") = " << CyclicLatency << "c\n");
        if (CyclicLatency > MaxCyclicLatency)
          MaxCyclicLatency = CyclicLatency;
      }
@@ -1193,6 +1241,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
        // Update top scheduled pressure.
        TopRPTracker.advance();
        assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      DEBUG(
+        dbgs() << "Top Pressure:\n";
+        dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+      );
+
        updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
      }
    }
@@ -1215,6 +1268,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
        SmallVector<unsigned, 8> LiveUses;
        BotRPTracker.recede(&LiveUses);
        assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      DEBUG(
+        dbgs() << "Bottom Pressure:\n";
+        dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
+      );
+
        updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
        updatePressureDiffs(LiveUses);
      }
@@ -1261,7 +1319,7 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
      SUnit *SU = Loads[Idx];
      unsigned BaseReg;
      unsigned Offset;
-    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
+    if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
        LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
    }
    if (LoadRecords.size() < 2)
@@ -1339,25 +1397,49 @@ namespace {
  /// \brief Post-process the DAG to create cluster edges between instructions
  /// that may be fused by the processor into a single operation.
  class MacroFusion : public ScheduleDAGMutation {
-  const TargetInstrInfo *TII;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
  public:
-  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
+  MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
+    : TII(TII), TRI(TRI) {}
  
    void apply(ScheduleDAGMI *DAG) override;
  };
  } // anonymous
  
+/// Returns true if \p MI reads a register written by \p Other.
+static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
+                       const MachineInstr &Other) {
+  for (const MachineOperand &MO : MI.uses()) {
+    if (!MO.isReg() || !MO.readsReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Other.modifiesRegister(Reg, &TRI))
+      return true;
+  }
+  return false;
+}
+
  /// \brief Callback from DAG postProcessing to create cluster edges to encourage
  /// fused operations.
  void MacroFusion::apply(ScheduleDAGMI *DAG) {
    // For now, assume targets can only fuse with the branch.
-  MachineInstr *Branch = DAG->ExitSU.getInstr();
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
    if (!Branch)
      return;
  
-  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
-    SUnit *SU = &DAG->SUnits[--Idx];
-    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+  for (SUnit &SU : DAG->SUnits) {
+    // SUnits with successors can't be schedule in front of the ExitSU.
+    if (!SU.Succs.empty())
+      continue;
+    // We only care if the node writes to a register that the branch reads.
+    MachineInstr *Pred = SU.getInstr();
+    if (!HasDataDep(TRI, *Branch, *Pred))
+      continue;
+
+    if (!TII.shouldScheduleAdjacent(Pred, Branch))
        continue;
  
      // Create a single weak edge from SU to ExitSU. The only effect is to cause
@@ -1366,11 +1448,11 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) {
      // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
      // of SU, we could create an artificial edge from the deepest root, but it
      // hasn't been needed yet.
-    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
      (void)Success;
      assert(Success && "No DAG nodes should be reachable from ExitSU");
  
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
+    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
      break;
    }
  }
@@ -1434,12 +1516,15 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
    // Check if either the dest or source is local. If it's live across a back
    // edge, it's not local. Note that if both vregs are live across the back
    // edge, we cannot successfully contrain the copy without cyclic scheduling.
-  unsigned LocalReg = DstReg;
-  unsigned GlobalReg = SrcReg;
+  // If both the copy's source and dest are local live intervals, then we
+  // should treat the dest as the global for the purpose of adding
+  // constraints. This adds edges from source's other uses to the copy.
+  unsigned LocalReg = SrcReg;
+  unsigned GlobalReg = DstReg;
    LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
-    LocalReg = SrcReg;
-    GlobalReg = DstReg;
+    LocalReg = DstReg;
+    GlobalReg = SrcReg;
      LocalLI = &LIS->getInterval(LocalReg);
      if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
        return;
@@ -2137,7 +2222,7 @@ void GenericSchedulerBase::setPolicy(CandPolicy &Policy,
                                       bool IsPostRA,
                                       SchedBoundary &CurrZone,
                                       SchedBoundary *OtherZone) {
-  // Apply preemptive heuristics based on the the total latency and resources
+  // Apply preemptive heuristics based on the total latency and resources
    // inside and outside this zone. Potential stalls should be considered before
    // following this policy.
  
@@ -2264,7 +2349,7 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
      Latency = Cand.SU->getDepth();
      break;
    }
-  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << "  Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
    if (P.isValid())
      dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
             << ":" << P.getUnitInc() << " ";
@@ -2425,6 +2510,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
    }
  }
  
+void GenericScheduler::dumpPolicy() {
+  dbgs() << "GenericScheduler RegionPolicy: "
+         << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
+         << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
+         << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
+         << "\n";
+}
+
  /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
  /// critical path by more cycles than it takes to drain the instruction buffer.
  /// We estimate an upper bounds on in-flight instructions as:
@@ -2486,11 +2579,13 @@ static bool tryPressure(const PressureChange &TryP,
                          const PressureChange &CandP,
                          GenericSchedulerBase::SchedCandidate &TryCand,
                          GenericSchedulerBase::SchedCandidate &Cand,
-                        GenericSchedulerBase::CandReason Reason) {
-  int TryRank = TryP.getPSetOrMax();
-  int CandRank = CandP.getPSetOrMax();
+                        GenericSchedulerBase::CandReason Reason,
+                        const TargetRegisterInfo *TRI,
+                        const MachineFunction &MF) {
+  unsigned TryPSet = TryP.getPSetOrMax();
+  unsigned CandPSet = CandP.getPSetOrMax();
    // If both candidates affect the same set, go with the smallest increase.
-  if (TryRank == CandRank) {
+  if (TryPSet == CandPSet) {
      return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
                     Reason);
    }
@@ -2500,6 +2595,13 @@ static bool tryPressure(const PressureChange &TryP,
                   Reason)) {
      return true;
    }
+
+  int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
+                                 std::numeric_limits<int>::max();
+
+  int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
+                                   std::numeric_limits<int>::max();
+
    // If the candidates are decreasing pressure, reverse priority.
    if (TryP.getUnitInc() < 0)
      std::swap(TryRank, CandRank);
@@ -2584,7 +2686,7 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
      }
    }
    DEBUG(if (TryCand.RPDelta.Excess.isValid())
-          dbgs() << "  SU(" << TryCand.SU->NodeNum << ") "
+          dbgs() << "  Try  SU(" << TryCand.SU->NodeNum << ") "
                   << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
                   << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
  
@@ -2599,17 +2701,18 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
                   TryCand, Cand, PhysRegCopy))
      return;
  
-  // Avoid exceeding the target's limit. If signed PSetID is negative, it is
-  // invalid; convert it to INT_MAX to give it lowest priority.
+  // Avoid exceeding the target's limit.
    if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
                                                 Cand.RPDelta.Excess,
-                                               TryCand, Cand, RegExcess))
+                                               TryCand, Cand, RegExcess, TRI,
+                                               DAG->MF))
      return;
  
    // Avoid increasing the max critical pressure in the scheduled region.
    if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
                                                 Cand.RPDelta.CriticalMax,
-                                               TryCand, Cand, RegCritical))
+                                               TryCand, Cand, RegCritical, TRI,
+                                               DAG->MF))
      return;
  
    // For loops that are acyclic path limited, aggressively schedule for latency.
@@ -2645,7 +2748,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
    // Avoid increasing the max pressure of the entire region.
    if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
                                                 Cand.RPDelta.CurrentMax,
-                                               TryCand, Cand, RegMax))
+                                               TryCand, Cand, RegMax, TRI,
+                                               DAG->MF))
      return;
  
    // Avoid critical resource consumption and balance the schedule.
@@ -2660,8 +2764,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
  
    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
-  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
-      && tryLatency(TryCand, Cand, Zone)) {
+  if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency &&
+      !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) {
      return;
    }
  
@@ -2715,12 +2819,12 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
    // efficient, but also provides the best heuristics for CriticalPSets.
    if (SUnit *SU = Bot.pickOnlyChoice()) {
      IsTopNode = false;
-    DEBUG(dbgs() << "Pick Bot NOCAND\n");
+    DEBUG(dbgs() << "Pick Bot ONLY1\n");
      return SU;
    }
    if (SUnit *SU = Top.pickOnlyChoice()) {
      IsTopNode = true;
-    DEBUG(dbgs() << "Pick Top NOCAND\n");
+    DEBUG(dbgs() << "Pick Top ONLY1\n");
      return SU;
    }
    CandPolicy NoPolicy;
@@ -2875,7 +2979,7 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
    if (EnableLoadCluster && DAG->TII->enableClusterLoads())
      DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
    if (EnableMacroFusion)
-    DAG->addMutation(make_unique<MacroFusion>(DAG->TII));
+    DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
    return DAG;
  }
  
@@ -3242,12 +3346,10 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
    }
  
    static bool isNodeHidden(const SUnit *Node) {
-    return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
-  }
-
-  static bool hasNodeAddressLabel(const SUnit *Node,
-                                  const ScheduleDAG *Graph) {
-    return false;
+    if (ViewMISchedCutoff == 0)
+      return false;
+    return (Node->Preds.size() > ViewMISchedCutoff
+         || Node->Succs.size() > ViewMISchedCutoff);
    }
  
    /// If you want to override the dot attributes printed for a particular