Remove "localize global" optimization

[oota-llvm.git] / lib / CodeGen / ScheduleDAGInstrs.cpp
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp

index 3960c57fa227b2b238ffbdcacfa75f000ddb6968..d940dbcf9f285315ac019779c29c15d8ea0399cb 100644 (file)
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetRegisterInfo.h"
  #include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
  using namespace llvm;
  
  static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -98,7 +100,7 @@ static void getUnderlyingObjects(const Value *V,
      SmallVector<Value *, 4> Objs;
      GetUnderlyingObjects(const_cast<Value *>(V), Objs);
  
-    for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
+    for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
           I != IE; ++I) {
        V = *I;
        if (!Visited.insert(V))
@@ -116,12 +118,15 @@ static void getUnderlyingObjects(const Value *V,
    } while (!Working.empty());
  }
  
+typedef SmallVector<PointerIntPair<const Value *, 1, bool>, 4>
+UnderlyingObjectsVector;
+
  /// getUnderlyingObjectsForInstr - If this machine instr has memory reference
  /// information and it can be tracked to a normal reference to a known
  /// object, return the Value for that object.
  static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
-              const MachineFrameInfo *MFI,
-              SmallVectorImpl<std::pair<const Value *, bool> > &Objects) {
+                                         const MachineFrameInfo *MFI,
+                                         UnderlyingObjectsVector &Objects) {
    if (!MI->hasOneMemOperand() ||
        !(*MI->memoperands_begin())->getValue() ||
        (*MI->memoperands_begin())->isVolatile())
@@ -134,8 +139,8 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
    SmallVector<Value *, 4> Objs;
    getUnderlyingObjects(V, Objs);
  
-  for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
-       I != IE; ++I) {
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
+         I != IE; ++I) {
      bool MayAlias = true;
      V = *I;
  
@@ -155,7 +160,7 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
        return;
      }
  
-    Objects.push_back(std::make_pair(V, MayAlias));
+    Objects.push_back(UnderlyingObjectsVector::value_type(V, MayAlias));
    }
  }
  
@@ -175,14 +180,11 @@ void ScheduleDAGInstrs::finishBlock() {
  void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                      MachineBasicBlock::iterator begin,
                                      MachineBasicBlock::iterator end,
-                                    unsigned endcount) {
+                                    unsigned regioninstrs) {
    assert(bb == BB && "startBlock should set BB");
    RegionBegin = begin;
    RegionEnd = end;
-  EndIndex = endcount;
-  MISUnitMap.clear();
-
-  ScheduleDAG::clearDAG();
+  NumRegionInstrs = regioninstrs;
  }
  
  /// Close the current scheduling region. Don't clear any state in case the
@@ -262,15 +264,15 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
        if (UseOp < 0)
          Dep = SDep(SU, SDep::Artificial);
        else {
+        // Set the hasPhysRegDefs only for physreg defs that have a use within
+        // the scheduling region.
+        SU->hasPhysRegDefs = true;
          Dep = SDep(SU, SDep::Data, *Alias);
          RegUse = UseSU->getInstr();
-        Dep.setMinLatency(
-          SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
-                                           RegUse, UseOp, /*FindMin=*/true));
        }
        Dep.setLatency(
-        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
-                                         RegUse, UseOp, /*FindMin=*/false));
+        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
+                                         UseOp));
  
        ST.adjustSchedDependency(SU, UseSU, Dep);
        UseSU->addPred(Dep);
@@ -307,10 +309,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
            DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
          else {
            SDep Dep(SU, Kind, /*Reg=*/*Alias);
-          unsigned OutLatency =
-            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
-          Dep.setMinLatency(OutLatency);
-          Dep.setLatency(OutLatency);
+          Dep.setLatency(
+            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
            DefSU->addPred(Dep);
          }
        }
@@ -318,6 +318,7 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
    }
  
    if (!MO.isDef()) {
+    SU->hasPhysRegUses = true;
      // Either insert a new Reg2SUnits entry with an empty SUnits list, or
      // retrieve the existing SUnits list for this register's uses.
      // Push this SUnit on the use list.
@@ -385,10 +386,8 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
      SUnit *DefSU = DefI->SU;
      if (DefSU != SU && DefSU != &ExitSU) {
        SDep Dep(SU, SDep::Output, Reg);
-      unsigned OutLatency =
-        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
-      Dep.setMinLatency(OutLatency);
-      Dep.setLatency(OutLatency);
+      Dep.setLatency(
+        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
        DefSU->addPred(Dep);
      }
      DefI->SU = SU;
@@ -405,6 +404,15 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
    MachineInstr *MI = SU->getInstr();
    unsigned Reg = MI->getOperand(OperIdx).getReg();
  
+  // Record this local VReg use.
+  VReg2UseMap::iterator UI = VRegUses.find(Reg);
+  for (; UI != VRegUses.end(); ++UI) {
+    if (UI->SU == SU)
+      break;
+  }
+  if (UI == VRegUses.end())
+    VRegUses.insert(VReg2SUnit(Reg, SU));
+
    // Lookup this operand's reaching definition.
    assert(LIS && "vreg dependencies requires LiveIntervals");
    LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
@@ -423,10 +431,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
        // Adjust the dependence latency using operand def/use information, then
        // allow the target to perform its own adjustments.
        int DefOp = Def->findRegisterDefOperandIdx(Reg);
-      dep.setLatency(
-        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false));
-      dep.setMinLatency(
-        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true));
+      dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
  
        const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
        ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
@@ -468,8 +473,8 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI,
  
    SmallVector<Value *, 4> Objs;
    getUnderlyingObjects(V, Objs);
-  for (SmallVector<Value *, 4>::iterator I = Objs.begin(),
-       IE = Objs.end(); I != IE; ++I) {
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(),
+         IE = Objs.end(); I != IE; ++I) {
      V = *I;
  
      if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
@@ -638,8 +643,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
                           bool isNormalMemory = false) {
    // If this is a false dependency,
    // do not add the edge, but rememeber the rejected node.
-  if (!EnableAASchedMI ||
-      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+  if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
      SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
      Dep.setLatency(TrueMemOrderLatency);
      SUb->addPred(Dep);
@@ -667,7 +671,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
  void ScheduleDAGInstrs::initSUnits() {
    // We'll be allocating one SUnit for each real instruction in the region,
    // which is contained within a basic block.
-  SUnits.reserve(BB->size());
+  SUnits.reserve(NumRegionInstrs);
  
    for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
      MachineInstr *MI = I;
@@ -689,10 +693,22 @@ void ScheduleDAGInstrs::initSUnits() {
  /// DAG builder is an efficient place to do it because it already visits
  /// operands.
  void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
-                                        RegPressureTracker *RPTracker) {
+                                        RegPressureTracker *RPTracker,
+                                        PressureDiffs *PDiffs) {
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+                                                       : ST.useAA();
+  AliasAnalysis *AAForDep = UseAA ? AA : 0;
+
+  MISUnitMap.clear();
+  ScheduleDAG::clearDAG();
+
    // Create an SUnit for each real instruction.
    initSUnits();
  
+  if (PDiffs)
+    PDiffs->init(SUnits.size());
+
    // We build scheduling units by walking a block's instruction list from bottom
    // to top.
  
@@ -718,10 +734,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
    Uses.setUniverse(TRI->getNumRegs());
  
    assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
-  // FIXME: Allow SparseSet to reserve space for the creation of virtual
-  // registers during scheduling. Don't artificially inflate the Universe
-  // because we want to assert that vregs are not created during DAG building.
+  VRegUses.clear();
    VRegDefs.setUniverse(MRI.getNumVirtRegs());
+  VRegUses.setUniverse(MRI.getNumVirtRegs());
  
    // Model data dependencies between instructions being scheduled and the
    // ExitSU.
@@ -741,17 +756,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
        DbgMI = MI;
        continue;
      }
+    SUnit *SU = MISUnitMap[MI];
+    assert(SU && "No SUnit mapped to this MI");
+
      if (RPTracker) {
-      RPTracker->recede();
+      PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : 0;
+      RPTracker->recede(/*LiveUses=*/0, PDiff);
        assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
      }
  
-    assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() &&
+    assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) &&
             "Cannot schedule terminators or labels!");
  
-    SUnit *SU = MISUnitMap[MI];
-    assert(SU && "No SUnit mapped to this MI");
-
      // Add register-based dependencies (data, anti, and output).
      bool HasVRegDef = false;
      for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
@@ -829,20 +845,20 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
          unsigned ChainLatency = 0;
          if (AliasChain->getInstr()->mayLoad())
            ChainLatency = TrueMemOrderLatency;
-        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes,
                             ChainLatency);
        }
        AliasChain = SU;
        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                             TrueMemOrderLatency);
        for (MapVector<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
             E = AliasMemDefs.end(); I != E; ++I)
-        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+        addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
        for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
             AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
          for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes,
                               TrueMemOrderLatency);
        }
        adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -851,7 +867,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
        AliasMemDefs.clear();
        AliasMemUses.clear();
      } else if (MI->mayStore()) {
-      SmallVector<std::pair<const Value *, bool>, 4> Objs;
+      UnderlyingObjectsVector Objs;
        getUnderlyingObjectsForInstr(MI, MFI, Objs);
  
        if (Objs.empty()) {
@@ -860,10 +876,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
        }
  
        bool MayAlias = false;
-      for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
-           K = Objs.begin(), KE = Objs.end(); K != KE; ++K) {
-        const Value *V = K->first;
-        bool ThisMayAlias = K->second;
+      for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end();
+           K != KE; ++K) {
+        const Value *V = K->getPointer();
+        bool ThisMayAlias = K->getInt();
          if (ThisMayAlias)
            MayAlias = true;
  
@@ -875,7 +891,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
          MapVector<const Value *, SUnit *>::iterator IE =
            ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
          if (I != IE) {
-          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+          addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                             0, true);
            I->second = SU;
          } else {
            if (ThisMayAlias)
@@ -890,7 +907,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
            ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
          if (J != JE) {
            for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+            addChainDependency(AAForDep, MFI, SU, J->second[i], RejectMemNodes,
                                 TrueMemOrderLatency, true);
            J->second.clear();
          }
@@ -899,11 +916,11 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
          // Add dependencies from all the PendingLoads, i.e. loads
          // with no underlying object.
          for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-          addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                               TrueMemOrderLatency);
          // Add dependence on alias chain, if needed.
          if (AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
          // But we also should check dependent instructions for the
          // SU in question.
          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -925,7 +942,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
        if (MI->isInvariantLoad(AA)) {
          // Invariant load, no chain dependencies needed!
        } else {
-        SmallVector<std::pair<const Value *, bool>, 4> Objs;
+        UnderlyingObjectsVector Objs;
          getUnderlyingObjectsForInstr(MI, MFI, Objs);
  
          if (Objs.empty()) {
@@ -933,7 +950,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
            // potentially aliasing stores.
            for (MapVector<const Value *, SUnit *>::iterator I =
                   AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
  
            PendingLoads.push_back(SU);
            MayAlias = true;
@@ -941,10 +958,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
            MayAlias = false;
          }
  
-        for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
+        for (UnderlyingObjectsVector::iterator
               J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
-          const Value *V = J->first;
-          bool ThisMayAlias = J->second;
+          const Value *V = J->getPointer();
+          bool ThisMayAlias = J->getInt();
  
            if (ThisMayAlias)
              MayAlias = true;
@@ -955,7 +972,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
            MapVector<const Value *, SUnit *>::iterator IE =
              ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
            if (I != IE)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                               0, true);
            if (ThisMayAlias)
              AliasMemUses[V].push_back(SU);
            else
@@ -965,7 +983,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
            adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
          // Add dependencies on alias and barrier chains, if needed.
          if (MayAlias && AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
          if (BarrierChain)
            BarrierChain->addPred(SDep(SU, SDep::Barrier));
        }
@@ -994,7 +1012,7 @@ std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
    else if (SU == &ExitSU)
      oss << "<exit>";
    else
-    SU->getInstr()->print(oss);
+    SU->getInstr()->print(oss, &TM, /*SkipOpers=*/true);
    return oss.str();
  }
  
@@ -1018,38 +1036,95 @@ class SchedDFSImpl {
    /// List PredSU, SuccSU pairs that represent data edges between subtrees.
    std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
  
+  struct RootData {
+    unsigned NodeID;
+    unsigned ParentNodeID;  // Parent node (member of the parent subtree).
+    unsigned SubInstrCount; // Instr count in this tree only, not children.
+
+    RootData(unsigned id): NodeID(id),
+                           ParentNodeID(SchedDFSResult::InvalidSubtreeID),
+                           SubInstrCount(0) {}
+
+    unsigned getSparseSetIndex() const { return NodeID; }
+  };
+
+  SparseSet<RootData> RootSet;
+
  public:
-  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSData.size()) {}
+  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
+    RootSet.setUniverse(R.DFSNodeData.size());
+  }
  
-  /// SubtreID is initialized to zero, set to itself to flag the root of a
-  /// subtree, set to the parent to indicate an interior node,
-  /// then set to a representative subtree ID during finalization.
+  /// Return true if this node has been visited by the DFS traversal.
+  ///
+  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
+  /// ID. Later, SubtreeID is updated but remains valid.
    bool isVisited(const SUnit *SU) const {
-    return R.DFSData[SU->NodeNum].SubtreeID;
+    return R.DFSNodeData[SU->NodeNum].SubtreeID
+      != SchedDFSResult::InvalidSubtreeID;
    }
  
    /// Initialize this node's instruction count. We don't need to flag the node
    /// visited until visitPostorder because the DAG cannot have cycles.
    void visitPreorder(const SUnit *SU) {
-    R.DFSData[SU->NodeNum].InstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+    R.DFSNodeData[SU->NodeNum].InstrCount =
+      SU->getInstr()->isTransient() ? 0 : 1;
    }
  
-  /// Mark this node as either the root of a subtree or an interior
-  /// node. Increment the parent node's instruction count.
-  void visitPostorder(const SUnit *SU, const SDep *PredDep, const SUnit *Parent) {
-    R.DFSData[SU->NodeNum].SubtreeID = SU->NodeNum;
-
-    if (!Parent)
-      return;
-    assert(PredDep && "PredDep required for non-root node");
-
-    joinPredSubtree(*PredDep, Parent);
+  /// Called once for each node after all predecessors are visited. Revisit this
+  /// node's predecessors and potentially join them now that we know the ILP of
+  /// the other predecessors.
+  void visitPostorderNode(const SUnit *SU) {
+    // Mark this node as the root of a subtree. It may be joined with its
+    // successors later.
+    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
+    RootData RData(SU->NodeNum);
+    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+
+    // If any predecessors are still in their own subtree, they either cannot be
+    // joined or are large enough to remain separate. If this parent node's
+    // total instruction count is not greater than a child subtree by at least
+    // the subtree limit, then try to join it now since splitting subtrees is
+    // only useful if multiple high-pressure paths are possible.
+    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
+    for (SUnit::const_pred_iterator
+           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+      if (PI->getKind() != SDep::Data)
+        continue;
+      unsigned PredNum = PI->getSUnit()->NodeNum;
+      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
+        joinPredSubtree(*PI, SU, /*CheckLimit=*/false);
+
+      // Either link or merge the RootData entry from the child to the parent.
+      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
+        // If the predecessor's parent is invalid, this is a tree edge and the
+        // current node is the parent.
+        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
+          RootSet[PredNum].ParentNodeID = SU->NodeNum;
+      }
+      else if (RootSet.count(PredNum)) {
+        // The predecessor is not a root, but is still in the root set. This
+        // must be the new parent that it was just joined to. Note that
+        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
+        // set to the original parent.
+        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
+        RootSet.erase(PredNum);
+      }
+    }
+    RootSet[SU->NodeNum] = RData;
    }
  
-  /// Determine whether the DFS cross edge should be considered a subtree edge
-  /// or a connection between subtrees.
-  void visitCross(const SDep &PredDep, const SUnit *Succ) {
+  /// Called once for each tree edge after calling visitPostorderNode on the
+  /// predecessor. Increment the parent node's instruction count and
+  /// preemptively join this subtree to its parent's if it is small enough.
+  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
+    R.DFSNodeData[Succ->NodeNum].InstrCount
+      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
      joinPredSubtree(PredDep, Succ);
+  }
+
+  /// Add a connection for cross edges.
+  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
      ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
    }
  
@@ -1057,13 +1132,27 @@ public:
    /// between trees.
    void finalize() {
      SubtreeClasses.compress();
+    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
+    assert(SubtreeClasses.getNumClasses() == RootSet.size()
+           && "number of roots should match trees");
+    for (SparseSet<RootData>::const_iterator
+           RI = RootSet.begin(), RE = RootSet.end(); RI != RE; ++RI) {
+      unsigned TreeID = SubtreeClasses[RI->NodeID];
+      if (RI->ParentNodeID != SchedDFSResult::InvalidSubtreeID)
+        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[RI->ParentNodeID];
+      R.DFSTreeData[TreeID].SubInstrCount = RI->SubInstrCount;
+      // Note that SubInstrCount may be greater than InstrCount if we joined
+      // subtrees across a cross edge. InstrCount will be attributed to the
+      // original parent, while SubInstrCount will be attributed to the joined
+      // parent.
+    }
      R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
      R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
      DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
-    for (unsigned Idx = 0, End = R.DFSData.size(); Idx != End; ++Idx) {
-      R.DFSData[Idx].SubtreeID = SubtreeClasses[Idx];
+    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
+      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
        DEBUG(dbgs() << "  SU(" << Idx << ") in tree "
-            << R.DFSData[Idx].SubtreeID << '\n');
+            << R.DFSNodeData[Idx].SubtreeID << '\n');
      }
      for (std::vector<std::pair<const SUnit*, const SUnit*> >::const_iterator
             I = ConnectionPairs.begin(), E = ConnectionPairs.end();
@@ -1079,32 +1168,33 @@ public:
    }
  
  protected:
-  void joinPredSubtree(const SDep &PredDep, const SUnit *Succ) {
-    // Join the child to its parent if they are connected via data dependence.
-    if (PredDep.getKind() != SDep::Data)
-      return;
+  /// Join the predecessor subtree with the successor that is its DFS
+  /// parent. Apply some heuristics before joining.
+  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
+                       bool CheckLimit = true) {
+    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
+
+    // Check if the predecessor is already joined.
+    const SUnit *PredSU = PredDep.getSUnit();
+    unsigned PredNum = PredSU->NodeNum;
+    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
+      return false;
  
      // Four is the magic number of successors before a node is considered a
      // pinch point.
      unsigned NumDataSucs = 0;
-    const SUnit *PredSU = PredDep.getSUnit();
      for (SUnit::const_succ_iterator SI = PredSU->Succs.begin(),
             SE = PredSU->Succs.end(); SI != SE; ++SI) {
        if (SI->getKind() == SDep::Data) {
          if (++NumDataSucs >= 4)
-          return;
+          return false;
        }
      }
-    // If this is a cross edge to a root, join the subtrees. This happens when
-    // the root was first reached by a non-data dependence.
-    unsigned NodeNum = PredSU->NodeNum;
-    unsigned PredCnt = R.DFSData[NodeNum].InstrCount;
-    if (R.DFSData[NodeNum].SubtreeID == NodeNum && PredCnt < R.SubtreeLimit) {
-      R.DFSData[NodeNum].SubtreeID = Succ->NodeNum;
-      R.DFSData[Succ->NodeNum].InstrCount += PredCnt;
-      SubtreeClasses.join(Succ->NodeNum, NodeNum);
-      return;
-    }
+    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
+      return false;
+    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
+    SubtreeClasses.join(Succ->NodeNum, PredNum);
+    return true;
    }
  
    /// Called by finalize() to record a connection between trees.
@@ -1112,16 +1202,19 @@ protected:
      if (!Depth)
        return;
  
-    SmallVectorImpl<SchedDFSResult::Connection> &Connections =
-      R.SubtreeConnections[FromTree];
-    for (SmallVectorImpl<SchedDFSResult::Connection>::iterator
-           I = Connections.begin(), E = Connections.end(); I != E; ++I) {
-      if (I->TreeID == ToTree) {
-        I->Level = std::max(I->Level, Depth);
-        return;
+    do {
+      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
+        R.SubtreeConnections[FromTree];
+      for (SmallVectorImpl<SchedDFSResult::Connection>::iterator
+             I = Connections.begin(), E = Connections.end(); I != E; ++I) {
+        if (I->TreeID == ToTree) {
+          I->Level = std::max(I->Level, Depth);
+          return;
+        }
        }
-    }
-    Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
+    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
    }
  };
  } // namespace llvm
@@ -1153,28 +1246,44 @@ public:
  };
  } // anonymous
  
+static bool hasDataSucc(const SUnit *SU) {
+  for (SUnit::const_succ_iterator
+         SI = SU->Succs.begin(), SE = SU->Succs.end(); SI != SE; ++SI) {
+    if (SI->getKind() == SDep::Data && !SI->getSUnit()->isBoundaryNode())
+      return true;
+  }
+  return false;
+}
+
  /// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
  /// search from this root.
-void SchedDFSResult::compute(ArrayRef<SUnit *> Roots) {
+void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
    if (!IsBottomUp)
      llvm_unreachable("Top-down ILP metric is unimplemnted");
  
    SchedDFSImpl Impl(*this);
-  for (ArrayRef<const SUnit*>::const_iterator
-         RootI = Roots.begin(), RootE = Roots.end(); RootI != RootE; ++RootI) {
+  for (ArrayRef<SUnit>::const_iterator
+         SI = SUnits.begin(), SE = SUnits.end(); SI != SE; ++SI) {
+    const SUnit *SU = &*SI;
+    if (Impl.isVisited(SU) || hasDataSucc(SU))
+      continue;
+
      SchedDAGReverseDFS DFS;
-    Impl.visitPreorder(*RootI);
-    DFS.follow(*RootI);
+    Impl.visitPreorder(SU);
+    DFS.follow(SU);
      for (;;) {
        // Traverse the leftmost path as far as possible.
        while (DFS.getPred() != DFS.getPredEnd()) {
          const SDep &PredDep = *DFS.getPred();
          DFS.advance();
-        // If the pred is already valid, skip it. We may preorder visit a node
-        // with InstrCount==0 more than once, but it won't affect heuristics
-        // because we don't care about cross edges to leaf copies.
+        // Ignore non-data edges and edges to boundary nodes.
+        if (PredDep.getKind() != SDep::Data
+            || PredDep.getSUnit()->isBoundaryNode()) {
+          continue;
+        }
+        // An edge to a visited node is a cross edge, assuming an acyclic DAG.
          if (Impl.isVisited(PredDep.getSUnit())) {
-          Impl.visitCross(PredDep, DFS.getCurr());
+          Impl.visitCrossEdge(PredDep, DFS.getCurr());
            continue;
          }
          Impl.visitPreorder(PredDep.getSUnit());
@@ -1183,7 +1292,9 @@ void SchedDFSResult::compute(ArrayRef<SUnit *> Roots) {
        // Visit the top of the stack in postorder and backtrack.
        const SUnit *Child = DFS.getCurr();
        const SDep *PredDep = DFS.backtrack();
-      Impl.visitPostorder(Child, PredDep, PredDep ? DFS.getCurr() : 0);
+      Impl.visitPostorderNode(Child);
+      if (PredDep)
+        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
        if (DFS.isComplete())
          break;
      }