Add initial support for back-scheduling address computations,

[oota-llvm.git] / lib / CodeGen / ScheduleDAGInstrs.cpp
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp

index cfa639e940b42359533c20697ee310483cb42d7b..c3209006b971827058384018ff15171e22967976 100644 (file)
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -15,6 +15,7 @@
  #define DEBUG_TYPE "sched-instrs"
  #include "llvm/CodeGen/MachineDominators.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/ScheduleDAGInstrs.h"
  #include "llvm/CodeGen/PseudoSourceValue.h"
@@ -29,6 +30,59 @@
  #include <map>
  using namespace llvm;
  
+namespace {
+  class VISIBILITY_HIDDEN LoopDependencies {
+    const MachineLoopInfo &MLI;
+    const MachineDominatorTree &MDT;
+
+  public:
+    typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
+      LoopDeps;
+    LoopDeps Deps;
+
+    LoopDependencies(const MachineLoopInfo &mli,
+                     const MachineDominatorTree &mdt) :
+      MLI(mli), MDT(mdt) {}
+
+    void VisitLoop(const MachineLoop *Loop) {
+      Deps.clear();
+      MachineBasicBlock *Header = Loop->getHeader();
+      SmallSet<unsigned, 8> LoopLiveIns;
+      for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
+           LE = Header->livein_end(); LI != LE; ++LI)
+        LoopLiveIns.insert(*LI);
+
+      VisitRegion(MDT.getNode(Header), Loop, LoopLiveIns);
+    }
+
+  private:
+    void VisitRegion(const MachineDomTreeNode *Node,
+                     const MachineLoop *Loop,
+                     const SmallSet<unsigned, 8> &LoopLiveIns) {
+      MachineBasicBlock *MBB = Node->getBlock();
+      if (!Loop->contains(MBB)) return;
+
+      unsigned Count = 0;
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I, ++Count) {
+        const MachineInstr *MI = I;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isUse())
+            continue;
+          unsigned MOReg = MO.getReg();
+          if (LoopLiveIns.count(MOReg))
+            Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
+        }
+      }
+
+      const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+      for (unsigned I = 0, E = Children.size(); I != E; ++I)
+        VisitRegion(Children[I], Loop, LoopLiveIns);
+    }
+  };
+}
+
  ScheduleDAGInstrs::ScheduleDAGInstrs(MachineBasicBlock *bb,
                                       const TargetMachine &tm,
                                       const MachineLoopInfo &mli,
@@ -65,9 +119,27 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
    // all the work of the block is done before the terminator.
    SUnit *Terminator = 0;
  
+  LoopDependencies LoopRegs(MLI, MDT);
+
+  // Track which regs are live into a loop, to help guide back-edge-aware
+  // scheduling.
+  SmallSet<unsigned, 8> LoopLiveInRegs;
+  if (MachineLoop *ML = MLI.getLoopFor(BB))
+    if (BB == ML->getLoopLatch()) {
+      MachineBasicBlock *Header = ML->getHeader();
+      for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
+           E = Header->livein_end(); I != E; ++I)
+        LoopLiveInRegs.insert(*I);
+      LoopRegs.VisitLoop(ML);
+    }
+
    // Check to see if the scheduler cares about latencies.
    bool UnitLatencies = ForceUnitLatencies();
  
+  // Ask the target if address-backscheduling is desirable, and if so how much.
+  unsigned SpecialAddressLatency =
+    TM.getSubtarget<TargetSubtarget>().getSpecialAddressLatency();
+
    for (MachineBasicBlock::iterator MII = BB->end(), MIE = BB->begin();
         MII != MIE; --MII) {
      MachineInstr *MI = prior(MII);
@@ -118,7 +190,21 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
          for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
            SUnit *UseSU = UseList[i];
            if (UseSU != SU) {
-            UseSU->addPred(SDep(SU, SDep::Data, DataLatency, Reg));
+            unsigned LDataLatency = DataLatency;
+            // Optionally add in a special extra latency for nodes that
+            // feed addresses.
+            // TODO: Do this for register aliases too.
+            if (SpecialAddressLatency != 0 && !UnitLatencies) {
+              MachineInstr *UseMI = UseSU->getInstr();
+              const TargetInstrDesc &UseTID = UseMI->getDesc();
+              int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
+              assert(RegUseIndex >= 0 && "UseMI doesn's use register!");
+              if ((UseTID.mayLoad() || UseTID.mayStore()) &&
+                  (unsigned)RegUseIndex < UseTID.getNumOperands() &&
+                  UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
+                LDataLatency += SpecialAddressLatency;
+            }
+            UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg));
            }
          }
          for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
@@ -130,6 +216,51 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
            }
          }
  
+        // If a def is going to wrap back around to the top of the loop,
+        // backschedule it.
+        // TODO: Blocks in loops without terminators can benefit too.
+        if (!UnitLatencies && Terminator && DefList.empty()) {
+          LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
+          if (I != LoopRegs.Deps.end()) {
+            const MachineOperand *UseMO = I->second.first;
+            unsigned Count = I->second.second;
+            const MachineInstr *UseMI = UseMO->getParent();
+            unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
+            const TargetInstrDesc &UseTID = UseMI->getDesc();
+            // TODO: If we knew the total depth of the region here, we could
+            // handle the case where the whole loop is inside the region but
+            // is large enough that the isScheduleHigh trick isn't needed.
+            if (UseMOIdx < UseTID.getNumOperands()) {
+              // Currently, we only support scheduling regions consisting of
+              // single basic blocks. Check to see if the instruction is in
+              // the same region by checking to see if it has the same parent.
+              if (UseMI->getParent() != MI->getParent()) {
+                unsigned Latency = SU->Latency;
+                if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
+                  Latency += SpecialAddressLatency;
+                // This is a wild guess as to the portion of the latency which
+                // will be overlapped by work done outside the current
+                // scheduling region.
+                Latency -= std::min(Latency, Count);
+                // Add the artifical edge.
+                Terminator->addPred(SDep(SU, SDep::Order, Latency,
+                                         /*Reg=*/0, /*isNormalMemory=*/false,
+                                         /*isMustAlias=*/false,
+                                         /*isArtificial=*/true));
+              } else if (SpecialAddressLatency > 0 &&
+                         UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
+                // The entire loop body is within the current scheduling region
+                // and the latency of this operation is assumed to be greater
+                // than the latency of the loop.
+                // TODO: Recursively mark data-edge predecessors as
+                //       isScheduleHigh too.
+                SU->isScheduleHigh = true;
+              }
+            }
+            LoopRegs.Deps.erase(I);
+          }
+        }
+
          UseList.clear();
          if (!MO.isDead())
            DefList.clear();