Add a hybrid bottom up scheduler that reduce register usage while avoiding
authorEvan Cheng <evan.cheng@apple.com>
Thu, 20 May 2010 06:13:19 +0000 (06:13 +0000)
committerEvan Cheng <evan.cheng@apple.com>
Thu, 20 May 2010 06:13:19 +0000 (06:13 +0000)
pipeline stall. It's useful for targets like ARM cortex-a8. NEON has a lot
of long latency instructions so a strict register pressure reduction
scheduler does not work well.
Early experiments show this speeds up some NEON loops by over 30%.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@104216 91177308-0d34-0410-b5e6-96231b3b80d8

include/llvm/CodeGen/LinkAllCodegenComponents.h
include/llvm/CodeGen/ScheduleDAG.h
include/llvm/CodeGen/SchedulerRegistry.h
include/llvm/Target/TargetMachine.h
lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp

index 0064776ca29493a9fedc4b7b5653bd99cbd0a5f7..b4c2f2fe13c15f87b5fcb26dd553c212cdd3eff4 100644 (file)
@@ -46,6 +46,7 @@ namespace {
       (void) llvm::createBURRListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
       (void) llvm::createTDRRListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
       (void) llvm::createSourceListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
+      (void) llvm::createHybridListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
       (void) llvm::createTDListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
       (void) llvm::createFastDAGScheduler(NULL, llvm::CodeGenOpt::Default);
       (void) llvm::createDefaultScheduler(NULL, llvm::CodeGenOpt::Default);
index 5a0109ac86e13ffa0b04859420c55358a86706dd..3c59a5ac11edc980b99167786a5f2205d2012e3d 100644 (file)
@@ -409,7 +409,9 @@ namespace llvm {
   /// implementation to decide.
   /// 
   class SchedulingPriorityQueue {
+    unsigned CurCycle;
   public:
+    SchedulingPriorityQueue() : CurCycle(0) {}
     virtual ~SchedulingPriorityQueue() {}
   
     virtual void initNodes(std::vector<SUnit> &SUnits) = 0;
@@ -433,6 +435,14 @@ namespace llvm {
     virtual void ScheduledNode(SUnit *) {}
 
     virtual void UnscheduledNode(SUnit *) {}
+
+    void setCurCycle(unsigned Cycle) {
+      CurCycle = Cycle;
+    }
+
+    unsigned getCurCycle() const {
+      return CurCycle;
+    }    
   };
 
   class ScheduleDAG {
index cf3274f4a961abe84e7b622c3258b3d4b86739bd..14c33e2187552d4391f302e873d5d25a10be224a 100644 (file)
@@ -73,11 +73,17 @@ ScheduleDAGSDNodes *createBURRListDAGScheduler(SelectionDAGISel *IS,
 ScheduleDAGSDNodes *createTDRRListDAGScheduler(SelectionDAGISel *IS,
                                                CodeGenOpt::Level OptLevel);
 
-/// createBURRListDAGScheduler - This creates a bottom up register usage
-/// reduction list scheduler that schedules in source code order when possible.
+/// createBURRListDAGScheduler - This creates a bottom up list scheduler that
+/// schedules nodes in source code order when possible.
 ScheduleDAGSDNodes *createSourceListDAGScheduler(SelectionDAGISel *IS,
                                                  CodeGenOpt::Level OptLevel);
 
+/// createHybridListDAGScheduler - This creates a bottom up hybrid register
+/// usage reduction list scheduler that make use of latency information to
+/// avoid stalls for long latency instructions.
+ScheduleDAGSDNodes *createHybridListDAGScheduler(SelectionDAGISel *IS,
+                                                 CodeGenOpt::Level);
+
 /// createTDListDAGScheduler - This creates a top-down list scheduler with
 /// a hazard recognizer.
 ScheduleDAGSDNodes *createTDListDAGScheduler(SelectionDAGISel *IS,
index aa7a30a759895ba58a9047a6cbeebef921fc55c7..a89288d1a8f23fbef8117794fb06311b957c472a 100644 (file)
@@ -73,7 +73,8 @@ namespace CodeGenOpt {
 namespace Sched {
   enum Preference {
     Latency,          // Scheduling for shortest total latency.
-    RegPressure       // Scheduling for lowest register pressure.
+    RegPressure,      // Scheduling for lowest register pressure.
+    Hybrid            // Scheduling for both latency and register pressure.
   };
 }
 
index da028500bd0a10c93944d30f21c828dedf7c3555..98dd34fe163929412663c7e2582a1d441e282651 100644 (file)
@@ -53,6 +53,12 @@ static RegisterScheduler
                          "order when possible",
                          createSourceListDAGScheduler);
 
+static RegisterScheduler
+  hybridListDAGScheduler("hybrid",
+                         "Bottom-up rr list scheduling which avoid stalls for "
+                         "long latency instructions",
+                         createHybridListDAGScheduler);
+
 namespace {
 //===----------------------------------------------------------------------===//
 /// ScheduleDAGRRList - The actual register reduction list scheduler
@@ -64,6 +70,10 @@ private:
   /// it is top-down.
   bool isBottomUp;
 
+  /// NeedLatency - True if the scheduler will make use of latency information.
+  ///
+  bool NeedLatency;
+
   /// AvailableQueue - The priority queue to use for the available SUnits.
   SchedulingPriorityQueue *AvailableQueue;
 
@@ -80,9 +90,9 @@ private:
 
 public:
   ScheduleDAGRRList(MachineFunction &mf,
-                    bool isbottomup,
+                    bool isbottomup, bool needlatency,
                     SchedulingPriorityQueue *availqueue)
-    : ScheduleDAGSDNodes(mf), isBottomUp(isbottomup),
+    : ScheduleDAGSDNodes(mf), isBottomUp(isbottomup), NeedLatency(needlatency),
       AvailableQueue(availqueue), Topo(SUnits) {
     }
 
@@ -161,9 +171,11 @@ private:
     return NewNode;
   }
 
-  /// ForceUnitLatencies - Return true, since register-pressure-reducing
-  /// scheduling doesn't need actual latency information.
-  bool ForceUnitLatencies() const { return true; }
+  /// ForceUnitLatencies - Register-pressure-reducing scheduling doesn't
+  /// need actual latency information but the hybrid scheduler does.
+  bool ForceUnitLatencies() const {
+    return !NeedLatency;
+  }
 };
 }  // end anonymous namespace
 
@@ -213,6 +225,12 @@ void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
 #endif
   --PredSU->NumSuccsLeft;
 
+  if (!ForceUnitLatencies()) {
+    // Updating predecessor's height. This is now the cycle when the
+    // predecessor can be scheduled without causing a pipeline stall.
+    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
+  }
+
   // If all the node's successors are scheduled, this node is ready
   // to be scheduled. Ignore the special EntrySU node.
   if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
@@ -244,10 +262,15 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
 /// count of its predecessors. If a predecessor pending count is zero, add it to
 /// the Available queue.
 void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
-  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
   DEBUG(SU->dump(this));
 
-  assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+#ifndef NDEBUG
+  if (CurCycle < SU->getHeight())
+    DEBUG(dbgs() << "   Height [" << SU->getHeight() << "] pipeline stall!\n");
+#endif
+
+  // FIXME: Handle noop hazard.
   SU->setHeightToAtLeast(CurCycle);
   Sequence.push_back(SU);
 
@@ -339,6 +362,7 @@ void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, unsigned BtCycle,
       SU->isAvailable = false;
     UnscheduleNodeBottomUp(OldSU);
     --CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
   }
 
   assert(!SU->isSucc(OldSU) && "Something is wrong!");
@@ -386,7 +410,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
     if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
       return NULL;
 
-    DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+    DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
     assert(NewNodes.size() == 2 && "Expected a load folding node!");
 
     N = NewNodes[1];
@@ -504,7 +528,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
     SU = NewSU;
   }
 
-  DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+  DEBUG(dbgs() << "    Duplicating SU #" << SU->NodeNum << "\n");
   NewSU = CreateClone(SU);
 
   // New SUnit has the exact same predecessors.
@@ -786,7 +810,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
           // Issue copies, these can be expensive cross register class copies.
           SmallVector<SUnit*, 2> Copies;
           InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
-          DEBUG(dbgs() << "Adding an edge from SU #" << TrySU->NodeNum
+          DEBUG(dbgs() << "    Adding an edge from SU #" << TrySU->NodeNum
                        << " to SU #" << Copies.front()->NodeNum << "\n");
           AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
                               /*Reg=*/0, /*isNormalMemory=*/false,
@@ -795,7 +819,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
           NewDef = Copies.back();
         }
 
-        DEBUG(dbgs() << "Adding an edge from SU #" << NewDef->NodeNum
+        DEBUG(dbgs() << "    Adding an edge from SU #" << NewDef->NodeNum
                      << " to SU #" << TrySU->NodeNum << "\n");
         LiveRegDefs[Reg] = NewDef;
         AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
@@ -821,6 +845,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
     if (CurSU)
       ScheduleNodeBottomUp(CurSU, CurCycle);
     ++CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
   }
 
   // Reverse the order if it is bottom up.
@@ -889,6 +914,7 @@ void ScheduleDAGRRList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
 /// schedulers.
 void ScheduleDAGRRList::ListScheduleTopDown() {
   unsigned CurCycle = 0;
+  AvailableQueue->setCurCycle(CurCycle);
 
   // Release any successors of the special Entry node.
   ReleaseSuccessors(&EntrySU);
@@ -911,6 +937,7 @@ void ScheduleDAGRRList::ListScheduleTopDown() {
     if (CurSU)
       ScheduleNodeTopDown(CurSU, CurCycle);
     ++CurCycle;
+    AvailableQueue->setCurCycle(CurCycle);
   }
   
 #ifndef NDEBUG
@@ -956,6 +983,16 @@ namespace {
     
     bool operator()(const SUnit* left, const SUnit* right) const;
   };
+
+  struct hybrid_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+    RegReductionPriorityQueue<hybrid_ls_rr_sort> *SPQ;
+    hybrid_ls_rr_sort(RegReductionPriorityQueue<hybrid_ls_rr_sort> *spq)
+      : SPQ(spq) {}
+    hybrid_ls_rr_sort(const hybrid_ls_rr_sort &RHS)
+      : SPQ(RHS.SPQ) {}
+    
+    bool operator()(const SUnit* left, const SUnit* right) const;
+  };
 }  // end anonymous namespace
 
 /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
@@ -991,7 +1028,7 @@ namespace {
   template<class SF>
   class RegReductionPriorityQueue : public SchedulingPriorityQueue {
     PriorityQueue<SUnit*, std::vector<SUnit*>, SF> Queue;
-    unsigned currentQueueId;
+    unsigned CurQueueId;
 
   protected:
     // SUnits - The SUnits for the current graph.
@@ -1007,7 +1044,7 @@ namespace {
   public:
     RegReductionPriorityQueue(const TargetInstrInfo *tii,
                               const TargetRegisterInfo *tri)
-      : Queue(SF(this)), currentQueueId(0),
+      : Queue(SF(this)), CurQueueId(0),
         TII(tii), TRI(tri), scheduleDAG(NULL) {}
     
     void initNodes(std::vector<SUnit> &sunits) {
@@ -1067,14 +1104,14 @@ namespace {
     unsigned getNodeOrdering(const SUnit *SU) const {
       return scheduleDAG->DAG->GetOrdering(SU->getNode());
     }
-    
+
     unsigned size() const { return Queue.size(); }
 
     bool empty() const { return Queue.empty(); }
     
     void push(SUnit *U) {
       assert(!U->NodeQueueId && "Node in the queue already");
-      U->NodeQueueId = ++currentQueueId;
+      U->NodeQueueId = ++CurQueueId;
       Queue.push(U);
     }
 
@@ -1117,6 +1154,9 @@ namespace {
 
   typedef RegReductionPriorityQueue<src_ls_rr_sort>
     SrcRegReductionPriorityQueue;
+
+  typedef RegReductionPriorityQueue<hybrid_ls_rr_sort>
+    HybridBURRPriorityQueue;
 }
 
 /// closestSucc - Returns the scheduled cycle of the successor which is
@@ -1203,7 +1243,7 @@ bool bu_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
 }
 
 // Source order, otherwise bottom up.
-bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
+bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
   unsigned LOrder = SPQ->getNodeOrdering(left);
   unsigned ROrder = SPQ->getNodeOrdering(right);
 
@@ -1215,6 +1255,23 @@ bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
   return BURRSort(left, right, SPQ);
 }
 
+bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
+  bool LStall = SPQ->getCurCycle() < left->getHeight();
+  bool RStall = SPQ->getCurCycle() < right->getHeight();
+  // If scheduling one of the node will cause a pipeline stall, delay it.
+  // If scheduling either one of the node will cause a pipeline stall, sort them
+  // according to their height.
+  // If neither will cause a pipeline stall, try to reduce register pressure.
+  if (LStall) {
+    if (!RStall)
+      return true;
+    if (left->getHeight() != right->getHeight())
+      return left->getHeight() > right->getHeight();
+  } else if (RStall)
+      return false;
+  return BURRSort(left, right, SPQ);
+}
+
 template<class SF>
 bool
 RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
@@ -1379,8 +1436,8 @@ void RegReductionPriorityQueue<SF>::PrescheduleNodesWithMultipleUses() {
 
     // Ok, the transformation is safe and the heuristics suggest it is
     // profitable. Update the graph.
-    DEBUG(dbgs() << "Prescheduling SU # " << SU->NodeNum
-                 << " next to PredSU # " << PredSU->NodeNum
+    DEBUG(dbgs() << "    Prescheduling SU #" << SU->NodeNum
+                 << " next to PredSU #" << PredSU->NodeNum
                  << " to guide scheduling in the presence of multiple uses\n");
     for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
       SDep Edge = PredSU->Succs[i];
@@ -1469,7 +1526,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
              (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
              (!SU->isCommutable && SuccSU->isCommutable)) &&
             !scheduleDAG->IsReachable(SuccSU, SU)) {
-          DEBUG(dbgs() << "Adding a pseudo-two-addr edge from SU # "
+          DEBUG(dbgs() << "    Adding a pseudo-two-addr edge from SU #"
                        << SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
           scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
                                         /*Reg=*/0, /*isNormalMemory=*/false,
@@ -1563,8 +1620,7 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   
   BURegReductionPriorityQueue *PQ = new BURegReductionPriorityQueue(TII, TRI);
 
-  ScheduleDAGRRList *SD =
-    new ScheduleDAGRRList(*IS->MF, true, PQ);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
   PQ->setScheduleDAG(SD);
   return SD;  
 }
@@ -1577,8 +1633,7 @@ llvm::createTDRRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   
   TDRegReductionPriorityQueue *PQ = new TDRegReductionPriorityQueue(TII, TRI);
 
-  ScheduleDAGRRList *SD =
-    new ScheduleDAGRRList(*IS->MF, false, PQ);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, false, PQ);
   PQ->setScheduleDAG(SD);
   return SD;
 }
@@ -1591,8 +1646,20 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   
   SrcRegReductionPriorityQueue *PQ = new SrcRegReductionPriorityQueue(TII, TRI);
 
-  ScheduleDAGRRList *SD =
-    new ScheduleDAGRRList(*IS->MF, true, PQ);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;  
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  
+  HybridBURRPriorityQueue *PQ = new HybridBURRPriorityQueue(TII, TRI);
+
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
   PQ->setScheduleDAG(SD);
   return SD;  
 }
index 68d0c804d8266c1e6b4844988e154c1879738ee8..f765409d9e2f26c2e53a3f32b0ce3cb0ebedc897 100644 (file)
@@ -347,7 +347,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
         const SDep& dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                                OpSU->Latency, PhysReg);
         if (!isChain && !UnitLatencies) {
-          ComputeOperandLatency(OpSU, SU, const_cast<SDep &>(dep));
+          ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
           ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
         }
 
@@ -378,6 +378,10 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
   }
 
   const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  if (InstrItins.isEmpty()) {
+    SU->Latency = 1;
+    return;
+  }
   
   // Compute the latency for the node.  We use the sum of the latencies for
   // all nodes flagged together into this SUnit.
@@ -389,6 +393,37 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
     }
 }
 
+void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
+                                               unsigned OpIdx, SDep& dep) const{
+  // Check to see if the scheduler cares about latencies.
+  if (ForceUnitLatencies())
+    return;
+
+  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
+  if (InstrItins.isEmpty())
+    return;
+  
+  if (dep.getKind() != SDep::Data)
+    return;
+
+  unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
+  if (Def->isMachineOpcode() && Use->isMachineOpcode()) {
+    const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
+    if (DefIdx >= II.getNumDefs())
+      return;
+    int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx);
+    if (DefCycle < 0)
+      return;
+    const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
+    int UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx);
+    if (UseCycle >= 0) {
+      int Latency = DefCycle - UseCycle + 1;
+      if (Latency >= 0)
+        dep.setLatency(Latency);
+    }
+  }
+}
+
 void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
   if (!SU->getNode()) {
     dbgs() << "PHYS REG COPY\n";
index 7ae8ec236fbf1cfeb9cd9903b0d2e85b93346e9c..42c58c105b941e9d85219ba4f3c161924a3519f7 100644 (file)
@@ -94,6 +94,15 @@ namespace llvm {
     ///
     virtual void ComputeLatency(SUnit *SU);
 
+    /// ComputeOperandLatency - Override dependence edge latency using
+    /// operand use/def information
+    ///
+    virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use,
+                                       SDep& dep) const { }
+
+    virtual void ComputeOperandLatency(SDNode *Def, SDNode *Use,
+                                       unsigned OpIdx, SDep& dep) const;
+
     virtual MachineBasicBlock *EmitSchedule();
 
     /// Schedule - Order nodes according to selected style, filling
index 76e09e79f9236bcc4e58fd67e5852c44d0d59ed0..6d789a674cf6b6ffd4b0374c4548fbbf149c40e2 100644 (file)
@@ -134,9 +134,11 @@ namespace llvm {
       return createFastDAGScheduler(IS, OptLevel);
     if (TLI.getSchedulingPreference() == Sched::Latency)
       return createTDListDAGScheduler(IS, OptLevel);
-    assert(TLI.getSchedulingPreference() == Sched::RegPressure &&
+    if (TLI.getSchedulingPreference() == Sched::RegPressure)
+      return createBURRListDAGScheduler(IS, OptLevel);
+    assert(TLI.getSchedulingPreference() == Sched::Hybrid &&
            "Unknown sched type!");
-    return createBURRListDAGScheduler(IS, OptLevel);
+    return createHybridListDAGScheduler(IS, OptLevel);
   }
 }