Avoiding overly aggressive latency scheduling. If the two nodes share an operand and one of them has a single use that is a live out copy, favor the one that is live out.

author Evan Cheng <evan.cheng@apple.com>

Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)
author Evan Cheng <evan.cheng@apple.com>
Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp

index 4c3e4e3b076801d44e08320e3d403dedb7933405..ea1aaa1e05c78b95fd9e2b8ecc2d059d37cb74bc 100644 (file)
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -190,7 +190,7 @@ private:
  void ScheduleDAGRRList::Schedule() {
    DEBUG(dbgs()
          << "********** List Scheduling BB#" << BB->getNumber()
-        << " **********\n");
+        << " '" << BB->getName() << "' **********\n");
  
    NumLiveRegs = 0;
    LiveRegDefs.resize(TRI->getNumRegs(), NULL);  
@@ -1483,6 +1483,46 @@ static unsigned calcMaxScratches(const SUnit *SU) {
    return Scratches;
  }
  
+/// hasOnlyLiveOutUses - Return true if SU has at least one value successor and
+/// every value successor is a CopyToReg to a virtual register. Such a def is
+/// probably only a liveout, so it should be scheduled closer to the terminator.
+static bool hasOnlyLiveOutUses(const SUnit *SU) {
+  bool RetVal = false;  // stays false when SU has no value successors
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // skip control (non-value) successors
+    const SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
+      unsigned Reg =
+        cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        RetVal = true;  // found a copy to a virtual register
+        continue;
+      }
+    }
+    return false;  // some value successor is not a copy to a virtual register
+  }
+  return RetVal;
+}
+
+/// UnitsSharePred - Return true if the two scheduling units share a common
+/// data (non-control) predecessor.
+static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
+  SmallSet<const SUnit*, 4> Preds;  // data predecessors of left
+  for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    Preds.insert(I->getSUnit());
+  }
+  for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    if (Preds.count(I->getSUnit()))
+      return true;  // right has a data pred that left also has
+  }
+  return false;
+}
+
  template <typename RRSort>
  static bool BURRSort(const SUnit *left, const SUnit *right,
                       const RegReductionPriorityQueue<RRSort> *SPQ) {
@@ -1558,29 +1598,46 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
    else if (!LHigh && RHigh)
      return false;
    else if (!LHigh && !RHigh) {
+    // If the two nodes share an operand and one of them has a single
+    // use that is a live out copy, favor the one that is live out. Otherwise
+    // it will be difficult to eliminate the copy if the instruction is a
+    // loop induction variable update. e.g.
+    // BB:
+    // sub r1, r3, #1
+    // str r0, [r2, r3]
+    // mov r3, r1
+    // cmp
+    // bne BB
+    bool SharePred = UnitsSharePred(left, right);
+    // FIXME: Only adjust if BB is a loop back edge.
+    // FIXME: What's the cost of a copy?
+    int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
+    int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
+    int LHeight = (int)left->getHeight() - LBonus;
+    int RHeight = (int)right->getHeight() - RBonus;
+
      // Low register pressure situation, schedule for latency if possible.
      bool LStall = left->SchedulingPref == Sched::Latency &&
-      SPQ->getCurCycle() < left->getHeight();
+      (int)SPQ->getCurCycle() < LHeight;
      bool RStall = right->SchedulingPref == Sched::Latency &&
-      SPQ->getCurCycle() < right->getHeight();
+      (int)SPQ->getCurCycle() < RHeight;
      // If scheduling one of the node will cause a pipeline stall, delay it.
      // If scheduling either one of the node will cause a pipeline stall, sort
      // them according to their height.
-    // If neither will cause a pipeline stall, try to reduce register pressure.
      if (LStall) {
        if (!RStall)
          return true;
-      if (left->getHeight() != right->getHeight())
-        return left->getHeight() > right->getHeight();
+      if (LHeight != RHeight)
+        return LHeight > RHeight;
      } else if (RStall)
        return false;
  
-    // If either node is scheduling for latency, sort them by height and latency
-    // first.
+    // If either node is scheduling for latency, sort them by height
+    // and latency.
      if (left->SchedulingPref == Sched::Latency ||
          right->SchedulingPref == Sched::Latency) {
-      if (left->getHeight() != right->getHeight())
-        return left->getHeight() > right->getHeight();
+      if (LHeight != RHeight)
+        return LHeight > RHeight;
        if (left->Latency != right->Latency)
          return left->Latency > right->Latency;
      }
@@ -1631,19 +1688,6 @@ RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) {
    return false;
  }
  
-/// hasCopyToRegUse - Return true if SU has a value successor that is a
-/// CopyToReg node.
-static bool hasCopyToRegUse(const SUnit *SU) {
-  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
-       I != E; ++I) {
-    if (I->isCtrl()) continue;
-    const SUnit *SuccSU = I->getSUnit();
-    if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg)
-      return true;
-  }
-  return false;
-}
-
  /// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
  /// physical register defs.
  static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
@@ -1813,6 +1857,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
      if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode())
        continue;
  
+    bool isLiveOut = hasOnlyLiveOutUses(SU);
      unsigned Opc = Node->getMachineOpcode();
      const TargetInstrDesc &TID = TII->get(Opc);
      unsigned NumRes = TID.getNumDefs();
@@ -1862,7 +1907,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() {
              SuccOpc == TargetOpcode::SUBREG_TO_REG)
            continue;
          if ((!canClobber(SuccSU, DUSU) ||
-             (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
+             (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
               (!SU->isCommutable && SuccSU->isCommutable)) &&
              !scheduleDAG->IsReachable(SuccSU, SU)) {
            DEBUG(dbgs() << "    Adding a pseudo-two-addr edge from SU #"
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp

index d34a52d80144997a85a1d7939dbef19eb883fdc6..7d01bd31b960148517eaffc4a4f5654938ef8c47 100644 (file)
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -458,6 +458,15 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
      // Adjust the use operand index by num of defs.
      OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
    int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+  if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+      !BB->succ_empty()) {
+    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      // This copy is a liveout value. It is likely coalesced, so reduce the
+      // latency so as not to penalize the def.
+      // FIXME: need target specific adjustment here?
+      Latency = (Latency > 1) ? Latency - 1 : 1;
+  }
    if (Latency >= 0)
      dep.setLatency(Latency);
  }
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp

index 51db6775817b4acd9c395514ba313ccdaef47cef..c23bc93925b2a4418036d656c53497024e112344 100644 (file)
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1967,8 +1967,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
    if (!ItinData || ItinData->isEmpty())
      return DefTID.mayLoad() ? 3 : 1;
  
-  if (!UseNode->isMachineOpcode())
-    return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+    if (Subtarget.isCortexA9())
+      return Latency <= 2 ? 1 : Latency - 1;
+    else
+      return Latency <= 3 ? 1 : Latency - 2;
+  }
  
    const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode());
    const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
diff --git a/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll b/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll

index 0b6c92ba8a09325d8b644f56217657de0c8cc767..080341c8df412fbeb3eb8f90097f41346fe679c2 100644 (file)
--- a/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
+++ b/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
@@ -23,7 +23,10 @@ entry:
    %4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
  ; Constant pool load followed by add.
  ; Then clobber the loaded register, not the sum.
+; CHECK: vldr.64
+; CHECK: vadd.f64
  ; CHECK: vldr.64 [[LDR:d.*]],
+; CHECK: LPC0_0:
  ; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
  ; CHECK: vmov.f64 [[LDR]]
    %5 = fadd <2 x double> %3, %3                   ; <<2 x double>> [#uses=2]
diff --git a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll

index ea401ee0e590bd089ff85ece075d3efb3f1daa96..c169fb334a392100abec61e75df9f6496e186720 100644 (file)
--- a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
+++ b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
@@ -15,9 +15,9 @@ bb.nph:                                           ; preds = %bb5
  
  ; Loop preheader
  ; CHECK: vmov.f32
-; CHECK: vmul.f32
  ; CHECK: vsub.f32
  ; CHECK: vadd.f32
+; CHECK: vmul.f32
  bb7:                                              ; preds = %bb9, %bb.nph
    %s1.02 = phi float [ undef, %bb.nph ], [ %35, %bb9 ] ; <float> [#uses=3]
    %tmp79 = add i32 undef, undef                   ; <i32> [#uses=1]
author	Evan Cheng <evan.cheng@apple.com>
	Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Fri, 29 Oct 2010 18:09:28 +0000 (18:09 +0000)
lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp		patch \| blob \| history
lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp		patch \| blob \| history
lib/Target/ARM/ARMBaseInstrInfo.cpp		patch \| blob \| history
test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll		patch \| blob \| history
test/CodeGen/Thumb2/cross-rc-coalescing-2.ll		patch \| blob \| history