Recommit r129383. PreRA scheduler heuristic fixes: VRegCycle, TokenFactor latency.

author Andrew Trick <atrick@apple.com>

Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)

committer Andrew Trick <atrick@apple.com>

Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)
author Andrew Trick <atrick@apple.com>
Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)
committer Andrew Trick <atrick@apple.com>
Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)
diff --git a/include/llvm/Target/TargetInstrItineraries.h b/include/llvm/Target/TargetInstrItineraries.h

index a95b70f6b99762ea53b5a470080af445bb606db8..198d5854462faba4de002ab5920afc7126dab816 100644 (file)
--- a/include/llvm/Target/TargetInstrItineraries.h
+++ b/include/llvm/Target/TargetInstrItineraries.h
@@ -155,9 +155,13 @@ public:
    /// in the itinerary.
    ///
    unsigned getStageLatency(unsigned ItinClassIndx) const {
-    // If the target doesn't provide itinerary information, use a
-    // simple non-zero default value for all instructions.
-    if (isEmpty())
+    // If the target doesn't provide itinerary information, use a simple
+    // non-zero default value for all instructions.  Some targets provide a
+    // dummy (Generic) itinerary which should be handled as if its itinerary is
+    // empty. We identify this by looking for a reference to stage zero (invalid
+    // stage). This is different from beginStage == endStage != 0, which could
+    // be used for zero-latency pseudo ops.
+    if (isEmpty() || Itineraries[ItinClassIndx].FirstStage == 0)
        return 1;
  
      // Calculate the maximum completion time for any stage.
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp

index b2e9c15b68bcb094a240db621b00624b3deba788..ac2f3d5c8510871b4d786574ffe446ff64680956 100644 (file)
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -102,11 +102,11 @@ static cl::opt<unsigned> AvgIPC(
  #ifndef NDEBUG
  namespace {
    // For sched=list-ilp, Count the number of times each factor comes into play.
-  enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactStatic,
-         FactOther, NumFactors };
+  enum { FactPressureDiff, FactRegUses, FactStall, FactHeight, FactDepth,
+         FactStatic, FactOther, NumFactors };
  }
  static const char *FactorName[NumFactors] =
-{"PressureDiff", "RegUses", "Height", "Depth","Static", "Other"};
+{"PressureDiff", "RegUses", "Stall", "Height", "Depth","Static", "Other"};
  static int FactorCount[NumFactors];
  #endif //!NDEBUG
  
@@ -463,6 +463,13 @@ void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
    if (DisableSchedCycles)
      return;
  
+  // FIXME: Nodes such as CopyFromReg probably should not advance the current
+  // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
+  // has predecessors the cycle will be advanced when they are scheduled.
+  // But given the crude nature of modeling latency through such nodes, we
+  // currently need to treat these nodes like real instructions.
+  // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;
+
    unsigned ReadyCycle = isBottomUp ? SU->getHeight() : SU->getDepth();
  
    // Bump CurCycle to account for latency. We assume the latency of other
@@ -533,6 +540,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) {
    }
  }
  
+static void resetVRegCycle(SUnit *SU);
+
  /// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
  /// count of its predecessors. If a predecessor pending count is zero, add it to
  /// the Available queue.
@@ -542,7 +551,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
  
  #ifndef NDEBUG
    if (CurCycle < SU->getHeight())
-    DEBUG(dbgs() << "   Height [" << SU->getHeight() << "] pipeline stall!\n");
+    DEBUG(dbgs() << "   Height [" << SU->getHeight()
+          << "] pipeline stall!\n");
  #endif
  
    // FIXME: Do not modify node height. It may interfere with
@@ -559,7 +569,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
    AvailableQueue->ScheduledNode(SU);
  
    // If HazardRec is disabled, and each inst counts as one cycle, then
-  // advance CurCycle before ReleasePredecessors to avoid useles pushed to
+  // advance CurCycle before ReleasePredecessors to avoid useless pushes to
    // PendingQueue for schedulers that implement HasReadyFilter.
    if (!HazardRec->isEnabled() && AvgIPC < 2)
      AdvanceToCycle(CurCycle + 1);
@@ -580,20 +590,25 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
      }
    }
  
+  resetVRegCycle(SU);
+
    SU->isScheduled = true;
  
    // Conditions under which the scheduler should eagerly advance the cycle:
    // (1) No available instructions
    // (2) All pipelines full, so available instructions must have hazards.
    //
-  // If HazardRec is disabled, the cycle was advanced earlier.
+  // If HazardRec is disabled, the cycle was pre-advanced before calling
+  // ReleasePredecessors. In that case, IssueCount should remain 0.
    //
    // Check AvailableQueue after ReleasePredecessors in case of zero latency.
-  ++IssueCount;
-  if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
-      || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
-      || AvailableQueue->empty())
-    AdvanceToCycle(CurCycle + 1);
+  if (HazardRec->isEnabled() || AvgIPC > 1) {
+    if (SU->getNode() && SU->getNode()->isMachineOpcode())
+      ++IssueCount;
+    if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+        || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
+      AdvanceToCycle(CurCycle + 1);
+  }
  }
  
  /// CapturePred - This does the opposite of ReleasePred. Since SU is being
@@ -1220,7 +1235,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
    // priority. If it is not ready put it back.  Schedule the node.
    Sequence.reserve(SUnits.size());
    while (!AvailableQueue->empty()) {
-    DEBUG(dbgs() << "\n*** Examining Available\n";
+    DEBUG(dbgs() << "\nExamining Available:\n";
            AvailableQueue->dump(this));
  
      // Pick the best node to schedule taking all constraints into
@@ -1661,17 +1676,6 @@ void RegReductionPQBase::CalculateSethiUllmanNumbers() {
      CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
  }
  
-void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
-  SUnits = &sunits;
-  // Add pseudo dependency edges for two-address nodes.
-  AddPseudoTwoAddrDeps();
-  // Reroute edges to nodes with multiple uses.
-  if (!TracksRegPressure)
-    PrescheduleNodesWithMultipleUses();
-  // Calculate node priorities.
-  CalculateSethiUllmanNumbers();
-}
-
  void RegReductionPQBase::addNode(const SUnit *SU) {
    unsigned SUSize = SethiUllmanNumbers.size();
    if (SUnits->size() > SUSize)
@@ -2008,7 +2012,29 @@ static unsigned calcMaxScratches(const SUnit *SU) {
    return Scratches;
  }
  
-/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a
+/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are
+/// CopyFromReg from a virtual register.
+static bool hasOnlyLiveInOpers(const SUnit *SU) {
+  bool RetVal = false;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    const SUnit *PredSU = I->getSUnit();
+    if (PredSU->getNode() &&
+        PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
+      unsigned Reg =
+        cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        RetVal = true;
+        continue;
+      }
+    }
+    return false;
+  }
+  return RetVal;
+}
+
+/// hasOnlyLiveOutUses - Return true if SU has only value successors that are
  /// CopyToReg to a virtual register. This SU def is probably a liveout and
  /// it has no other use. It should be scheduled closer to the terminator.
  static bool hasOnlyLiveOutUses(const SUnit *SU) {
@@ -2030,62 +2056,71 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) {
    return RetVal;
  }
  
-/// UnitsSharePred - Return true if the two scheduling units share a common
-/// data predecessor.
-static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
-  SmallSet<const SUnit*, 4> Preds;
-  for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
+// Set isVRegCycle for a node with only live in opers and live out uses. Also
+// set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+  if (DisableSchedVRegCycle)
+    return;
+
+  if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+    return;
+
+  DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+  SU->isVRegCycle = true;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
         I != E; ++I) {
-    if (I->isCtrl()) continue;  // ignore chain preds
-    Preds.insert(I->getSUnit());
+    if (I->isCtrl()) continue;
+    I->getSUnit()->isVRegCycle = true;
    }
-  for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of
+// CopyFromReg operands. We should no longer penalize other uses of this VReg.
+static void resetVRegCycle(SUnit *SU) {
+  if (!SU->isVRegCycle)
+    return;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
         I != E; ++I) {
      if (I->isCtrl()) continue;  // ignore chain preds
-    if (Preds.count(I->getSUnit()))
-      return true;
+    SUnit *PredSU = I->getSUnit();
+    if (PredSU->isVRegCycle) {
+      assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+             "VRegCycle def must be CopyFromReg");
+      I->getSUnit()->isVRegCycle = 0;
+    }
    }
-  return false;
  }
  
-// Return true if the virtual register defined by VRCycleSU may interfere with
-// VRUseSU.
-//
-// Note: We may consider two SU's that use the same value live into a loop as
-// interferng even though the value is not an induction variable. This is an
-// unfortunate consequence of scheduling on the selection DAG.
-static bool checkVRegCycleInterference(const SUnit *VRCycleSU,
-                                       const SUnit *VRUseSU) {
-  for (SUnit::const_pred_iterator I = VRCycleSU->Preds.begin(),
-         E = VRCycleSU->Preds.end(); I != E; ++I) {
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This
+// means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+  // If this SU also defines the VReg, don't hoist it as a "use".
+  if (SU->isVRegCycle)
+    return false;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
      if (I->isCtrl()) continue;  // ignore chain preds
-    SDNode *InNode = I->getSUnit()->getNode();
-    if (!InNode || InNode->getOpcode() != ISD::CopyFromReg)
-      continue;
-    for (SUnit::const_pred_iterator II = VRUseSU->Preds.begin(),
-           EE = VRUseSU->Preds.end(); II != EE; ++II) {
-      if (II->getSUnit() == I->getSUnit())
-        return true;
+    if (I->getSUnit()->isVRegCycle &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+      DEBUG(dbgs() << "  VReg cycle use: SU (" << SU->NodeNum << ")\n");
+      return true;
      }
    }
    return false;
  }
  
-// Compare the VRegCycle properties of the nodes.
-// Return -1 if left has higher priority, 1 if right has higher priority.
-// Return 0 if priority is equivalent.
-static int BUCompareVRegCycle(const SUnit *left, const SUnit *right) {
-  if (left->isVRegCycle && !right->isVRegCycle) {
-    if (checkVRegCycleInterference(left, right))
-      return -1;
-  }
-  else if (!left->isVRegCycle && right->isVRegCycle) {
-    if (checkVRegCycleInterference(right, left))
-      return 1;
-  }
-  return 0;
-}
-
  // Check for either a dependence (latency) or resource (hazard) stall.
  //
  // Note: The ScheduleHazardRecognizer interface requires a non-const SU.
@@ -2101,23 +2136,12 @@ static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
  // Return 0 if latency-based priority is equivalent.
  static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
                              RegReductionPQBase *SPQ) {
-  // If the two nodes share an operand and one of them has a single
-  // use that is a live out copy, favor the one that is live out. Otherwise
-  // it will be difficult to eliminate the copy if the instruction is a
-  // loop induction variable update. e.g.
-  // BB:
-  // sub r1, r3, #1
-  // str r0, [r2, r3]
-  // mov r3, r1
-  // cmp
-  // bne BB
-  bool SharePred = UnitsSharePred(left, right);
-  // FIXME: Only adjust if BB is a loop back edge.
-  // FIXME: What's the cost of a copy?
-  int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
-  int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
-  int LHeight = (int)left->getHeight() - LBonus;
-  int RHeight = (int)right->getHeight() - RBonus;
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+  int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+  int LHeight = (int)left->getHeight() + LPenalty;
+  int RHeight = (int)right->getHeight() + RPenalty;
  
    bool LStall = (!checkPref || left->SchedulingPref == Sched::Latency) &&
      BUHasStall(left, LHeight, SPQ);
@@ -2128,36 +2152,47 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
    // If scheduling either one of the node will cause a pipeline stall, sort
    // them according to their height.
    if (LStall) {
-    if (!RStall)
+    if (!RStall) {
+      DEBUG(++FactorCount[FactStall]);
        return 1;
-    if (LHeight != RHeight)
+    }
+    if (LHeight != RHeight) {
+      DEBUG(++FactorCount[FactStall]);
        return LHeight > RHeight ? 1 : -1;
-  } else if (RStall)
+    }
+  } else if (RStall) {
+    DEBUG(++FactorCount[FactStall]);
      return -1;
+  }
  
    // If either node is scheduling for latency, sort them by height/depth
    // and latency.
    if (!checkPref || (left->SchedulingPref == Sched::Latency ||
                       right->SchedulingPref == Sched::Latency)) {
      if (DisableSchedCycles) {
-      if (LHeight != RHeight)
+      if (LHeight != RHeight) {
+        DEBUG(++FactorCount[FactHeight]);
          return LHeight > RHeight ? 1 : -1;
+      }
      }
      else {
        // If neither instruction stalls (!LStall && !RStall) then
        // its height is already covered so only its depth matters. We also reach
        // this if both stall but have the same height.
-      unsigned LDepth = left->getDepth();
-      unsigned RDepth = right->getDepth();
+      int LDepth = left->getDepth() - LPenalty;
+      int RDepth = right->getDepth() - RPenalty;
        if (LDepth != RDepth) {
+        DEBUG(++FactorCount[FactDepth]);
          DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
                << ") depth " << LDepth << " vs SU (" << right->NodeNum
                << ") depth " << RDepth << "\n");
          return LDepth < RDepth ? 1 : -1;
        }
      }
-    if (left->Latency != right->Latency)
+    if (left->Latency != right->Latency) {
+      DEBUG(++FactorCount[FactOther]);
        return left->Latency > right->Latency ? 1 : -1;
+    }
    }
    return 0;
  }
@@ -2169,7 +2204,19 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
      DEBUG(++FactorCount[FactStatic]);
      return LPriority > RPriority;
    }
-  DEBUG(++FactorCount[FactOther]);
+  else if(LPriority == 0) {
+    // Schedule zero-latency TokenFactor below any other special
+    // nodes. The alternative may be to avoid artificially boosting the
+    // TokenFactor's height when it is scheduled, but we currently rely on an
+    // instruction's final height to equal the cycle in which it is scheduled,
+    // so heights are monotonically increasing.
+    unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0;
+    unsigned ROpc = right->getNode() ? right->getNode()->getOpcode() : 0;
+    if (LOpc == ISD::TokenFactor)
+      return false;
+    if (ROpc == ISD::TokenFactor)
+      return true;
+  }
  
    // Try schedule def + use closer when Sethi-Ullman numbers are the same.
    // e.g.
@@ -2190,14 +2237,18 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
    // This creates more short live intervals.
    unsigned LDist = closestSucc(left);
    unsigned RDist = closestSucc(right);
-  if (LDist != RDist)
+  if (LDist != RDist) {
+    DEBUG(++FactorCount[FactOther]);
      return LDist < RDist;
+  }
  
    // How many registers becomes live when the node is scheduled.
    unsigned LScratch = calcMaxScratches(left);
    unsigned RScratch = calcMaxScratches(right);
-  if (LScratch != RScratch)
+  if (LScratch != RScratch) {
+    DEBUG(++FactorCount[FactOther]);
      return LScratch > RScratch;
+  }
  
    if (!DisableSchedCycles) {
      int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
@@ -2205,15 +2256,20 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
        return result > 0;
    }
    else {
-    if (left->getHeight() != right->getHeight())
+    if (left->getHeight() != right->getHeight()) {
+      DEBUG(++FactorCount[FactHeight]);
        return left->getHeight() > right->getHeight();
+    }
  
-    if (left->getDepth() != right->getDepth())
+    if (left->getDepth() != right->getDepth()) {
+      DEBUG(++FactorCount[FactDepth]);
        return left->getDepth() < right->getDepth();
+    }
    }
  
    assert(left->NodeQueueId && right->NodeQueueId &&
           "NodeQueueId cannot be zero");
+  DEBUG(++FactorCount[FactOther]);
    return (left->NodeQueueId > right->NodeQueueId);
  }
  
@@ -2264,24 +2320,22 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
    // Avoid causing spills. If register pressure is high, schedule for
    // register pressure reduction.
    if (LHigh && !RHigh) {
+    DEBUG(++FactorCount[FactPressureDiff]);
      DEBUG(dbgs() << "  pressure SU(" << left->NodeNum << ") > SU("
            << right->NodeNum << ")\n");
      return true;
    }
    else if (!LHigh && RHigh) {
+    DEBUG(++FactorCount[FactPressureDiff]);
      DEBUG(dbgs() << "  pressure SU(" << right->NodeNum << ") > SU("
            << left->NodeNum << ")\n");
      return false;
    }
-  int result = 0;
-  if (!DisableSchedVRegCycle) {
-    result = BUCompareVRegCycle(left, right);
-  }
-  if (result == 0 && !LHigh && !RHigh) {
-    result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+  if (!LHigh && !RHigh) {
+    int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+    if (result != 0)
+      return result > 0;
    }
-  if (result != 0)
-    return result > 0;
    return BURRSort(left, right, SPQ);
  }
  
@@ -2347,12 +2401,6 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
      if (RReduce && !LReduce) return true;
    }
  
-  if (!DisableSchedVRegCycle) {
-    int result = BUCompareVRegCycle(left, right);
-    if (result != 0)
-      return result > 0;
-  }
-
    if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
      DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
            << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
@@ -2391,6 +2439,24 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
    return BURRSort(left, right, SPQ);
  }
  
+void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
+  SUnits = &sunits;
+  // Add pseudo dependency edges for two-address nodes.
+  AddPseudoTwoAddrDeps();
+  // Reroute edges to nodes with multiple uses.
+  if (!TracksRegPressure)
+    PrescheduleNodesWithMultipleUses();
+  // Calculate node priorities.
+  CalculateSethiUllmanNumbers();
+
+  // For single block loops, mark nodes that look like canonical IV increments.
+  if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) {
+    for (unsigned i = 0, e = sunits.size(); i != e; ++i) {
+      initVRegCycle(&sunits[i]);
+    }
+  }
+}
+
  //===----------------------------------------------------------------------===//
  //                    Preschedule for Register Pressure
  //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp

index 24a1937c445ec4092f014096ce0508940926ff38..078533be843ac23e9341a61128e337c259f0b5d9 100644 (file)
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -342,10 +342,6 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
      assert(N->getNodeId() == -1 && "Node already inserted!");
      N->setNodeId(NodeSUnit->NodeNum);
  
-    // Set isVRegCycle if the node operands are live into and value is live out
-    // of a single block loop.
-    InitVRegCycleFlag(NodeSUnit);
-
      // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
      InitNumRegDefsLeft(NodeSUnit);
  
@@ -417,6 +413,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
  
          // If this is a ctrl dep, latency is 1.
          unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+        // Special-case TokenFactor chains as zero-latency.
+        if(isChain && OpN->getOpcode() == ISD::TokenFactor)
+          OpLatency = 0;
+
          const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                                 OpLatency, PhysReg);
          if (!isChain && !UnitLatencies) {
@@ -512,47 +512,6 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() {
    }
  }
  
-// Set isVRegCycle if this node's single use is CopyToReg and its only active
-// data operands are CopyFromReg.
-//
-// This is only relevant for single-block loops, in which case the VRegCycle
-// node is likely an induction variable in which the operand and target virtual
-// registers should be coalesced (e.g. pre/post increment values). Setting the
-// isVRegCycle flag helps the scheduler prioritize other uses of the same
-// CopyFromReg so that this node becomes the virtual register "kill". This
-// avoids interference between the values live in and out of the block and
-// eliminates a copy inside the loop.
-void ScheduleDAGSDNodes::InitVRegCycleFlag(SUnit *SU) {
-  if (!BB->isSuccessor(BB))
-    return;
-
-  SDNode *N = SU->getNode();
-  if (N->getGluedNode())
-    return;
-
-  if (!N->hasOneUse() || N->use_begin()->getOpcode() != ISD::CopyToReg)
-    return;
-
-  bool FoundLiveIn = false;
-  for (SDNode::op_iterator OI = N->op_begin(), E = N->op_end(); OI != E; ++OI) {
-    EVT OpVT = OI->getValueType();
-    assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
-
-    if (OpVT == MVT::Other)
-      continue; // ignore chain operands
-
-    if (isPassiveNode(OI->getNode()))
-      continue; // ignore constants and such
-
-    if (OI->getNode()->getOpcode() != ISD::CopyFromReg)
-      return;
-
-    FoundLiveIn = true;
-  }
-  if (FoundLiveIn)
-    SU->isVRegCycle = true;
-}
-
  void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
    assert(SU->NumRegDefsLeft == 0 && "expect a new node");
    for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) {
@@ -562,6 +521,16 @@ void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
  }
  
  void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
+  SDNode *N = SU->getNode();
+
+  // TokenFactor operands are considered zero latency, and some schedulers
+  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
+  // whenever node latency is nonzero.
+  if (N && N->getOpcode() == ISD::TokenFactor) {
+    SU->Latency = 0;
+    return;
+  }
+
    // Check to see if the scheduler cares about latencies.
    if (ForceUnitLatencies()) {
      SU->Latency = 1;
@@ -569,7 +538,6 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
    }
  
    if (!InstrItins || InstrItins->isEmpty()) {
-    SDNode *N = SU->getNode();
      if (N && N->isMachineOpcode() &&
          TII->isHighLatencyDef(N->getMachineOpcode()))
        SU->Latency = HighLatencyCycles;
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll

index e8a2a3b7d5bfff1c37e2024d826ace79f00c159f..5bae037cafb3ccedaf492938cefa22a0b373207a 100644 (file)
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -1,10 +1,8 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=basic -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s
  
  ; The ARM magic hinting works best with linear scan.
-; CHECK: ldmia
-; CHECK: stmia
-; CHECK: ldrh
+; CHECK: ldrd
+; CHECK: strd
  ; CHECK: ldrb
  
  %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll

index e3379707909330fa31af81b7be77c22ab528762b..de48feeb9ec297cabeea16328973ee1d055d1efe 100644 (file)
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s
  
  define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  ;CHECK: vrecpe.f32
diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll

index 7cb976236dc543f41bc25ae046dc6b0f3664ce52..bb4045311624dcc37d426eeb6e0aa5f14a89f1c6 100644 (file)
--- a/test/CodeGen/ARM/va_arg.ll
+++ b/test/CodeGen/ARM/va_arg.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -pre-RA-sched=source | FileCheck %s
  ; Test that we correctly align elements when using va_arg
  
  ; CHECK: test1:
  ; CHECK-NOT: bfc
-; CHECK: add   r0, r0, #7
-; CHECK: bfc   r0, #0, #3
+; CHECK: add   [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc   [[REG]], #0, #3
  ; CHECK-NOT: bfc
  
  define i64 @test1(i32 %i, ...) nounwind optsize {
@@ -19,8 +19,8 @@ entry:
  
  ; CHECK: test2:
  ; CHECK-NOT: bfc
-; CHECK: add   r0, r0, #7
-; CHECK: bfc   r0, #0, #3
+; CHECK: add   [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc   [[REG]], #0, #3
  ; CHECK-NOT:   bfc
  ; CHECK: bx    lr
  
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll

index 390457fc21b385040df2a9273fdd667a58284dab..49a69827bc05325f1e3d738b39858a8d48c903e7 100644 (file)
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -40,8 +40,8 @@ define void @test_add(float* %P, double* %D) {
  define void @test_ext_round(float* %P, double* %D) {
  ;CHECK: test_ext_round:
         %a = load float* %P             ; <float> [#uses=1]
-;CHECK: vcvt.f32.f64
  ;CHECK: vcvt.f64.f32
+;CHECK: vcvt.f32.f64
         %b = fpext float %a to double           ; <double> [#uses=1]
         %A = load double* %D            ; <double> [#uses=1]
         %B = fptrunc double %A to float         ; <float> [#uses=1]
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll

index 6601d2505567f07cbffdadc0a0ec6996bcce104c..1f71ed2640ebf2580f4652456c66ed4de2e5ba84 100644 (file)
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mcpu=mips2 < %s -regalloc=basic | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s -regalloc=basic | FileCheck %s
  
  
  ; All test functions do the same thing - they return the first variable
  ; argument.
  
-; All CHECK's do the same thing - they check whether variable arguments from 
-; registers are placed on correct stack locations, and whether the first 
+; All CHECK's do the same thing - they check whether variable arguments from
+; registers are placed on correct stack locations, and whether the first
  ; variable argument is returned from the correct stack location.
  
  
@@ -31,14 +31,14 @@ entry:
  
  ; CHECK: va1:
  ; CHECK: addiu   $sp, $sp, -32
-; CHECK: sw      $5, 36($sp)
-; CHECK: sw      $6, 40($sp)
  ; CHECK: sw      $7, 44($sp)
+; CHECK: sw      $6, 40($sp)
+; CHECK: sw      $5, 36($sp)
  ; CHECK: lw      $2, 36($sp)
  }
  
-; check whether the variable double argument will be accessed from the 8-byte 
-; aligned location (i.e. whether the address is computed by adding 7 and 
+; check whether the variable double argument will be accessed from the 8-byte
+; aligned location (i.e. whether the address is computed by adding 7 and
  ; clearing lower 3 bits)
  define double @va2(i32 %a, ...) nounwind {
  entry:
@@ -57,10 +57,10 @@ entry:
  
  ; CHECK: va2:
  ; CHECK: addiu   $sp, $sp, -40
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 44
-; CHECK: sw      $5, 44($sp)
-; CHECK: sw      $6, 48($sp)
  ; CHECK: sw      $7, 52($sp)
+; CHECK: sw      $6, 48($sp)
+; CHECK: sw      $5, 44($sp)
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 44
  ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
  ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
  ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
@@ -85,8 +85,8 @@ entry:
  
  ; CHECK: va3:
  ; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $6, 48($sp)
  ; CHECK: sw      $7, 52($sp)
+; CHECK: sw      $6, 48($sp)
  ; CHECK: lw      $2, 48($sp)
  }
  
@@ -108,8 +108,8 @@ entry:
  
  ; CHECK: va4:
  ; CHECK: addiu   $sp, $sp, -48
-; CHECK: sw      $6, 56($sp)
  ; CHECK: sw      $7, 60($sp)
+; CHECK: sw      $6, 56($sp)
  ; CHECK: addiu   $[[R0:[0-9]+]], $sp, 56
  ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
  ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll

index 2074f98cb608c2506deacb4a5c42bacf6245a8fd..35914b16790a12b826dabfeb327db1186a102114 100644 (file)
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) {
  
  ; ARMv7M: test10
  ; ARMv7M: mov.w r1, #16253176
+; ARMv7M: mov.w r2, #458759
  ; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: mov.w r1, #458759
-; ARMv7M: and.w r1, r1, r0, lsr #5
+; ARMv7M: and.w r1, r2, r0, lsr #5
  ; ARMv7M: orrs r0, r1
         %tmp1 = lshr i32 %p0, 7         ; <i32> [#uses=1]
         %tmp2 = and i32 %tmp1, 16253176         ; <i32> [#uses=2]
author	Andrew Trick <atrick@apple.com>
	Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)
committer	Andrew Trick <atrick@apple.com>
	Wed, 13 Apr 2011 00:38:32 +0000 (00:38 +0000)
include/llvm/Target/TargetInstrItineraries.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp		patch \| blob \| history
lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp		patch \| blob \| history
test/CodeGen/ARM/memcpy-inline.ll		patch \| blob \| history
test/CodeGen/ARM/neon_div.ll		patch \| blob \| history
test/CodeGen/ARM/va_arg.ll		patch \| blob \| history
test/CodeGen/ARM/vfp.ll		patch \| blob \| history
test/CodeGen/Mips/o32_cc_vararg.ll		patch \| blob \| history
test/CodeGen/Thumb2/thumb2-uxtb.ll		patch \| blob \| history