Allow a zero cycle stage to reserve/require a FU without advancing the cycle counter.

author David Goodwin <david_goodwin@apple.com>

Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)

committer David Goodwin <david_goodwin@apple.com>

Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)
author David Goodwin <david_goodwin@apple.com>
Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)
committer David Goodwin <david_goodwin@apple.com>
Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)
diff --git a/lib/CodeGen/ExactHazardRecognizer.cpp b/lib/CodeGen/ExactHazardRecognizer.cpp

index 5a89d223598ddd2350fd16b115f3d6c831d56cc2..48043f286ccf075a2ec11e845e22888d65c50f59 100644 (file)
--- a/lib/CodeGen/ExactHazardRecognizer.cpp
+++ b/lib/CodeGen/ExactHazardRecognizer.cpp
@@ -39,7 +39,7 @@ ExactHazardRecognizer::ExactHazardRecognizer(const InstrItineraryData &LItinData
  
        unsigned ItinDepth = 0;
        for (; IS != E; ++IS)
-        ItinDepth += IS->Cycles;
+        ItinDepth += std::max(1U, IS->Cycles);
  
        ScoreboardDepth = std::max(ScoreboardDepth, ItinDepth);
      }
@@ -89,9 +89,13 @@ ExactHazardRecognizer::HazardType ExactHazardRecognizer::getHazardType(SUnit *SU
    unsigned idx = SU->getInstr()->getDesc().getSchedClass();
    for (const InstrStage *IS = ItinData.begin(idx), *E = ItinData.end(idx);
         IS != E; ++IS) {
+    // If the stage's cycle count is 0, then we must have the FU free in
+    // the current cycle, but we don't advance the cycle time.
+    unsigned StageCycles = std::max(1U, IS->Cycles);
+
      // We must find one of the stage's units free for every cycle the
      // stage is occupied.
-    for (unsigned int i = 0; i < IS->Cycles; ++i) {
+    for (unsigned int i = 0; i < StageCycles; ++i) {
        assert((cycle < ScoreboardDepth) && "Scoreboard depth exceeded!");
  
        unsigned index = getFutureIndex(cycle);
@@ -103,7 +107,8 @@ ExactHazardRecognizer::HazardType ExactHazardRecognizer::getHazardType(SUnit *SU
          return Hazard;
        }
  
-      ++cycle;
+      if (IS->Cycles > 0)
+        ++cycle;
      }
    }
  
@@ -118,9 +123,13 @@ void ExactHazardRecognizer::EmitInstruction(SUnit *SU) {
    unsigned idx = SU->getInstr()->getDesc().getSchedClass();
    for (const InstrStage *IS = ItinData.begin(idx), *E = ItinData.end(idx);
         IS != E; ++IS) {
+    // If the stage's cycle count is 0, then we must reserve the FU in the
+    // current cycle, but we don't advance the cycle time.
+    unsigned StageCycles = std::max(1U, IS->Cycles);
+
      // We must reserve one of the stage's units for every cycle the
      // stage is occupied.
-    for (unsigned int i = 0; i < IS->Cycles; ++i) {
+    for (unsigned int i = 0; i < StageCycles; ++i) {
        assert((cycle < ScoreboardDepth) && "Scoreboard depth exceeded!");
  
        unsigned index = getFutureIndex(cycle);
@@ -135,7 +144,9 @@ void ExactHazardRecognizer::EmitInstruction(SUnit *SU) {
  
        assert(freeUnit && "No function unit available!");
        Scoreboard[index] |= freeUnit;
-      ++cycle;
+      
+      if (IS->Cycles > 0)
+        ++cycle;
      }
    }
  
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td

index ce28149b54f738440afea42c7da4de27c22b21d4..eb6304c448ed700b0a231fc94d185b96ec0749c3 100644 (file)
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -114,7 +114,7 @@ def : Processor<"arm1156t2f-s",     V6Itineraries,
  // V7 Processors.
  def : Processor<"cortex-a8",        CortexA8Itineraries,
                  [ArchV7A, FeatureThumb2, FeatureNEON, FeatureNEONFP]>;
-def : Processor<"cortex-a9",        V7Itineraries,
+def : Processor<"cortex-a9",        CortexA9Itineraries,
                  [ArchV7A, FeatureThumb2, FeatureNEON]>;
  
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td

index a5ca773ef1d990afa0c6bc4333edfc405ace066a..11a7b2a717ad1b8adb392626de7884fec3dcd9cd 100644 (file)
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -10,8 +10,9 @@
  //===----------------------------------------------------------------------===//
  // Functional units across ARM processors
  //
-def FU_Pipe0   : FuncUnit; // pipeline 0 issue
-def FU_Pipe1   : FuncUnit; // pipeline 1 issue
+def FU_Issue   : FuncUnit; // issue
+def FU_Pipe0   : FuncUnit; // pipeline 0
+def FU_Pipe1   : FuncUnit; // pipeline 1
  def FU_LdSt0   : FuncUnit; // pipeline 0 load/store
  def FU_LdSt1   : FuncUnit; // pipeline 1 load/store
  
@@ -19,9 +20,11 @@ def FU_LdSt1   : FuncUnit; // pipeline 1 load/store
  // Instruction Itinerary classes used for ARM
  //
  def IIC_iALU    : InstrItinClass;
+def IIC_iMPY    : InstrItinClass;
  def IIC_iLoad   : InstrItinClass;
  def IIC_iStore  : InstrItinClass;
  def IIC_fpALU   : InstrItinClass;
+def IIC_fpMPY   : InstrItinClass;
  def IIC_fpLoad  : InstrItinClass;
  def IIC_fpStore : InstrItinClass;
  def IIC_Br      : InstrItinClass;
@@ -31,12 +34,14 @@ def IIC_Br      : InstrItinClass;
  
  def GenericItineraries : ProcessorItineraries<[
    InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe0]>]>,
    InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
-  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
    InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Pipe0]>]>,
-  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>,
    InstrItinData<IIC_fpALU   , [InstrStage<1, [FU_Pipe0]>]>,
-  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>
+  InstrItinData<IIC_fpMPY   , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>
  ]>;
  
  
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td

index f0b8116a038cce2edf536caba81c6876fb7a3316..755547a678aeefbf695ac894311689f84789ed6a 100644 (file)
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -11,18 +11,16 @@
  //
  //===----------------------------------------------------------------------===//
  
+// TODO: this should model an ARM11
  // Single issue pipeline so every itinerary starts with FU_pipe0
  def V6Itineraries : ProcessorItineraries<[
-  // single-cycle integer ALU
    InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0]>]>,
-  // loads have an extra cycle of latency, but are fully pipelined
+  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe0]>]>,
    InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
-  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
-  // fully-pipelined stores
    InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Pipe0]>]>,
-  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>,
-  // fp ALU is not pipelined
-  InstrItinData<IIC_fpALU   , [InstrStage<6, [FU_Pipe0]>]>,
-  // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpALU   , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpMPY   , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>
  ]>;
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td

index 30360bc9c41ba40e01082571ab99fd54b11bb32a..8a7b42eb729e1376a275e8e3b8340abed4916a91 100644 (file)
--- a/lib/Target/ARM/ARMScheduleV7.td
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -11,34 +11,51 @@
  //
  //===----------------------------------------------------------------------===//
  
-// Single issue pipeline so every itinerary starts with FU_Pipe0
-def V7Itineraries : ProcessorItineraries<[
-  // single-cycle integer ALU
-  InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0]>]>,
-  // loads have an extra cycle of latency, but are fully pipelined
-  InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
-  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
-  // fully-pipelined stores
-  InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Pipe0]>]>,
-  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>,
-  // fp ALU is not pipelined
-  InstrItinData<IIC_fpALU   , [InstrStage<6, [FU_Pipe0]>]>,
-  // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>
-]>;
-
  // Dual issue pipeline so every itinerary starts with FU_Pipe0 | FU_Pipe1
  def CortexA8Itineraries : ProcessorItineraries<[
-  // single-cycle integer ALU
+  // two fully-pipelined integer ALU pipelines
    InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+  // one fully-pipelined integer Multiply pipeline
+  // function units are used in alpha order, so use FU_Pipe1
+  // for the multiply pipeline
+  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe1]>]>,
    // loads have an extra cycle of latency, but are fully pipelined
-  InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, InstrStage<1, [FU_LdSt0]>]>,
-  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, InstrStage<1, [FU_LdSt0]>]>,
+  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_iLoad   , [InstrStage<0, [FU_Issue]>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0]>]>,
    // fully-pipelined stores
-  InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
-  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
-  // fp ALU is not pipelined
-  InstrItinData<IIC_fpALU   , [InstrStage<6, [FU_Pipe0, FU_Pipe1]>]>,
+  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_iStore  , [InstrStage<0, [FU_Issue]>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
    // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+
+  // VFP ALU is not pipelined so stall all issues 
+  // FIXME assume NFP pipeline and 7 cycle non-pipelined latency
+  InstrItinData<IIC_fpALU   , [InstrStage<7, [FU_Pipe0, FU_Pipe1]>]>,
+  // VFP MPY is not pipelined so stall all issues 
+  // FIXME assume NFP pipeline and 7 cycle non-pipelined latency
+  InstrItinData<IIC_fpMPY   , [InstrStage<7, [FU_Pipe0, FU_Pipe1]>]>,
+  // loads have an extra cycle of latency, but are fully pipelined
+  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpLoad  , [InstrStage<0, [FU_Issue]>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0]>]>,
+  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore , [InstrStage<0, [FU_Issue]>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>
+]>;
+
+// FIXME: placeholder itineraries; these do not yet model the Cortex-A9 pipeline
+def CortexA9Itineraries : ProcessorItineraries<[
+  InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpALU   , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpMPY   , [InstrStage<1, [FU_Pipe0]>]>,
+  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Pipe0]>, InstrStage<1, [FU_LdSt0]>]>,
+  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Pipe0]>]>
  ]>;
author	David Goodwin <david_goodwin@apple.com>
	Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)
committer	David Goodwin <david_goodwin@apple.com>
	Tue, 11 Aug 2009 22:38:43 +0000 (22:38 +0000)
lib/CodeGen/ExactHazardRecognizer.cpp		patch \| blob \| history
lib/Target/ARM/ARM.td		patch \| blob \| history
lib/Target/ARM/ARMSchedule.td		patch \| blob \| history
lib/Target/ARM/ARMScheduleV6.td		patch \| blob \| history
lib/Target/ARM/ARMScheduleV7.td		patch \| blob \| history