MIsched: add an ILP window property to machine model.

author Andrew Trick <atrick@apple.com>

Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)

committer Andrew Trick <atrick@apple.com>

Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)
author Andrew Trick <atrick@apple.com>
Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)
committer Andrew Trick <atrick@apple.com>
Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h

index 4c4a2a8b9572fff60e04db20110465cef2f25baf..484d7e200adfecae4b518974b48c8904b1c7accf 100644 (file)
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -84,6 +84,9 @@ public:
    /// \brief Maximum number of micro-ops that may be scheduled per cycle.
    unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
  
+  /// \brief Number of cycles the OOO processor is expected to hide.
+  unsigned getILPWindow() const { return SchedModel.ILPWindow; }
+
    /// \brief Return the number of issue slots required for this MI.
    unsigned getNumMicroOps(const MachineInstr *MI,
                            const MCSchedClassDesc *SC = 0) const;
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h

index 0c71ee513500ce2f101da62daf768fc3a2f26228..9e9474952a581d0520665adebdeb3257c47a6359 100644 (file)
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -155,7 +155,7 @@ public:
    //      Optional InstrItinerary OperandCycles provides expected latency.
    //      TODO: can't yet specify both min and expected latency per operand.
    int MinLatency;
-  static const unsigned DefaultMinLatency = -1;
+  static const int DefaultMinLatency = -1;
  
    // LoadLatency is the expected latency of load instructions.
    //
@@ -172,6 +172,16 @@ public:
    unsigned HighLatency;
    static const unsigned DefaultHighLatency = 10;
  
+  // ILPWindow is the number of cycles that the scheduler effectively ignores
+  // before attempting to hide latency. This should be zero for in-order cpus to
+  // always hide expected latency. For out-of-order cpus, it may be tweaked as
+  // desired to roughly approximate instruction buffers. The actual threshold is
+  // not very important for an OOO processor, as long as it isn't too high. A
+  // nonzero value helps avoid rescheduling to hide latency when it is fairly
+  // obviously useless and makes register pressure heuristics more effective.
+  unsigned ILPWindow;
+  static const unsigned DefaultILPWindow = 0;
+
    // MispredictPenalty is the typical number of extra cycles the processor
    // takes to recover from a branch misprediction.
    unsigned MispredictPenalty;
@@ -196,6 +206,7 @@ public:
                    MinLatency(DefaultMinLatency),
                    LoadLatency(DefaultLoadLatency),
                    HighLatency(DefaultHighLatency),
+                  ILPWindow(DefaultILPWindow),
                    MispredictPenalty(DefaultMispredictPenalty),
                    ProcID(0), ProcResourceTable(0), SchedClassTable(0),
                    NumProcResourceKinds(0), NumSchedClasses(0),
@@ -205,12 +216,12 @@ public:
    }
  
    // Table-gen driven ctor.
-  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp,
-               unsigned pi, const MCProcResourceDesc *pr,
+  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp,
+               unsigned mp, unsigned pi, const MCProcResourceDesc *pr,
                 const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
                 const InstrItinerary *ii):
      IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
-    MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
+    ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
      SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
      InstrItineraries(ii) {}
  
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td

index 0da82fdd8971849fa172824ea9618a3ab382edf7..b7920bae8a63345a2fd1c8e3438cbd94092175c2 100644 (file)
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -76,6 +76,7 @@ class SchedMachineModel {
    int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
    int MinLatency = -1; // Determines which instrucions are allowed in a group.
                         // (-1) inorder (0) ooo, (1): inorder +var latencies.
+  int ILPWindow = -1;  // Cycles of latency likely hidden by hardware buffers.
    int LoadLatency = -1; // Cycles for loads to access the cache.
    int HighLatency = -1; // Approximation of cycles for "high latency" ops.
    int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp

index 117b2bdccf331f13e4b569afb3f20ba1264a46a0..a32df7805bfee6fd5bc5c30f763bd8bfe0dd9919 100644 (file)
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -48,15 +48,6 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
  static bool ViewMISchedDAGs = false;
  #endif // NDEBUG
  
-// Threshold to very roughly model an out-of-order processor's instruction
-// buffers. If the actual value of this threshold matters much in practice, then
-// it can be specified by the machine model. For now, it's an experimental
-// tuning knob to determine when and if it matters.
-static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
-  cl::desc("Allow expected latency to exceed the critical path by N cycles "
-           "before attempting to balance ILP"),
-  cl::init(10U));
-
  // Experimental heuristics
  static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
    cl::desc("Enable load clustering."), cl::init(true));
@@ -1297,7 +1288,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
      if (L > RemLatency)
        RemLatency = L;
    }
-  if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow
+  unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
+  if (RemLatency + ExpectedLatency >= CriticalPathLimit
        && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
      Policy.ReduceLatency = true;
      DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n');
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td

index 404634fee989ba883b5b2eb6032e8aa9060c2ec1..4191931a5ad3aa42d38d1e7fb4f0a8225ab783e3 100644 (file)
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1887,6 +1887,9 @@ def CortexA9Model : SchedMachineModel {
    let LoadLatency = 2; // Optimistic load latency assuming bypass.
                         // This is overriden by OperandCycles if the
                         // Itineraries are queried instead.
+  let ILPWindow = 10; // Don't reschedule small blocks to hide
+                      // latency. Minimum latency requirements are already
+                      // modeled strictly by reserving resources.
    let MispredictPenalty = 8; // Based on estimate of pipeline depth.
  
    let Itineraries = CortexA9Itineraries;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td

index c14407f9ac1b31631809f1f73cce4a4d81a42ea5..d99d085298f159c44a428b359f17ac77e897896a 100644 (file)
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass;
  // latencies. Since these latencies are not used for pipeline hazards,
  // they do not need to be exact.
  //
+// ILPWindow=10 is an arbitrary threshold that approximates cycles of
+// latency hidden by instruction buffers. The actual value is not very
+// important but should be zero for inorder and nonzero for OOO processors.
+//
  // The GenericModel contains no instruciton itineraries.
  def GenericModel : SchedMachineModel {
    let IssueWidth = 4;
    let MinLatency = 0;
    let LoadLatency = 4;
    let HighLatency = 10;
+  let ILPWindow = 10;
  }
  
  include "X86ScheduleAtom.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td

index 87102614cc8bb05d3335612d07c125b9656d9fc6..1e5f2d6c9a53020be75e0d2ebb6227ef87e7aa8c 100644 (file)
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel {
                         // OperandCycles may be used for expected latency.
    let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
    let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+  let ILPWindow = 0; // Always try to hide expected latency.
  
    let Itineraries = AtomItineraries;
  }
diff --git a/test/CodeGen/ARM/misched-inorder-latency.ll b/test/CodeGen/ARM/misched-inorder-latency.ll

deleted file mode 100644 (file)

index 8c06b4c..0000000
--- a/test/CodeGen/ARM/misched-inorder-latency.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \
-; RUN:          -pre-RA-sched=source -scheditins=false -ilp-window=0 \
-; RUN:          -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s
-;
-; For these tests, we set -ilp-window=0 to simulate in order processor.
-
-; %val1 is a 3-cycle load live out of %entry. It should be hoisted
-; above the add.
-; CHECK: @testload
-; CHECK: %entry
-; CHECK: ldr
-; CHECK: adds
-; CHECK: bne
-; CHECK: %true
-define i32 @testload(i32 *%ptr, i32 %sumin) {
-entry:
-  %sum1 = add i32 %sumin, 1
-  %val1 = load i32* %ptr
-  %p = icmp eq i32 %sumin, 0
-  br i1 %p, label %true, label %end
-true:
-  %sum2 = add i32 %sum1, 1
-  %ptr2 = getelementptr i32* %ptr, i32 1
-  %val = load i32* %ptr2
-  %val2 = add i32 %val1, %val
-  br label %end
-end:
-  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
-  %summerge = phi i32 [ %sum1, %entry], [ %sum2, %true ]
-  %sumout = add i32 %valmerge, %summerge
-  ret i32 %sumout
-}
-
-; The prefetch gets a default latency of 3 cycles and should be hoisted
-; above the add.
-;
-; CHECK: @testprefetch
-; CHECK: %entry
-; CHECK: pld
-; CHECK: adds
-; CHECK: bx
-define i32 @testprefetch(i8 *%ptr, i32 %i) {
-entry:
-  %tmp = add i32 %i, 1
-  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
-  ret i32 %tmp
-}
-declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
diff --git a/test/CodeGen/PowerPC/misched-inorder-latency.ll b/test/CodeGen/PowerPC/misched-inorder-latency.ll

new file mode 100644 (file)

index 0000000..8fae7ad
--- /dev/null
+++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \
+; RUN:          -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s
+;
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; %val1 is a load live out of %entry. It should be hoisted
+; above the add.
+; CHECK: testload:
+; CHECK: %entry
+; CHECK: lwz
+; CHECK: addi
+; CHECK: bne
+; CHECK: %true
+define i32 @testload(i32 *%ptr, i32 %sumin) {
+entry:
+  %sum1 = add i32 %sumin, 1
+  %val1 = load i32* %ptr
+  %p = icmp eq i32 %sumin, 0
+  br i1 %p, label %true, label %end
+true:
+  %sum2 = add i32 %sum1, 1
+  %ptr2 = getelementptr i32* %ptr, i32 1
+  %val = load i32* %ptr2
+  %val2 = add i32 %val1, %val
+  br label %end
+end:
+  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
+  %summerge = phi i32 [ %sum1, %entry], [ %sum2, %true ]
+  %sumout = add i32 %valmerge, %summerge
+  ret i32 %sumout
+}
+
+; The prefetch gets a default latency of 3 cycles and should be hoisted
+; above the add.
+;
+; CHECK: testprefetch:
+; CHECK: %entry
+; CHECK: dcbt
+; CHECK: addi
+; CHECK: blr
+define i32 @testprefetch(i8 *%ptr, i32 %i) {
+entry:
+  %val1 = add i32 %i, 1
+  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
+  %p = icmp eq i32 %i, 0
+  br i1 %p, label %true, label %end
+true:
+  %val2 = add i32 %val1, 1
+  br label %end
+end:
+  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
+  ret i32 %valmerge
+}
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp

index 3b7d006fd1dcae7f71990de06ebce6734fb05737..fc8d00dd83f9d3fc914cf4d955216224401ea2a3 100644 (file)
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1108,6 +1108,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
      EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ',');
      EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
      EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
+    EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ',');
      EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
      OS << "  " << PI->Index << ", // Processor ID\n";
      if (PI->hasInstrSchedModel())
author	Andrew Trick <atrick@apple.com>
	Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)
committer	Andrew Trick <atrick@apple.com>
	Wed, 9 Jan 2013 03:36:49 +0000 (03:36 +0000)
include/llvm/CodeGen/TargetSchedule.h		patch \| blob \| history
include/llvm/MC/MCSchedule.h		patch \| blob \| history
include/llvm/Target/TargetSchedule.td		patch \| blob \| history
lib/CodeGen/MachineScheduler.cpp		patch \| blob \| history
lib/Target/ARM/ARMScheduleA9.td		patch \| blob \| history
lib/Target/X86/X86Schedule.td		patch \| blob \| history
lib/Target/X86/X86ScheduleAtom.td		patch \| blob \| history
test/CodeGen/ARM/misched-inorder-latency.ll	[deleted file]	patch \| blob \| history
test/CodeGen/PowerPC/misched-inorder-latency.ll	[new file with mode: 0644]	patch \| blob
utils/TableGen/SubtargetEmitter.cpp		patch \| blob \| history