[ARM] Add support for Cortex-M7, FPv5-SP and FPv5-DP (LLVM)

[oota-llvm.git] / lib / Target / ARM / ARMScheduleA8.td
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td

index bd5e2749ea25c0e0e88bc029ccc0b7ca9ee19a77..2c6382542ab95e723afed1ba69c2bbdb50ff6c46 100644 (file)
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -71,6 +71,12 @@ def CortexA8Itineraries : ProcessorItineraries<
    InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
    InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LSPipe]>], [5]>,
    //
    // Move instructions, conditional
    InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
@@ -145,28 +151,30 @@ def CortexA8Itineraries : ProcessorItineraries<
    // Load multiple, def is the 5th operand. Pipeline 0 only.
    // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
    InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Pipe0], 0>,
-                                InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [1, 1, 1, 1, 3], [], -1>, // dynamic uops
    //
    // Load multiple + update, defs are the 1st and 5th operands.
    InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
-                                InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [2, 1, 1, 1, 3], [], -1>, // dynamic uops
    //
    // Load multiple plus branch
    InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
                                  InstrStage<3, [A8_LSPipe]>,
                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
-                               [1, 2, 1, 1, 3]>,
+                              [1, 2, 1, 1, 3], [], -1>, // dynamic uops
    //
    // Pop, def is the 3rd operand.
    InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
-                                InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [1, 1, 3], [], -1>, // dynamic uops
    //
    // Push, def is the 3th operand.
    InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
                                  InstrStage<3, [A8_LSPipe]>,
                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
-                               [1, 1, 3]>,
-
+                               [1, 1, 3], [], -1>, // dynamic uops
    //
    // iLoadi + iALUr for t2LDRpci_pic.
    InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
@@ -221,12 +229,13 @@ def CortexA8Itineraries : ProcessorItineraries<
    // Store multiple. Pipeline 0 only.
    // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
    InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
-                                InstrStage<2, [A8_LSPipe]>]>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [], [], -1>, // dynamic uops
    //
    // Store multiple + update
    InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
-                                InstrStage<2, [A8_LSPipe]>], [2]>,
-
+                                InstrStage<2, [A8_LSPipe]>],
+                [2], [], -1>, // dynamic uops
    //
    // Preload
    InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
@@ -318,6 +327,15 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<19, [A8_NPipe], 0>,
                                 InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
    //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
    // Single-precision FP DIV
    InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<20, [A8_NPipe], 0>,
@@ -378,14 +396,16 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<1, [A8_NLSPipe], 0>,
                                 InstrStage<1, [A8_LSPipe]>,
                                 InstrStage<1, [A8_NLSPipe], 0>,
-                               InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 2], [], -1>, // dynamic uops
    //
    // FP Load Multiple + update
    InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<1, [A8_NLSPipe], 0>,
                                 InstrStage<1, [A8_LSPipe]>,
                                 InstrStage<1, [A8_NLSPipe], 0>,
-                               InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 2], [], -1>, // dynamic uops
    //
    // Single-precision FP Store
    InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
@@ -404,15 +424,16 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<1, [A8_NLSPipe], 0>,
                                 InstrStage<1, [A8_LSPipe]>,
                                 InstrStage<1, [A8_NLSPipe], 0>,
-                               InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
    //
    // FP Store Multiple + update
    InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                  InstrStage<1, [A8_NLSPipe], 0>,
                                  InstrStage<1, [A8_LSPipe]>,
                                  InstrStage<1, [A8_NLSPipe], 0>,
-                                InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>,
-
+                                InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 1], [], -1>, // dynamic uops
    // NEON
    // Issue through integer pipeline, and execute in NEON unit.
    //
@@ -465,16 +486,28 @@ def CortexA8Itineraries : ProcessorItineraries<
    //
    // VLD1ln
    InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<3, [A8_NLSPipe], 1>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
                                 InstrStage<3, [A8_LSPipe]>],
                                [3, 1, 1, 1]>,
    //
    // VLD1lnu
    InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<3, [A8_NLSPipe], 1>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
                                 InstrStage<3, [A8_LSPipe]>],
                                [3, 2, 1, 1, 1, 1]>,
    //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
    // VLD2
    InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<2, [A8_NLSPipe], 0>,
@@ -511,6 +544,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<3, [A8_LSPipe]>],
                                [3, 3, 2, 1, 1, 1, 1, 1]>,
    //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1]>,
+  //
    // VLD3
    InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<4, [A8_NLSPipe], 0>,
@@ -535,6 +580,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<5, [A8_LSPipe]>],
                                [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
    //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
+  //
    // VLD4
    InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<4, [A8_NLSPipe], 0>,
@@ -559,6 +616,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<5, [A8_LSPipe]>],
                                [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
    //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
    // VST1
    InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<2, [A8_NLSPipe], 0>,
@@ -609,13 +678,13 @@ def CortexA8Itineraries : ProcessorItineraries<
    //
    // VST1ln
    InstrItinData<IIC_VST1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<2, [A8_NLSPipe], 1>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
                                 InstrStage<2, [A8_LSPipe]>],
                                [1, 1, 1]>,
    //
    // VST1lnu
    InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                               InstrStage<2, [A8_NLSPipe], 1>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
                                 InstrStage<2, [A8_LSPipe]>],
                                [2, 1, 1, 1, 1]>,
    //
@@ -806,6 +875,16 @@ def CortexA8Itineraries : ProcessorItineraries<
    InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
    //
+  // Double-register Fused FP Multiply-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
    // Double-register Reciprical Step
    InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
@@ -978,3 +1057,19 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<1, [A8_NPipe], 0>,
                              InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
  ]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA8Itineraries; // Bind the itinerary table defined above.
+}