X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMScheduleA9.td;h=9a1d2227564649f65cc21162e71b6943bc854b8d;hb=47b167dd84902e2cdc4796d68ddcfa5f540a67cd;hp=4a398c46567d82a06f0877bf84304f9096dee0bb;hpb=c47f7d643eee54c087bbe4c9964aa4d5afb7f6fe;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 4a398c46567..9a1d2227564 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -11,6 +11,10 @@ // //===----------------------------------------------------------------------===// +// ===---------------------------------------------------------------------===// +// This section contains legacy support for itineraries. This is +// required until SD and PostRA schedulers are replaced by MachineScheduler. + // // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical // Reference Manual". @@ -50,6 +54,16 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_ALU0, A9_ALU1]>, InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>, + InstrItinData, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [5]>, // // MVN instructions InstrItinData, @@ -123,7 +137,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1], [A9_LdBypass, A9_LdBypass]>, - InstrItinData], + InstrItinData, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1], [A9_LdBypass, NoBypass]>, InstrItinData, InstrStage<3, [A9_ALU0, A9_ALU1]>], @@ -269,7 +284,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple + update, defs are the 1st and 5th operands. InstrItinData, @@ -277,7 +293,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Load multiple plus branch InstrItinData, @@ -286,7 +303,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 2, 1, 1, 3], - [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop, def is the 3rd operand. InstrItinData, @@ -294,7 +312,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU], 1>, InstrStage<2, [A9_LSUnit]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // Pop + branch, def is the 3rd operand. InstrItinData, @@ -303,8 +322,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_LSUnit]>, InstrStage<1, [A9_Branch]>], [1, 1, 3], - [NoBypass, NoBypass, A9_LdBypass]>, - + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData, @@ -398,14 +417,15 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>]>, + InstrStage<2, [A9_LSUnit]>], + [], [], -1>, // dynamic uops // // Store multiple + update InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU], 0>, - InstrStage<2, [A9_LSUnit]>], [2]>, - + InstrStage<2, [A9_LSUnit]>], + [2], [], -1>, // dynamic uops // // Preload InstrItinData], [1, 1]>, @@ -593,6 +613,22 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [9, 1, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // // Single-precision FP DIV InstrItinData, InstrStage<1, [A9_MUX0], 0>, @@ -645,19 +681,19 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1, 1]>, // // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1]>, // // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1, 1]>, // // Single-precision FP Load @@ -680,20 +716,24 @@ def CortexA9Itineraries : ProcessorItineraries< [2, 1]>, // // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // // Single-precision FP Store InstrItinData, @@ -714,169 +754,208 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1]>, // // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops // // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops // NEON // VLD1 - // FIXME: Conservatively assume insufficent alignment. InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // VLD1x2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, // VLD1x3 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x4 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, // VLD1x2u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x3u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1x4u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, // // VLD1ln InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 1>, - InstrStage<3, [A9_LSUnit]>], - [4, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // VLD1lnu InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 1>, - InstrStage<3, [A9_LSUnit]>], - [4, 2, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, // // VLD2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2x2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, // // VLD2ln InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, // // VLD2u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, // // VLD2x2u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, // // VLD2lnu InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 2, 1, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, // // VLD3 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, // // VLD3ln InstrItinData, @@ -891,10 +970,10 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, // // VLD3lnu InstrItinData, @@ -905,94 +984,130 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<5, [A9_LSUnit]>], [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, // + // VLD3dup + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, + // // VLD4 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, // // VLD4ln InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, // // VLD4u InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<10,[A9_DRegsVFP], 0, Reserved>, InstrStage<4, [A9_NPipe], 0>, InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 2, 1]>, + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, // - // VLD4lnu - InstrItinData, + // VLD4dup + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, // - // VST1 - InstrItinData, + // VLD4dupu + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, InstrStage<2, [A9_NPipe], 0>, InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1x2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST1x3 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST1x4 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST1u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST1x2u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST1x3u @@ -1000,44 +1115,44 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST1x4u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST1ln InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 1>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1lnu InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 1>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST2 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2x2 @@ -1053,9 +1168,9 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST2x2u @@ -1071,36 +1186,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2lnu InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST3 InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST3u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST3ln @@ -1125,36 +1240,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4u InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST4ln InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4lnu InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // @@ -1324,11 +1439,11 @@ def CortexA9Itineraries : ProcessorItineraries< [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate - InstrItinData, - InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, // @@ -1611,6 +1726,26 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // + // Double-register Fused FP Multiple-Accumulate + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Fused FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, + // // Double-register Reciprical Step InstrItinData, InstrStage<1, [A9_MUX0], 0>, @@ -1739,3 +1874,656 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler and will eventually replace itineraries. + +class A9WriteLMOpsListType writes> { + list Writes = writes; + SchedMachineModel SchedModel = ?; +} + +// Cortex-A9 machine model for scheduling and other instruction cost heuristics. +def CortexA9Model : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 56; // Based on available renamed registers. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 8; // Based on estimate of pipeline depth. + + let Itineraries = CortexA9Itineraries; + + // FIXME: Many vector operations were never given an itinerary. We + // haven't mapped these to the new model either. + let CompleteModel = 0; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. +// +// The AGU unit has BufferSize=1 so that the latency between operations +// that use it are considered to stall other operations. +// +// The FP unit has BufferSize=0 so that it is a hard dispatch +// hazard. No instruction may be dispatched while the unit is reserved. + +let SchedModel = CortexA9Model in { + +def A9UnitALU : ProcResource<2>; +def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; } +def A9UnitAGU : ProcResource<1> { let BufferSize = 1; } +def A9UnitLS : ProcResource<1>; +def A9UnitFP : ProcResource<1> { let BufferSize = 0; } +def A9UnitB : ProcResource<1>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write types with their resources and latency on A9. + +// Consume an issue slot, but no processor resources. This is useful when all +// other writes associated with the operand have NumMicroOps = 0. +def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; } + +// Write an integer register. +def A9WriteI : SchedWriteRes<[A9UnitALU]>; +// Write an integer shifted-by register +def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; } + +// Basic ALU. +def A9WriteALU : SchedWriteRes<[A9UnitALU]>; +// ALU with operand shifted by immediate. +def : WriteRes { let Latency = 2; } +// ALU with operand shifted by register. +def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; } + +// Multiplication +def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; } +def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5; + let NumMicroOps = 0; } +def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; } +def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4; + let NumMicroOps = 0; } + +// Floating-point +// Only one FP or AGU instruction may issue per cycle. We model this +// by having FP instructions consume the AGU resource. +def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; } +def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; } +def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; } +def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; } +def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; } +def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } +def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; } +def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; } +def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; } +def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; } + +// NEON has an odd mix of latencies. Simply name the write types by latency. +def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; } +def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; } +def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; } +def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; } +def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; } +def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; } +def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; } +def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } +def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; } + +// Reserve A9UnitFP for 2 consecutive cycles. +def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 4; + let ResourceCycles = [2]; +} +def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +// Branches don't have a def operand but still consume resources. +def A9WriteB : SchedWriteRes<[A9UnitB]>; + +// Address generation. +def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; } + +// Load Integer. +def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; } +// Load the upper 32-bits using the same micro-op. +def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3; + let NumMicroOps = 0; } +// Offset shifted by register. +def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; } +// Load (and zero extend) a byte. +def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; } +def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; } + +// Load or Store Float, aligned. +def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; } + +// Store Integer. +def A9WriteS : SchedWriteRes<[A9UnitLS]>; + +//===----------------------------------------------------------------------===// +// Define resources dynamically for load multiple variants. + +// Define helpers for extra latency without consuming resources. +def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; } +foreach NumCycles = 2-8 in { +def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>; +} // foreach NumCycles + +// Define address generation sequences and predicates for 8 flavors of LDMs. +foreach NumAddr = 1-8 in { + +// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive +// latency for instructions that generate multiple loads or stores. +def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>; + +// Define a predicate to select the LDM based on number of memory addresses. +def A9LMAdr#NumAddr#Pred : + SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>; + +} // foreach NumAddr + +// Fall-back for unknown LDMs. +def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">; + +// LDM/VLDM/VLDn address generation latency & resources. +// Dynamically select the A9WriteAdrN sequence using a predicate. +def A9WriteLMAdr : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers. + SchedVar]>; + +// Define LDM Resources. +// These take no issue resource, so they can be combined with other +// writes like WriteB. +// A9WriteLMLo takes a single LS resource and 2 cycles. +def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2; + let NumMicroOps = 0; } +// Assuming aligned access, the upper half of each pair is free with +// the same latency. +def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2; + let NumMicroOps = 0; } +// Each A9WriteL#N variant adds N cycles of latency without consuming +// additional resources. +foreach NumAddr = 1-8 in { +def A9WriteL#NumAddr : WriteSequence< + [A9WriteLMLo, !cast("A9WriteCycle"#NumAddr)]>; +def A9WriteL#NumAddr#Hi : WriteSequence< + [A9WriteLMHi, !cast("A9WriteCycle"#NumAddr)]>; +} + +//===----------------------------------------------------------------------===// +// LDM: Load multiple into 32-bit integer registers. + +def A9WriteLMOpsList : A9WriteLMOpsListType< + [A9WriteL1, A9WriteL1Hi, + A9WriteL2, A9WriteL2Hi, + A9WriteL3, A9WriteL3Hi, + A9WriteL4, A9WriteL4Hi, + A9WriteL5, A9WriteL5Hi, + A9WriteL6, A9WriteL6Hi, + A9WriteL7, A9WriteL7Hi, + A9WriteL8, A9WriteL8Hi]>; + +// A9WriteLM variants expand into a pair of writes for each 64-bit +// value loaded. When the number of registers is odd, the last +// A9WriteLnHi is naturally ignored because the instruction has no +// following def operands. These variants take no issue resource, so +// they may need to be part of a WriteSequence that includes A9WriteIssue. +def A9WriteLM : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDMs, define the maximum number of writes, but only + // make the first two consume resources. + SchedVar]> { + let Variadic = 1; +} + +//===----------------------------------------------------------------------===// +// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support. + +// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources +// so can be used in WriteSequences for in single-issue instructions that +// encapsulate multiple loads. +def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { + let Latency = 1; + let NumMicroOps = 0; +} + +foreach NumAddr = 1-8 in { + +// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops. +def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>; + +// A9WriteLfp1-8 definitions are statically expanded into a sequence of +// A9WriteLfpOps with additive latency that takes a single issue slot. +// Used directly to describe NEON VLDn. +def A9WriteLfp#NumAddr : WriteSequence< + [A9WriteIssue, !cast("A9WriteLfp"#NumAddr#Seq)]>; + +// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for +// permuting loaded values. +def A9WriteLfp#NumAddr#Mov : WriteSequence< + [A9WriteF, !cast("A9WriteLfp"#NumAddr#Seq)]>; + +} // foreach NumAddr + +// Define VLDM/VSTM PreRA resources. +// A9WriteLMfpPreRA are dynamically expanded into the correct +// A9WriteLfp1-8 sequence based on a predicate. This supports the +// preRA VLDM variants in which all 64-bit loads are written to the +// same tuple of either single or double precision registers. +def A9WriteLMfpPreRA : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown VLDM/VSTM PreRA, assume 2xS registers. + SchedVar]>; + +// Define VLDM/VSTM PostRA Resources. +// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency. +def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; } + +foreach NumAddr = 1-8 in { + +// Each A9WriteL#N variant adds N cycles of latency without consuming +// additional resources. +def A9WriteLMfp#NumAddr : WriteSequence< + [A9WriteLMfpLo, !cast("A9WriteCycle"#NumAddr)]>; + +// Assuming aligned access, the upper half of each pair is free with +// the same latency. +def A9WriteLMfp#NumAddr#Hi : WriteSequence< + [A9WriteLMHi, !cast("A9WriteCycle"#NumAddr)]>; + +} // foreach NumAddr + +// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a +// pair of writes for each 64-bit data loaded. When the number of +// registers is odd, the last WriteLMfpnHi is naturally ignored because +// the instruction has no following def operands. + +def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType< + [A9WriteLMfp1, A9WriteLMfp2, // 0-1 + A9WriteLMfp3, A9WriteLMfp4, // 2-3 + A9WriteLMfp5, A9WriteLMfp6, // 4-5 + A9WriteLMfp7, A9WriteLMfp8, // 6-7 + A9WriteLMfp1Hi, // 8-8 + A9WriteLMfp2Hi, A9WriteLMfp2Hi, // 9-10 + A9WriteLMfp3Hi, A9WriteLMfp3Hi, // 11-12 + A9WriteLMfp4Hi, A9WriteLMfp4Hi, // 13-14 + A9WriteLMfp5Hi, A9WriteLMfp5Hi, // 15-16 + A9WriteLMfp6Hi, A9WriteLMfp6Hi, // 17-18 + A9WriteLMfp7Hi, A9WriteLMfp7Hi, // 19-20 + A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22 + +def A9WriteLMfpPostRA : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDMs, define the maximum number of writes, but only + // make the first two consume resources. We are optimizing for the case + // where the operands are DPRs, and this determines the first eight + // types. The remaining eight types are filled to cover the case + // where the operands are SPRs. + SchedVar]> { + let Variadic = 1; +} + +// Distinguish between our multiple MI-level forms of the same +// VLDM/VSTM instructions. +def A9PreRA : SchedPredicate< + "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">; +def A9PostRA : SchedPredicate< + "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">; + +// VLDM represents all destination registers as a single register +// tuple, unlike LDM. So the number of write operands is not variadic. +def A9WriteLMfp : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Resources for other (non-LDM/VLDM) Variants. + +// These mov immediate writers are unconditionally expanded with +// additive latency. +def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>; +def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>; +def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>; + +// Some ALU operations can read loaded integer values one cycle early. +def A9ReadALU : SchedReadAdvance<1, + [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi, + A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4, + A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8, + A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi, + A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>; + +// Read types for operands that are unconditionally read in cycle N +// after the instruction issues, decreases producer latency by N-1. +def A9Read2 : SchedReadAdvance<1>; +def A9Read3 : SchedReadAdvance<2>; +def A9Read4 : SchedReadAdvance<3>; + +//===----------------------------------------------------------------------===// +// Map itinerary classes to scheduler read/write resources per operand. +// +// For ARM, we piggyback scheduler resources on the Itinerary classes +// to avoid perturbing the existing instruction definitions. + +// This table follows the ARM Cortex-A9 Technical Reference Manuals, +// mostly in order. + +def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi, + IIC_iMVNi,IIC_iMVNsi, + IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>; +def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>; +def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>; + +def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>; +def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>; +def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>; + +def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>; +def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>; +def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>; +def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>; +def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>; +def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB +def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>; +def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>; + +// A9WriteHi ignored for MUL32. +def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32, + IIC_iMUL64,IIC_iMAC64]>; +// FIXME: SMLALxx needs itin classes +def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>; + +// TODO: For floating-point ops, we model the pipeline forwarding +// latencies here. WAW latencies are sometimes longer. + +def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI, + IIC_fpUNA32, IIC_fpUNA64, + IIC_fpCMP32, IIC_fpCMP64]>; +def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>; +def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS, + IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI, + IIC_fpALU32, IIC_fpALU64]>; +def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>; +def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>; +def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>; +def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>; +def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>; +def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>; +def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>; +def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>; + +def :ItinRW<[A9WriteB], [IIC_Br]>; + +// A9 PLD is processed in a dedicated unit. +def :ItinRW<[], [IIC_Preload]>; + +// Note: We must assume that loads are aligned, since the machine +// model cannot know this statically and A9 ignores alignment hints. + +// A9WriteAdr consumes AGU regardless address writeback. But it's +// latency is only relevant for users of an updated address. +def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r, + IIC_iLoad_iu,IIC_iLoad_ru]>; +def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>; +def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r, + IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>; +def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>; +def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r, + IIC_iLoad_d_ru]>; +// Store either has no def operands, or the one def for address writeback. +def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r, + IIC_iStore_iu, IIC_iStore_ru, + IIC_iStore_d_i, IIC_iStore_d_r, + IIC_iStore_d_ru]>; +def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu, + IIC_iStore_bh_i, IIC_iStore_bh_r, + IIC_iStore_bh_iu, IIC_iStore_bh_ru]>; +def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>; + +// A9WriteML will be expanded into a separate write for each def +// operand. Address generation consumes resources, but A9WriteLMAdr +// is listed after all def operands, so has no effective latency. +// +// Note: A9WriteLM expands into an even number of def operands. The +// actual number of def operands may be less by one. +def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>; + +// Load multiple with address writeback has an extra def operand in +// front of the loaded registers. +// +// Reuse the load-multiple variants for store-multiple because the +// resources are identical, For stores only the address writeback +// has a def operand so the WriteL latencies are unused. +def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu, + IIC_iStore_m, + IIC_iStore_mu]>; +def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>; +def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>; + +def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>; + +def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>; +def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>; +def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64, + IIC_fpStore_m, IIC_fpStore_mu]>; + +// Note: Unlike VLDM, VLD1 expects the writeback operand after the +// normal writes. +def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u, + IIC_VLD1x2, IIC_VLD1x2u]>; +def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u, + IIC_VLD1x4, IIC_VLD1x4u, + IIC_VLD4dup, IIC_VLD4dupu]>; +def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu, + IIC_VLD2, IIC_VLD2u, + IIC_VLD2dup, IIC_VLD2dupu]>; +def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu, + IIC_VLD2x2, IIC_VLD2x2u, + IIC_VLD2ln, IIC_VLD2lnu]>; +def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u, + IIC_VLD3dup, IIC_VLD3dupu]>; +def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u, + IIC_VLD4ln, IIC_VLD4lnu]>; +def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>; + +// Vector stores use similar resources to vector loads, so use the +// same write types. The address write must be first for stores with +// address writeback. +def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u, + IIC_VST1x2, IIC_VST1x2u, + IIC_VST1ln, IIC_VST1lnu, + IIC_VST2, IIC_VST2u, + IIC_VST2x2, IIC_VST2x2u, + IIC_VST2ln, IIC_VST2lnu]>; +def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u, + IIC_VST1x4, IIC_VST1x4u, + IIC_VST3, IIC_VST3u, + IIC_VST3ln, IIC_VST3lnu, + IIC_VST4, IIC_VST4u, + IIC_VST4ln, IIC_VST4lnu]>; + +// NEON moves. +def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>; +def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>; +def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>; + +// NEON integer arithmetic +// +// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL +def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>; +// VSUB/VMVN/VCLSD/VCLZD/VCNTD +def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>; +// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi. +// ... +// VHADD/VRHADD/VQADD/VTST/VADH/VRADH +def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>; + +// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL +def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>; +// VQNEG/VQABS +def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>; +// VABS +def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>; +// VPADD/VPADDL are mapped later under IIC_SHLi. +// ... +// VCLSQ/VCLZQ/VCNTQ, takes two cycles. +def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>; +// VMOVimm/VMVNimm/VORRimm/VBICimm +def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>; +def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>; +def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>; + +// NEON integer multiply +// +// Note: these don't quite match the timing docs, but they do match +// the original A9 itinerary. +def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>; +def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>; +def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>; +def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>; +def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>; +def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>; +def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>; +def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>; + +// NEON integer shift +// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles. +def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>; +def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>; + +// NEON permute +def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>; +def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2], + [IIC_VPERMQ3, IIC_VEXTQ]>; +def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>; +def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>; +def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>; +def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>; +def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>; +def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>; +def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>; +def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3], + [IIC_VTBX4]>; + +// NEON floating-point +def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>; +def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>; +def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>; +def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>; +def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>; +def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>; +def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>; +def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>; + +// Map SchedRWs that are identical for cortexa9 to existing resources. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : InstRW< [WriteALU], + (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr", + "BICrr")>; +def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>; +def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>; + + +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi", + "MOVCCsr")>; +def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>; +def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm", + "MOV_ga_dyn")>; +def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>; +def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>; + +def : InstRW< [WriteALU], (instregex "SEL")>; + +def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>; + +def : InstRW< [A9WriteM], + (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS", + "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>; +def : InstRW< [A9WriteM, A9WriteMHi], + (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL", + "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB", + "SMLALTT")>; +// FIXME: These instructions used to have NoItinerary. Just copied the one from above. +def : InstRW< [A9WriteM, A9WriteMHi], + (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX", + "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>; + +def : InstRW<[A9WriteM16, A9WriteM16Hi], + (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>; +def : InstRW<[A9WriteM16, A9WriteM16Hi], + (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>; + +def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>; +def : InstRW<[A9WriteLsi], (instregex "LDRrs")>; +def : InstRW<[A9WriteLb], + (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB", + "LDRH", "LDRSH", "LDRSB")>; +def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>; + +def : WriteRes { let Latency = 0; } + +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : SchedAlias; +def : WriteRes { let Latency = 0; let NumMicroOps = 0; } +} // SchedModel = CortexA9Model