Fix r115332: correctly model AGU / NEON mux.
[oota-llvm.git] / lib / Target / ARM / ARMScheduleA9.td
1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the itinerary class data for the ARM Cortex A9 processors.
11 //
12 //===----------------------------------------------------------------------===//
13
14 //
15 // Ad-hoc scheduling information derived from the rather vague "Cortex-A9
16 // Technical Reference Manual".
17 //
18 // Functional units (the resources occupied by the InstrStage entries below)
19 def A9_Pipe0   : FuncUnit; // pipeline 0 (multiplies occupy it for 2-3 cycles)
20 def A9_Pipe1   : FuncUnit; // pipeline 1 (ld/st and VFP/NEON itineraries go through it)
21 def A9_AGU     : FuncUnit; // Address generation unit for ld / st
22 def A9_NPipe   : FuncUnit; // NEON ALU/MUL pipeline (also used by the VFP itineraries)
23 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side (artificial unit)
24 def A9_DRegsN  : FuncUnit; // FP register set, NEON side (artificial unit)
25 def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer (taken by ld/st and VFP/NEON)
26
27 // Bypasses
28 def A9_LdBypass : Bypass; // listed on load results and on ALU/MVN/compare source operands
29
30 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
31 //
32 def CortexA9Itineraries : ProcessorItineraries<
33   [A9_Pipe0, A9_Pipe1, A9_AGU, A9_NPipe, A9_DRegsVFP, A9_DRegsN, A9_MUX0],
34   [A9_LdBypass], [
35   // Two fully-pipelined integer ALU pipelines
36
37   //
38   // Move instructions, unconditional
39   InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
40   InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
41   InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
42   InstrItinData<IIC_iMOVsr  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
43   InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
44                                InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
45   //
46   // MVN instructions
47   InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
48                               [1]>,
49   InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
50                               [1, 1], [NoBypass, A9_LdBypass]>,
51   InstrItinData<IIC_iMVNsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
52                               [2, 1]>,
53   InstrItinData<IIC_iMVNsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
54                               [3, 1, 1]>,
55   //
56   // No operand cycles
57   InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
58   //
59   // Binary Instructions that produce a result
60   InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
61                             [1, 1], [NoBypass, A9_LdBypass]>,
62   InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
63                             [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
64   InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
65                             [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
66   InstrItinData<IIC_iALUsir,[InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
67                             [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
68   InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
69                             [3, 1, 1, 1],
70                             [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
71   //
72   // Bitwise Instructions that produce a result
73   InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
74   InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
75   InstrItinData<IIC_iBITsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
76   InstrItinData<IIC_iBITsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1, 1]>,
77   //
78   // Unary Instructions that produce a result
79
80   // CLZ, RBIT, etc.
81   InstrItinData<IIC_iUNAr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
82
83   // BFC, BFI, UBFX, SBFX
84   InstrItinData<IIC_iUNAsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
85
86   //
87   // Zero and sign extension instructions
88   InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
89   InstrItinData<IIC_iEXTAr, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1]>,
90   InstrItinData<IIC_iEXTAsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],[3, 1, 1, 1]>,
91   //
92   // Compare instructions
93   InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
94                               [1], [A9_LdBypass]>,
95   InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
96                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
97   InstrItinData<IIC_iCMPsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
98                               [1, 1], [A9_LdBypass, NoBypass]>,
99   InstrItinData<IIC_iCMPsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
100                               [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
101   //
102   // Test instructions
103   InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
104   InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
105   InstrItinData<IIC_iTSTsi  , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
106   InstrItinData<IIC_iTSTsr  , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
107   //
108   // Move instructions, conditional
109   // FIXME: Correctly model the extra input dep on the destination.
110   InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
111   InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
112   InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
113   InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
114
115   // Integer multiply pipeline
116   //
117   InstrItinData<IIC_iMUL16   , [InstrStage<1, [A9_Pipe1], 0>,
118                                 InstrStage<2, [A9_Pipe0]>], [3, 1, 1]>,
119   InstrItinData<IIC_iMAC16   , [InstrStage<1, [A9_Pipe1], 0>,
120                                 InstrStage<2, [A9_Pipe0]>], [3, 1, 1, 1]>,
121   InstrItinData<IIC_iMUL32   , [InstrStage<1, [A9_Pipe1], 0>,
122                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
123   InstrItinData<IIC_iMAC32   , [InstrStage<1, [A9_Pipe1], 0>,
124                                 InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 1]>,
125   InstrItinData<IIC_iMUL64   , [InstrStage<1, [A9_Pipe1], 0>,
126                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
127   InstrItinData<IIC_iMAC64   , [InstrStage<1, [A9_Pipe1], 0>,
128                                 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
129   // Integer load pipeline
130   // FIXME: The timings are rough approximations.
131   //
132   // Immediate offset
133   InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Pipe1]>,
134                                  InstrStage<1, [A9_MUX0], 0>,
135                                  InstrStage<1, [A9_AGU]>],
136                                 [3, 1], [A9_LdBypass]>,
137   InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Pipe1]>,
138                                  InstrStage<1, [A9_MUX0], 0>,
139                                  InstrStage<2, [A9_AGU]>],
140                                 [4, 1], [A9_LdBypass]>,
141   // FIXME: If the address is 64-bit aligned, the AGU needs only 1 cycle.
142   InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Pipe1]>,
143                                  InstrStage<1, [A9_MUX0], 0>,
144                                  InstrStage<2, [A9_AGU]>],
145                                 [3, 3, 1], [A9_LdBypass]>,
146   //
147   // Register offset
148   InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Pipe1]>,
149                                  InstrStage<1, [A9_MUX0], 0>,
150                                  InstrStage<1, [A9_AGU]>],
151                                 [3, 1, 1], [A9_LdBypass]>,
152   InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Pipe1]>,
153                                  InstrStage<1, [A9_MUX0], 0>,
154                                  InstrStage<2, [A9_AGU]>],
155                                 [4, 1, 1], [A9_LdBypass]>,
156   InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Pipe1]>,
157                                  InstrStage<1, [A9_MUX0], 0>,
158                                  InstrStage<2, [A9_AGU]>],
159                                 [3, 3, 1, 1], [A9_LdBypass]>,
160   //
161   // Scaled register offset
162   InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Pipe1]>,
163                                  InstrStage<1, [A9_MUX0], 0>,
164                                  InstrStage<1, [A9_AGU]>],
165                                 [4, 1, 1], [A9_LdBypass]>,
166   InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Pipe1]>,
167                                  InstrStage<1, [A9_MUX0], 0>,
168                                  InstrStage<2, [A9_AGU]>],
169                                 [5, 1, 1], [A9_LdBypass]>,
170   //
171   // Immediate offset with update
172   InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Pipe1]>,
173                                  InstrStage<1, [A9_MUX0], 0>,
174                                  InstrStage<1, [A9_AGU]>],
175                                 [3, 2, 1], [A9_LdBypass]>,
176   InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Pipe1]>,
177                                  InstrStage<1, [A9_MUX0], 0>,
178                                  InstrStage<2, [A9_AGU]>],
179                                 [4, 3, 1], [A9_LdBypass]>,
180   //
181   // Register offset with update
182   InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Pipe1]>,
183                                  InstrStage<1, [A9_MUX0], 0>,
184                                  InstrStage<1, [A9_AGU]>],
185                                 [3, 2, 1, 1], [A9_LdBypass]>,
186   InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Pipe1]>,
187                                  InstrStage<1, [A9_MUX0], 0>,
188                                  InstrStage<2, [A9_AGU]>],
189                                 [4, 3, 1, 1], [A9_LdBypass]>,
190   InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Pipe1]>,
191                                  InstrStage<1, [A9_MUX0], 0>,
192                                  InstrStage<2, [A9_AGU]>],
193                                 [3, 3, 1, 1], [A9_LdBypass]>,
194   //
195   // Scaled register offset with update
196   InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Pipe1]>,
197                                  InstrStage<1, [A9_MUX0], 0>,
198                                  InstrStage<1, [A9_AGU]>],
199                                 [4, 3, 1, 1], [A9_LdBypass]>,
200   InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Pipe1]>,
201                                   InstrStage<1, [A9_MUX0], 0>,
202                                   InstrStage<2, [A9_AGU]>],
203                                  [5, 4, 1, 1], [A9_LdBypass]>,
204   //
205   // Load multiple
206   InstrItinData<IIC_iLoadm   , [InstrStage<1, [A9_Pipe1]>,
207                                 InstrStage<1, [A9_MUX0], 0>,
208                                 InstrStage<2, [A9_AGU]>],
209                                [3], [A9_LdBypass]>,
210
211   //
212   // Load multiple plus branch
213   InstrItinData<IIC_iLoadmBr , [InstrStage<1, [A9_Pipe1]>,
214                                 InstrStage<1, [A9_MUX0], 0>,
215                                 InstrStage<1, [A9_AGU]>,
216                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
217
218   //
219   // iLoadi + iALUr for t2LDRpci_pic.
220   InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Pipe1]>,
221                                 InstrStage<1, [A9_MUX0], 0>,
222                                 InstrStage<1, [A9_AGU]>,
223                                 InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
224                                [2, 1]>,
225
226   // Integer store pipeline
227   //
228   // Immediate offset
229   InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Pipe1]>,
230                                  InstrStage<1, [A9_MUX0], 0>,
231                                  InstrStage<1, [A9_AGU]>], [1, 1]>,
232   InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Pipe1]>,
233                                  InstrStage<1, [A9_MUX0], 0>,
234                                  InstrStage<2, [A9_AGU]>], [1, 1]>,
235   // FIXME: If the address is 64-bit aligned, the AGU needs only 1 cycle.
236   InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Pipe1]>,
237                                  InstrStage<1, [A9_MUX0], 0>,
238                                  InstrStage<2, [A9_AGU]>], [1, 1]>,
239   //
240   // Register offset
241   InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Pipe1]>,
242                                  InstrStage<1, [A9_MUX0], 0>,
243                                  InstrStage<1, [A9_AGU]>], [1, 1, 1]>,
244   InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Pipe1]>,
245                                  InstrStage<1, [A9_MUX0], 0>,
246                                  InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
247   InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Pipe1]>,
248                                  InstrStage<1, [A9_MUX0], 0>,
249                                  InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
250   //
251   // Scaled register offset
252   InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Pipe1]>,
253                                  InstrStage<1, [A9_MUX0], 0>,
254                                  InstrStage<1, [A9_AGU]>], [1, 1, 1]>,
255   InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Pipe1]>,
256                                   InstrStage<1, [A9_MUX0], 0>,
257                                   InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
258   //
259   // Immediate offset with update
260   InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Pipe1]>,
261                                  InstrStage<1, [A9_MUX0], 0>,
262                                  InstrStage<1, [A9_AGU]>], [2, 1, 1]>,
263   InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Pipe1]>,
264                                   InstrStage<1, [A9_MUX0], 0>,
265                                   InstrStage<2, [A9_AGU]>], [3, 1, 1]>,
266   //
267   // Register offset with update
268   InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Pipe1]>,
269                                  InstrStage<1, [A9_MUX0], 0>,
270                                  InstrStage<1, [A9_AGU]>],
271                                 [2, 1, 1, 1]>,
272   InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Pipe1]>,
273                                   InstrStage<1, [A9_MUX0], 0>,
274                                   InstrStage<2, [A9_AGU]>],
275                                  [3, 1, 1, 1]>,
276   InstrItinData<IIC_iStore_d_ru,[InstrStage<1, [A9_Pipe1]>,
277                                  InstrStage<1, [A9_MUX0], 0>,
278                                  InstrStage<2, [A9_AGU]>],
279                                 [3, 1, 1, 1]>,
280   //
281   // Scaled register offset with update
282   InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Pipe1]>,
283                                  InstrStage<1, [A9_MUX0], 0>,
284                                  InstrStage<1, [A9_AGU]>],
285                                 [2, 1, 1, 1]>,
286   InstrItinData<IIC_iStore_bh_siu,[InstrStage<1, [A9_Pipe1]>,
287                                    InstrStage<1, [A9_MUX0], 0>,
288                                    InstrStage<2, [A9_AGU]>],
289                                   [3, 1, 1, 1]>,
290   //
291   // Store multiple
292   InstrItinData<IIC_iStorem  , [InstrStage<1, [A9_Pipe1]>,
293                                 InstrStage<1, [A9_MUX0], 0>,
294                                 InstrStage<1, [A9_AGU]>]>,
295   // Branch
296   //
297   // no delay slots, so the latency of a branch is unimportant
298   InstrItinData<IIC_Br       , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
299
300   // VFP and NEON share the same register file. This means that every VFP
301   // instruction should wait for full completion of the preceding NEON
302   // instruction and vice-versa. We model this behavior with two artificial FUs:
303   // DRegsVFP and DRegsN.
304   //
305   // Every VFP instruction:
306   //  - Acquires DRegsVFP resource for 1 cycle
307   //  - Reserves DRegsN resource for the whole duration (including time to
308   //    register file writeback!).
309   // Every NEON instruction does the same but with FUs swapped.
310   //
311   // Since the reserved FU cannot be acquired, this models precisely
312   // "cross-domain" stalls.
313
314   // VFP
315   // Issue through integer pipeline, and execute in NEON unit.
316
317   // FP Special Register to Integer Register File Move
318   InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
319                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
320                               InstrStage<1, [A9_Pipe1]>,
321                               InstrStage<1, [A9_MUX0], 0>,
322                               InstrStage<1, [A9_NPipe]>]>,
323   //
324   // Single-precision FP Unary
325   InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
326                                // Extra latency cycles since wbck is 2 cycles
327                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
328                                InstrStage<1, [A9_Pipe1]>,
329                                InstrStage<1, [A9_MUX0], 0>,
330                                InstrStage<1, [A9_NPipe]>],
331                               [1, 1]>,
332   //
333   // Double-precision FP Unary
334   InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
335                                // Extra latency cycles since wbck is 2 cycles
336                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
337                                InstrStage<1, [A9_Pipe1]>,
338                                InstrStage<1, [A9_MUX0], 0>,
339                                InstrStage<1, [A9_NPipe]>],
340                               [1, 1]>,
341
342   //
343   // Single-precision FP Compare
344   InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
345                                // Extra latency cycles since wbck is 4 cycles
346                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
347                                InstrStage<1, [A9_Pipe1]>,
348                                InstrStage<1, [A9_MUX0], 0>,
349                                InstrStage<1, [A9_NPipe]>],
350                               [1, 1]>,
351   //
352   // Double-precision FP Compare
353   InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
354                                // Extra latency cycles since wbck is 4 cycles
355                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
356                                InstrStage<1, [A9_Pipe1]>,
357                                InstrStage<1, [A9_MUX0], 0>,
358                                InstrStage<1, [A9_NPipe]>],
359                               [1, 1]>,
360   //
361   // Single to Double FP Convert
362   InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
363                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
364                                InstrStage<1, [A9_Pipe1]>,
365                                InstrStage<1, [A9_MUX0], 0>,
366                                InstrStage<1, [A9_NPipe]>],
367                               [4, 1]>,
368   //
369   // Double to Single FP Convert
370   InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
371                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
372                                InstrStage<1, [A9_Pipe1]>,
373                                InstrStage<1, [A9_MUX0], 0>,
374                                InstrStage<1, [A9_NPipe]>],
375                               [4, 1]>,
376
377   //
378   // Single to Half FP Convert
379   InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
380                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
381                                InstrStage<1, [A9_Pipe1]>,
382                                InstrStage<1, [A9_MUX0], 0>,
383                                InstrStage<1, [A9_NPipe]>],
384                               [4, 1]>,
385   //
386   // Half to Single FP Convert
387   InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
388                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
389                                InstrStage<1, [A9_Pipe1]>,
390                                InstrStage<1, [A9_MUX0], 0>,
391                                InstrStage<1, [A9_NPipe]>],
392                               [2, 1]>,
393
394   //
395   // Single-Precision FP to Integer Convert
396   InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
397                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
398                                InstrStage<1, [A9_Pipe1]>,
399                                InstrStage<1, [A9_MUX0], 0>,
400                                InstrStage<1, [A9_NPipe]>],
401                               [4, 1]>,
402   //
403   // Double-Precision FP to Integer Convert
404   InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
405                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
406                                InstrStage<1, [A9_Pipe1]>,
407                                InstrStage<1, [A9_MUX0], 0>,
408                                InstrStage<1, [A9_NPipe]>],
409                               [4, 1]>,
410   //
411   // Integer to Single-Precision FP Convert
412   InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
413                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
414                                InstrStage<1, [A9_Pipe1]>,
415                                InstrStage<1, [A9_MUX0], 0>,
416                                InstrStage<1, [A9_NPipe]>],
417                               [4, 1]>,
418   //
419   // Integer to Double-Precision FP Convert
420   InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
421                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
422                                InstrStage<1, [A9_Pipe1]>,
423                                InstrStage<1, [A9_MUX0], 0>,
424                                InstrStage<1, [A9_NPipe]>],
425                               [4, 1]>,
426   //
427   // Single-precision FP ALU
428   InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
429                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
430                                InstrStage<1, [A9_Pipe1]>,
431                                InstrStage<1, [A9_MUX0], 0>,
432                                InstrStage<1, [A9_NPipe]>],
433                               [4, 1, 1]>,
434   //
435   // Double-precision FP ALU
436   InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
437                                InstrStage<5, [A9_DRegsN],   0, Reserved>,
438                                InstrStage<1, [A9_Pipe1]>,
439                                InstrStage<1, [A9_MUX0], 0>,
440                                InstrStage<1, [A9_NPipe]>],
441                               [4, 1, 1]>,
442   //
443   // Single-precision FP Multiply
444   InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
445                                InstrStage<6, [A9_DRegsN],   0, Reserved>,
446                                InstrStage<1, [A9_Pipe1]>,
447                                InstrStage<1, [A9_MUX0], 0>,
448                                InstrStage<1, [A9_NPipe]>],
449                               [5, 1, 1]>,
450   //
451   // Double-precision FP Multiply
452   InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
453                                InstrStage<7, [A9_DRegsN],   0, Reserved>,
454                                InstrStage<1, [A9_Pipe1]>,
455                                InstrStage<1, [A9_MUX0], 0>,
456                                InstrStage<2, [A9_NPipe]>],
457                               [6, 1, 1]>,
458   //
459   // Single-precision FP MAC
460   InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
461                                InstrStage<9, [A9_DRegsN],   0, Reserved>,
462                                InstrStage<1, [A9_Pipe1]>,
463                                InstrStage<1, [A9_MUX0], 0>,
464                                InstrStage<1, [A9_NPipe]>],
465                               [8, 0, 1, 1]>,
466   //
467   // Double-precision FP MAC
468   InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
469                                InstrStage<10, [A9_DRegsN],  0, Reserved>,
470                                InstrStage<1,  [A9_Pipe1]>,
471                                InstrStage<1,  [A9_MUX0], 0>,
472                                InstrStage<2,  [A9_NPipe]>],
473                               [9, 0, 1, 1]>,
474   //
475   // Single-precision FP DIV
476   InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
477                                InstrStage<16, [A9_DRegsN],  0, Reserved>,
478                                InstrStage<1,  [A9_Pipe1]>,
479                                InstrStage<1,  [A9_MUX0], 0>,
480                                InstrStage<10, [A9_NPipe]>],
481                               [15, 1, 1]>,
482   //
483   // Double-precision FP DIV
484   InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
485                                InstrStage<26, [A9_DRegsN],  0, Reserved>,
486                                InstrStage<1,  [A9_Pipe1]>,
487                                InstrStage<1,  [A9_MUX0], 0>,
488                                InstrStage<20, [A9_NPipe]>],
489                               [25, 1, 1]>,
490   //
491   // Single-precision FP SQRT
492   InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
493                                InstrStage<18, [A9_DRegsN],   0, Reserved>,
494                                InstrStage<1,  [A9_Pipe1]>,
495                                InstrStage<1,  [A9_MUX0], 0>,
496                                InstrStage<13, [A9_NPipe]>],
497                               [17, 1]>,
498   //
499   // Double-precision FP SQRT
500   InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_DRegsVFP], 0, Required>,
501                                InstrStage<33, [A9_DRegsN],   0, Reserved>,
502                                InstrStage<1,  [A9_Pipe1]>,
503                                InstrStage<1,  [A9_MUX0], 0>,
504                                InstrStage<28, [A9_NPipe]>],
505                               [32, 1]>,
506
507   //
508   // Integer to Single-precision Move
509   InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
510                                // Extra 1 latency cycle since wbck is 2 cycles
511                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
512                                InstrStage<1, [A9_Pipe1]>,
513                                InstrStage<1, [A9_MUX0], 0>,
514                                InstrStage<1, [A9_NPipe]>],
515                               [1, 1]>,
516   //
517   // Integer to Double-precision Move
518   InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
519                                // Extra 1 latency cycle since wbck is 2 cycles
520                                InstrStage<3, [A9_DRegsN],   0, Reserved>,
521                                InstrStage<1, [A9_Pipe1]>,
522                                InstrStage<1, [A9_MUX0], 0>,
523                                InstrStage<1, [A9_NPipe]>],
524                               [1, 1, 1]>,
525   //
526   // Single-precision to Integer Move
527   InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
528                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
529                                InstrStage<1, [A9_Pipe1]>,
530                                InstrStage<1, [A9_MUX0], 0>,
531                                InstrStage<1, [A9_NPipe]>],
532                               [1, 1]>,
533   //
534   // Double-precision to Integer Move
535   InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
536                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
537                                InstrStage<1, [A9_Pipe1]>,
538                                InstrStage<1, [A9_MUX0], 0>,
539                                InstrStage<1, [A9_NPipe]>],
540                               [1, 1, 1]>,
541   //
542   // Single-precision FP Load
543   InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
544                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
545                                InstrStage<1, [A9_Pipe1], 0>,
546                                InstrStage<1, [A9_MUX0], 0>,
547                                InstrStage<1, [A9_NPipe]>],
548                               [1, 1]>,
549   //
550   // Double-precision FP Load
551   // FIXME: Result latency is 1 if address is 64-bit aligned.
552   InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
553                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
554                                InstrStage<1, [A9_Pipe1], 0>,
555                                InstrStage<1, [A9_MUX0], 0>,
556                                InstrStage<1, [A9_NPipe]>],
557                               [2, 1]>,
558   //
559   // FP Load Multiple
560   InstrItinData<IIC_fpLoadm,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
561                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
562                                InstrStage<1, [A9_Pipe1], 0>,
563                                InstrStage<1, [A9_MUX0], 0>,
564                                InstrStage<1, [A9_NPipe]>]>,
565   //
566   // Single-precision FP Store
567   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
568                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
569                                InstrStage<1, [A9_Pipe1], 0>,
570                                InstrStage<1, [A9_MUX0], 0>,
571                                InstrStage<1, [A9_NPipe]>],
572                               [1, 1]>,
573   //
574   // Double-precision FP Store
575   InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
576                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
577                                InstrStage<1, [A9_Pipe1], 0>,
578                                InstrStage<1, [A9_MUX0], 0>,
579                                InstrStage<1, [A9_NPipe]>],
580                               [1, 1]>,
581   //
582   // FP Store Multiple
583   InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
584                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
585                                InstrStage<1, [A9_Pipe1], 0>,
586                                InstrStage<1, [A9_MUX0], 0>,
587                                InstrStage<1, [A9_NPipe]>]>,
588   // NEON
589   // Issue through integer pipeline, and execute in NEON unit.
590   // VLD1
591   // FIXME: We don't model this instruction properly
592   InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
593                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
594                                InstrStage<1, [A9_Pipe1], 0>,
595                                InstrStage<1, [A9_MUX0], 0>,
596                                InstrStage<1, [A9_NPipe]>]>,
597   //
598   // VLD2
599   // FIXME: We don't model this instruction properly
600   InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_DRegsN],   0, Required>,
601                                // Extra latency cycles since wbck is 6 cycles
602                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
603                                InstrStage<1, [A9_Pipe1], 0>,
604                                InstrStage<1, [A9_MUX0], 0>,
605                                InstrStage<1, [A9_NPipe]>],
606                               [2, 2, 1]>,
607   //
608   // VLD3
609   // FIXME: We don't model this instruction properly
610   InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_DRegsN],   0, Required>,
611                                // Extra latency cycles since wbck is 6 cycles
612                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
613                                InstrStage<1, [A9_Pipe1], 0>,
614                                InstrStage<1, [A9_MUX0], 0>,
615                                InstrStage<1, [A9_NPipe]>],
616                               [2, 2, 2, 1]>,
617   //
618   // VLD4
619   // FIXME: We don't model this instruction properly
620   InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
621                                // Extra latency cycles since wbck is 6 cycles
622                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
623                                InstrStage<1, [A9_Pipe1], 0>,
624                                InstrStage<1, [A9_MUX0], 0>,
625                                InstrStage<1, [A9_NPipe]>],
626                               [2, 2, 2, 2, 1]>,
627   //
628   // VST
629   // FIXME: We don't model this instruction properly
630   InstrItinData<IIC_VST,      [InstrStage<1, [A9_DRegsN],   0, Required>,
631                                // Extra latency cycles since wbck is 6 cycles
632                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
633                                InstrStage<1, [A9_Pipe1], 0>,
634                                InstrStage<1, [A9_MUX0], 0>,
635                                InstrStage<1, [A9_NPipe]>]>,
636   //
637   // Double-register Integer Unary
638   InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
639                                // Extra latency cycles since wbck is 6 cycles
640                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
641                                InstrStage<1, [A9_Pipe1]>,
642                                InstrStage<1, [A9_MUX0], 0>,
643                                InstrStage<1, [A9_NPipe]>],
644                               [4, 2]>,
645   //
646   // Quad-register Integer Unary
647   InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
648                                // Extra latency cycles since wbck is 6 cycles
649                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
650                                InstrStage<1, [A9_Pipe1]>,
651                                InstrStage<1, [A9_MUX0], 0>,
652                                InstrStage<1, [A9_NPipe]>],
653                               [4, 2]>,
654   //
655   // Double-register Integer Q-Unary
656   InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_DRegsN],   0, Required>,
657                                // Extra latency cycles since wbck is 6 cycles
658                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
659                                InstrStage<1, [A9_Pipe1]>,
660                                InstrStage<1, [A9_MUX0], 0>,
661                                InstrStage<1, [A9_NPipe]>],
662                               [4, 1]>,
663   //
664   // Quad-register Integer Q-Unary
665   InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
666                                // Extra latency cycles since wbck is 6 cycles
667                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
668                                InstrStage<1, [A9_Pipe1]>,
669                                InstrStage<1, [A9_MUX0], 0>,
670                                InstrStage<1, [A9_NPipe]>],
671                               [4, 1]>,
672   //
673   // Double-register Integer Binary
674   InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
675                                // Extra latency cycles since wbck is 6 cycles
676                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
677                                InstrStage<1, [A9_Pipe1]>,
678                                InstrStage<1, [A9_MUX0], 0>,
679                                InstrStage<1, [A9_NPipe]>],
680                               [3, 2, 2]>,
681   //
682   // Quad-register Integer Binary
683   InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
684                                // Extra latency cycles since wbck is 6 cycles
685                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
686                                InstrStage<1, [A9_Pipe1]>,
687                                InstrStage<1, [A9_MUX0], 0>,
688                                InstrStage<1, [A9_NPipe]>],
689                               [3, 2, 2]>,
690   //
691   // Double-register Integer Subtract
692   InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
693                                // Extra latency cycles since wbck is 6 cycles
694                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
695                                InstrStage<1, [A9_Pipe1]>,
696                                InstrStage<1, [A9_MUX0], 0>,
697                                InstrStage<1, [A9_NPipe]>],
698                               [3, 2, 1]>,
699   //
700   // Quad-register Integer Subtract
701   InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
702                                // Extra latency cycles since wbck is 6 cycles
703                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
704                                InstrStage<1, [A9_Pipe1]>,
705                                InstrStage<1, [A9_MUX0], 0>,
706                                InstrStage<1, [A9_NPipe]>],
707                               [3, 2, 1]>,
708   //
709   // Double-register Integer Shift
710   InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
711                                // Extra latency cycles since wbck is 6 cycles
712                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
713                                InstrStage<1, [A9_Pipe1]>,
714                                InstrStage<1, [A9_MUX0], 0>,
715                                InstrStage<1, [A9_NPipe]>],
716                               [3, 1, 1]>,
717   //
718   // Quad-register Integer Shift
719   InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
720                                // Extra latency cycles since wbck is 6 cycles
721                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
722                                InstrStage<1, [A9_Pipe1]>,
723                                InstrStage<1, [A9_MUX0], 0>,
724                                InstrStage<1, [A9_NPipe]>],
725                               [3, 1, 1]>,
726   //
727   // Double-register Integer Shift (4 cycle)
728   InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
729                                // Extra latency cycles since wbck is 6 cycles
730                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
731                                InstrStage<1, [A9_Pipe1]>,
732                                InstrStage<1, [A9_MUX0], 0>,
733                                InstrStage<1, [A9_NPipe]>],
734                               [4, 1, 1]>,
735   //
736   // Quad-register Integer Shift (4 cycle)
737   InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
738                                // Extra latency cycles since wbck is 6 cycles
739                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
740                                InstrStage<1, [A9_Pipe1]>,
741                                InstrStage<1, [A9_MUX0], 0>,
742                                InstrStage<1, [A9_NPipe]>],
743                               [4, 1, 1]>,
744   //
745   // Double-register Integer Binary (4 cycle)
746   InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
747                                // Extra latency cycles since wbck is 6 cycles
748                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
749                                InstrStage<1, [A9_Pipe1]>,
750                                InstrStage<1, [A9_MUX0], 0>,
751                                InstrStage<1, [A9_NPipe]>],
752                               [4, 2, 2]>,
753   //
754   // Quad-register Integer Binary (4 cycle)
755   InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
756                                // Extra latency cycles since wbck is 6 cycles
757                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
758                                InstrStage<1, [A9_Pipe1]>,
759                                InstrStage<1, [A9_MUX0], 0>,
760                                InstrStage<1, [A9_NPipe]>],
761                               [4, 2, 2]>,
762   //
763   // Double-register Integer Subtract (4 cycle)
764   InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_DRegsN],   0, Required>,
765                                // Extra latency cycles since wbck is 6 cycles
766                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
767                                InstrStage<1, [A9_Pipe1]>,
768                                InstrStage<1, [A9_MUX0], 0>,
769                                InstrStage<1, [A9_NPipe]>],
770                               [4, 2, 1]>,
771   //
772   // Quad-register Integer Subtract (4 cycle)
773   InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_DRegsN],   0, Required>,
774                                // Extra latency cycles since wbck is 6 cycles
775                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
776                                InstrStage<1, [A9_Pipe1]>,
777                                InstrStage<1, [A9_MUX0], 0>,
778                                InstrStage<1, [A9_NPipe]>],
779                               [4, 2, 1]>,
780
781   //
782   // Double-register Integer Count
783   InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
784                                // Extra latency cycles since wbck is 6 cycles
785                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
786                                InstrStage<1, [A9_Pipe1]>,
787                                InstrStage<1, [A9_MUX0], 0>,
788                                InstrStage<1, [A9_NPipe]>],
789                               [3, 2, 2]>,
790   //
791   // Quad-register Integer Count
792   // Result written in N3, but that is relative to the last cycle of multicycle,
793   // so we use 4 for those cases
794   InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
795                                // Extra latency cycles since wbck is 7 cycles
796                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
797                                InstrStage<1, [A9_Pipe1]>,
798                                InstrStage<1, [A9_MUX0], 0>,
799                                InstrStage<2, [A9_NPipe]>],
800                               [4, 2, 2]>,
801   //
802   // Double-register Absolute Difference and Accumulate
803   InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
804                                // Extra latency cycles since wbck is 6 cycles
805                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
806                                InstrStage<1, [A9_Pipe1]>,
807                                InstrStage<1, [A9_MUX0], 0>,
808                                InstrStage<1, [A9_NPipe]>],
809                               [6, 3, 2, 1]>,
810   //
811   // Quad-register Absolute Difference and Accumulate
812   InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
813                                // Extra latency cycles since wbck is 6 cycles
814                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
815                                InstrStage<1, [A9_Pipe1]>,
816                                InstrStage<1, [A9_MUX0], 0>,
817                                InstrStage<2, [A9_NPipe]>],
818                               [6, 3, 2, 1]>,
819   //
820   // Double-register Integer Pair Add Long
821   InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
822                                // Extra latency cycles since wbck is 6 cycles
823                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
824                                InstrStage<1, [A9_Pipe1]>,
825                                InstrStage<1, [A9_MUX0], 0>,
826                                InstrStage<1, [A9_NPipe]>],
827                               [6, 3, 1]>,
828   //
829   // Quad-register Integer Pair Add Long
830   InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
831                                // Extra latency cycles since wbck is 6 cycles
832                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
833                                InstrStage<1, [A9_Pipe1]>,
834                                InstrStage<1, [A9_MUX0], 0>,
835                                InstrStage<2, [A9_NPipe]>],
836                               [6, 3, 1]>,
837
838   //
839   // Double-register Integer Multiply (.8, .16)
840   InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
841                                // Extra latency cycles since wbck is 6 cycles
842                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
843                                InstrStage<1, [A9_Pipe1]>,
844                                InstrStage<1, [A9_MUX0], 0>,
845                                InstrStage<1, [A9_NPipe]>],
846                               [6, 2, 2]>,
847   //
848   // Quad-register Integer Multiply (.8, .16)
849   InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
850                                // Extra latency cycles since wbck is 7 cycles
851                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
852                                InstrStage<1, [A9_Pipe1]>,
853                                InstrStage<1, [A9_MUX0], 0>,
854                                InstrStage<2, [A9_NPipe]>],
855                               [7, 2, 2]>,
856
857   //
858   // Double-register Integer Multiply (.32)
859   InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
860                                // Extra latency cycles since wbck is 7 cycles
861                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
862                                InstrStage<1, [A9_Pipe1]>,
863                                InstrStage<1, [A9_MUX0], 0>,
864                                InstrStage<2, [A9_NPipe]>],
865                               [7, 2, 1]>,
866   //
867   // Quad-register Integer Multiply (.32)
868   InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
869                                // Extra latency cycles since wbck is 9 cycles
870                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
871                                InstrStage<1, [A9_Pipe1]>,
872                                InstrStage<1, [A9_MUX0], 0>,
873                                InstrStage<4, [A9_NPipe]>],
874                               [9, 2, 1]>,
875   //
876   // Double-register Integer Multiply-Accumulate (.8, .16)
877   InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN],   0, Required>,
878                                // Extra latency cycles since wbck is 6 cycles
879                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
880                                InstrStage<1, [A9_Pipe1]>,
881                                InstrStage<1, [A9_MUX0], 0>,
882                                InstrStage<1, [A9_NPipe]>],
883                               [6, 3, 2, 2]>,
884   //
885   // Double-register Integer Multiply-Accumulate (.32)
886   InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN],   0, Required>,
887                                // Extra latency cycles since wbck is 7 cycles
888                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
889                                InstrStage<1, [A9_Pipe1]>,
890                                InstrStage<1, [A9_MUX0], 0>,
891                                InstrStage<2, [A9_NPipe]>],
892                               [7, 3, 2, 1]>,
893   //
894   // Quad-register Integer Multiply-Accumulate (.8, .16)
895   InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
896                                // Extra latency cycles since wbck is 7 cycles
897                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
898                                InstrStage<1, [A9_Pipe1]>,
899                                InstrStage<1, [A9_MUX0], 0>,
900                                InstrStage<2, [A9_NPipe]>],
901                               [7, 3, 2, 2]>,
902   //
903   // Quad-register Integer Multiply-Accumulate (.32)
904   InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN],   0, Required>,
905                                // Extra latency cycles since wbck is 9 cycles
906                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
907                                InstrStage<1, [A9_Pipe1]>,
908                                InstrStage<1, [A9_MUX0], 0>,
909                                InstrStage<4, [A9_NPipe]>],
910                               [9, 3, 2, 1]>,
911
912   //
913   // Move
914   InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_DRegsN],   0, Required>,
915                                InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
916                                InstrStage<1, [A9_Pipe1]>,
917                                InstrStage<1, [A9_MUX0], 0>,
918                                InstrStage<1, [A9_NPipe]>],
919                               [1,1]>,
920   //
921   // Move Immediate
922   InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_DRegsN],   0, Required>,
923                                // Extra latency cycles since wbck is 6 cycles
924                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
925                                InstrStage<1, [A9_Pipe1]>,
926                                InstrStage<1, [A9_MUX0], 0>,
927                                InstrStage<1, [A9_NPipe]>],
928                               [3]>,
929   //
930   // Double-register Permute Move
931   InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
932   // FIXME: all latencies are arbitrary, no information is available
933                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
934                                InstrStage<1, [A9_Pipe1]>,
935                                InstrStage<1, [A9_MUX0], 0>,
936                                InstrStage<1, [A9_NPipe]>],
937                               [2, 1]>,
938   //
939   // Quad-register Permute Move
940   InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
941   // FIXME: all latencies are arbitrary, no information is available
942                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
943                                InstrStage<1, [A9_Pipe1]>,
944                                InstrStage<1, [A9_MUX0], 0>,
945                                InstrStage<1, [A9_NPipe]>],
946                               [2, 1]>,
947   //
948   // Integer to Single-precision Move
949   InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
950   // FIXME: all latencies are arbitrary, no information is available
951                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
952                                InstrStage<1, [A9_Pipe1]>,
953                                InstrStage<1, [A9_MUX0], 0>,
954                                InstrStage<1, [A9_NPipe]>],
955                               [2, 1]>,
956   //
957   // Integer to Double-precision Move
958   InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
959   // FIXME: all latencies are arbitrary, no information is available
960                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
961                                InstrStage<1, [A9_Pipe1]>,
962                                InstrStage<1, [A9_MUX0], 0>,
963                                InstrStage<1, [A9_NPipe]>],
964                               [2, 1, 1]>,
965   //
966   // Single-precision to Integer Move
967   InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
968   // FIXME: all latencies are arbitrary, no information is available
969                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
970                                InstrStage<1, [A9_Pipe1]>,
971                                InstrStage<1, [A9_MUX0], 0>,
972                                InstrStage<1, [A9_NPipe]>],
973                               [2, 1]>,
974   //
975   // Double-precision to Integer Move
976   InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_DRegsN],   0, Required>,
977   // FIXME: all latencies are arbitrary, no information is available
978                                InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
979                                InstrStage<1, [A9_Pipe1]>,
980                                InstrStage<1, [A9_MUX0], 0>,
981                                InstrStage<1, [A9_NPipe]>],
982                               [2, 2, 1]>,
983   //
984   // Integer to Lane Move
985   InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN],   0, Required>,
986   // FIXME: all latencies are arbitrary, no information is available
987                                InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
988                                InstrStage<1, [A9_Pipe1]>,
989                                InstrStage<1, [A9_MUX0], 0>,
990                                InstrStage<2, [A9_NPipe]>],
991                               [3, 1, 1]>,
992
993   //
994   // Vector narrow move
995   InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_DRegsN],   0, Required>,
996                                // Extra latency cycles since wbck is 6 cycles
997                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
998                                InstrStage<1, [A9_Pipe1]>,
999                                InstrStage<1, [A9_MUX0], 0>,
1000                                InstrStage<1, [A9_NPipe]>],
1001                               [3, 1]>,
1002   //
1003   // Double-register FP Unary
1004   InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1005                                // Extra latency cycles since wbck is 6 cycles
1006                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1007                                InstrStage<1, [A9_Pipe1]>,
1008                                InstrStage<1, [A9_MUX0], 0>,
1009                                InstrStage<1, [A9_NPipe]>],
1010                               [5, 2]>,
1011   //
1012   // Quad-register FP Unary
1013   // Result written in N5, but that is relative to the last cycle of multicycle,
1014   // so we use 6 for those cases
1015   InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1016                                // Extra latency cycles since wbck is 7 cycles
1017                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1018                                InstrStage<1, [A9_Pipe1]>,
1019                                InstrStage<1, [A9_MUX0], 0>,
1020                                InstrStage<2, [A9_NPipe]>],
1021                               [6, 2]>,
1022   //
1023   // Double-register FP Binary
1024   // FIXME: We're using this itin for many instructions and [2, 2] here is too
1025   // optimistic.
1026   InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1027                                // Extra latency cycles since wbck is 6 cycles
1028                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1029                                InstrStage<1, [A9_Pipe1]>,
1030                                InstrStage<1, [A9_MUX0], 0>,
1031                                InstrStage<1, [A9_NPipe]>],
1032                               [5, 2, 2]>,
1033   //
1034   // Quad-register FP Binary
1035   // Result written in N5, but that is relative to the last cycle of multicycle,
1036   // so we use 6 for those cases
1037   // FIXME: We're using this itin for many instructions and [2, 2] here is too
1038   // optimistic.
1039   InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1040                                // Extra latency cycles since wbck is 7 cycles
1041                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1042                                InstrStage<1, [A9_Pipe1]>,
1043                                InstrStage<1, [A9_MUX0], 0>,
1044                                InstrStage<2, [A9_NPipe]>],
1045                               [6, 2, 2]>,
1046   //
1047   // Double-register FP Multiply-Accumulate
1048   InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1049                                // Extra latency cycles since wbck is 7 cycles
1050                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1051                                InstrStage<1, [A9_Pipe1]>,
1052                                InstrStage<1, [A9_MUX0], 0>,
1053                                InstrStage<2, [A9_NPipe]>],
1054                               [6, 3, 2, 1]>,
1055   //
1056   // Quad-register FP Multiply-Accumulate
1057   // Result written in N9, but that is relative to the last cycle of multicycle,
1058   // so we use 10 for those cases
1059   InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1060                                // Extra latency cycles since wbck is 9 cycles
1061                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1062                                InstrStage<1, [A9_Pipe1]>,
1063                                InstrStage<1, [A9_MUX0], 0>,
1064                                InstrStage<4, [A9_NPipe]>],
1065                               [8, 4, 2, 1]>,
1066   //
1067   // Double-register Reciprocal Step
1068   InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
1069                                // Extra latency cycles since wbck is 7 cycles
1070                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1071                                InstrStage<1, [A9_Pipe1]>,
1072                                InstrStage<1, [A9_MUX0], 0>,
1073                                InstrStage<2, [A9_NPipe]>],
1074                               [6, 2, 2]>,
1075   //
1076   // Quad-register Reciprocal Step
1077   InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
1078                                // Extra latency cycles since wbck is 9 cycles
1079                                InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1080                                InstrStage<1, [A9_Pipe1]>,
1081                                InstrStage<1, [A9_MUX0], 0>,
1082                                InstrStage<4, [A9_NPipe]>],
1083                               [8, 2, 2]>,
1084   //
1085   // Double-register Permute
1086   InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_DRegsN],   0, Required>,
1087                                // Extra latency cycles since wbck is 6 cycles
1088                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1089                                InstrStage<1, [A9_Pipe1]>,
1090                                InstrStage<1, [A9_MUX0], 0>,
1091                                InstrStage<1, [A9_NPipe]>],
1092                               [2, 2, 1, 1]>,
1093   //
1094   // Quad-register Permute
1095   // Result written in N2, but that is relative to the last cycle of multicycle,
1096   // so we use 3 for those cases
1097   InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_DRegsN],   0, Required>,
1098                                // Extra latency cycles since wbck is 7 cycles
1099                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1100                                InstrStage<1, [A9_Pipe1]>,
1101                                InstrStage<1, [A9_MUX0], 0>,
1102                                InstrStage<2, [A9_NPipe]>],
1103                               [3, 3, 1, 1]>,
1104   //
1105   // Quad-register Permute (3 cycle issue)
1106   // Result written in N2, but that is relative to the last cycle of multicycle,
1107   // so we use 4 for those cases
1108   InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_DRegsN],   0, Required>,
1109                                // Extra latency cycles since wbck is 8 cycles
1110                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1111                                InstrStage<1, [A9_Pipe1]>,
1112                                InstrStage<1, [A9_MUX0], 0>,
1113                                InstrStage<3, [A9_NPipe]>],
1114                               [4, 4, 1, 1]>,
1115
1116   //
1117   // Double-register VEXT
1118   InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1119                                // Extra latency cycles since wbck is 6 cycles
1120                                InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
1121                                InstrStage<1, [A9_Pipe1]>,
1122                                InstrStage<1, [A9_MUX0], 0>,
1123                                InstrStage<1, [A9_NPipe]>],
1124                               [2, 1, 1]>,
1125   //
1126   // Quad-register VEXT
1127   InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1128                                // Extra latency cycles since wbck is 7 cycles
1129                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1130                                InstrStage<1, [A9_Pipe1]>,
1131                                InstrStage<1, [A9_MUX0], 0>,
1132                                InstrStage<2, [A9_NPipe]>],
1133                               [3, 1, 1]>,
1134   //
1135   // VTB
1136   InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_DRegsN],   0, Required>,
1137                                // Extra latency cycles since wbck is 7 cycles
1138                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1139                                InstrStage<1, [A9_Pipe1]>,
1140                                InstrStage<1, [A9_MUX0], 0>,
1141                                InstrStage<2, [A9_NPipe]>],
1142                               [3, 2, 1]>,
1143   InstrItinData<IIC_VTB2,     [InstrStage<2, [A9_DRegsN],   0, Required>,
1144                                // Extra latency cycles since wbck is 7 cycles
1145                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1146                                InstrStage<1, [A9_Pipe1]>,
1147                                InstrStage<1, [A9_MUX0], 0>,
1148                                InstrStage<2, [A9_NPipe]>],
1149                               [3, 2, 2, 1]>,
1150   InstrItinData<IIC_VTB3,     [InstrStage<2, [A9_DRegsN],   0, Required>,
1151                                // Extra latency cycles since wbck is 8 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 2 cycles; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 8 cycles but the stage below
                                    // reserves 9 -- confirm which value is intended.
1152                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1153                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1154                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
1155                                InstrStage<3, [A9_NPipe]>],     // 3 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
1156                               [4, 2, 2, 3, 1]>,
1157   InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_DRegsN],   0, Required>,
1158                                // Extra latency cycles since wbck is 8 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 1 cycle; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 8 cycles but the stage below
                                    // reserves 9 -- confirm which value is intended.
                                    // NOTE(review): IIC_VTB2 and IIC_VTB3 hold A9_DRegsN for 2
                                    // cycles while this entry holds it for 1 -- confirm.
1159                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1160                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1161                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
1162                                InstrStage<3, [A9_NPipe]>],     // 3 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
1163                               [4, 2, 2, 3, 3, 1]>,
1164   //
1165   // VTBX
1166   InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1167                                // Extra latency cycles since wbck is 7 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 1 cycle; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 7 cycles but the stage below
                                    // reserves 8 -- confirm which value is intended.
1168                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1169                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1170                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
1171                                InstrStage<2, [A9_NPipe]>],     // 2 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
1172                               [3, 1, 2, 1]>,
1173   InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1174                                // Extra latency cycles since wbck is 7 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 1 cycle; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 7 cycles but the stage below
                                    // reserves 8 -- confirm which value is intended.
1175                                InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1176                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1177                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
1178                                InstrStage<2, [A9_NPipe]>],     // 2 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
1179                               [3, 1, 2, 2, 1]>,
1180   InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1181                                // Extra latency cycles since wbck is 8 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 1 cycle; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 8 cycles but the stage below
                                    // reserves 9 -- confirm which value is intended.
1182                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1183                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1184                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
1185                                InstrStage<3, [A9_NPipe]>],     // 3 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
1186                               [4, 1, 2, 2, 3, 1]>,
1187   InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_DRegsN],   0, Required>,
1188                                // Extra latency cycles since wbck is 8 cycles
                                    // (wbck presumably = writeback -- TODO confirm). The NEON-side
                                    // FP register set above is Required for 1 cycle; the VFP-side
                                    // set below is only Reserved. Both use timeinc 0, which per
                                    // LLVM's InstrStage starts the next stage in the same cycle.
                                    // NOTE(review): the comment says 8 cycles but the stage below
                                    // reserves 9 -- confirm which value is intended.
1189                                InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
1190                                InstrStage<1, [A9_Pipe1]>,      // 1 cycle on pipeline 1
1191                                InstrStage<1, [A9_MUX0], 0>,    // AGU + NEON/FPU mux, timeinc 0
                                    // NOTE(review): IIC_VTBX3 and IIC_VTB4 hold A9_NPipe for 3
                                    // cycles while this entry holds it for 2 -- confirm.
1192                                InstrStage<2, [A9_NPipe]>],     // 2 cycles on the NEON ALU/MUL pipeline
                                   // Operand cycles, one per operand; presumably result first,
                                   // then sources -- TODO confirm against the operand order.
                                   // Last entry of the itinerary list, hence no trailing comma.
1193                               [4, 1, 2, 2, 3, 3, 1]>
1194 ]>;