1 //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the itinerary class data for the ARM Cortex A9 processors.
12 //===----------------------------------------------------------------------===//
15 // Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
19 def A9_Pipe0 : FuncUnit; // integer issue pipeline 0 (dual issue is modeled as A9_Pipe0 | A9_Pipe1)
20 def A9_Pipe1 : FuncUnit; // integer issue pipeline 1; loads, stores and VFP/NEON ops issue through it
21 def A9_AGU : FuncUnit; // load/store (LS) pipe
22 def A9_NPipe : FuncUnit; // NEON ALU/MUL pipe
23 def A9_DRegsVFP: FuncUnit; // FP register set, VFP side (artificial unit used to model VFP/NEON cross-domain stalls)
24 def A9_DRegsN : FuncUnit; // FP register set, NEON side (artificial unit used to model VFP/NEON cross-domain stalls)
27 def A9_LdBypass : Bypass; // bypass referenced by the integer ALU, compare and load itineraries
29 // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1
31 def CortexA9Itineraries : ProcessorItineraries<
32 [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_AGU, A9_Pipe0, A9_Pipe1],
34 // Two fully-pipelined integer ALU pipelines
37 // Move instructions, unconditional
38 InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
39 InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
40 InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
41 InstrItinData<IIC_iMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
42 InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>,
43 InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>,
46 InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
48 InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
49 [1, 1], [NoBypass, A9_LdBypass]>,
50 InstrItinData<IIC_iMVNsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
52 InstrItinData<IIC_iMVNsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
56 InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
58 // Binary Instructions that produce a result
59 InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
60 [1, 1], [NoBypass, A9_LdBypass]>,
61 InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
62 [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
63 InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
64 [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
65 InstrItinData<IIC_iALUsir,[InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
66 [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
67 InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
69 [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
71 // Bitwise Instructions that produce a result
72 InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
73 InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
74 InstrItinData<IIC_iBITsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
75 InstrItinData<IIC_iBITsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1, 1]>,
77 // Unary Instructions that produce a result
80 InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
82 // BFC, BFI, UBFX, SBFX
83 InstrItinData<IIC_iUNAsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
86 // Zero and sign extension instructions
87 InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>,
88 InstrItinData<IIC_iEXTAr, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [3, 1, 1]>,
89 InstrItinData<IIC_iEXTAsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>],[3, 1, 1, 1]>,
91 // Compare instructions
92 InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
94 InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
95 [1, 1], [A9_LdBypass, A9_LdBypass]>,
96 InstrItinData<IIC_iCMPsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>],
97 [1, 1], [A9_LdBypass, NoBypass]>,
98 InstrItinData<IIC_iCMPsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>],
99 [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
102 InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
103 InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
104 InstrItinData<IIC_iTSTsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
105 InstrItinData<IIC_iTSTsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [1, 1, 1]>,
107 // Move instructions, conditional
108 // FIXME: Correctly model the extra input dep on the destination.
109 InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>,
110 InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
111 InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>,
112 InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>,
114 // Integer multiply pipeline
116 InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Pipe1], 0>,
117 InstrStage<2, [A9_Pipe0]>], [3, 1, 1]>,
118 InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Pipe1], 0>,
119 InstrStage<2, [A9_Pipe0]>], [3, 1, 1, 1]>,
120 InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Pipe1], 0>,
121 InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>,
122 InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Pipe1], 0>,
123 InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 1]>,
124 InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Pipe1], 0>,
125 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
126 InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Pipe1], 0>,
127 InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>,
128 // Integer load pipeline
129 // FIXME: The timings are some rough approximations
132 InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Pipe1]>,
133 InstrStage<1, [A9_AGU]>],
134 [3, 1], [A9_LdBypass]>,
135 InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Pipe1]>,
136 InstrStage<2, [A9_AGU]>],
137 [4, 1], [A9_LdBypass]>,
138 // FIXME: If address is 64-bit aligned, AGU cycles is 1.
139 InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Pipe1]>,
140 InstrStage<2, [A9_AGU]>],
141 [3, 3, 1], [A9_LdBypass]>,
144 InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Pipe1]>,
145 InstrStage<1, [A9_AGU]>],
146 [3, 1, 1], [A9_LdBypass]>,
147 InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Pipe1]>,
148 InstrStage<2, [A9_AGU]>],
149 [4, 1, 1], [A9_LdBypass]>,
150 InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Pipe1]>,
151 InstrStage<2, [A9_AGU]>],
152 [3, 3, 1, 1], [A9_LdBypass]>,
154 // Scaled register offset
155 InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Pipe1]>,
156 InstrStage<1, [A9_AGU]>],
157 [4, 1, 1], [A9_LdBypass]>,
158 InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Pipe1]>,
159 InstrStage<2, [A9_AGU]>],
160 [5, 1, 1], [A9_LdBypass]>,
162 // Immediate offset with update
163 InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Pipe1]>,
164 InstrStage<1, [A9_AGU]>],
165 [3, 2, 1], [A9_LdBypass]>,
166 InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Pipe1]>,
167 InstrStage<2, [A9_AGU]>],
168 [4, 3, 1], [A9_LdBypass]>,
170 // Register offset with update
171 InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Pipe1]>,
172 InstrStage<1, [A9_AGU]>],
173 [3, 2, 1, 1], [A9_LdBypass]>,
174 InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Pipe1]>,
175 InstrStage<2, [A9_AGU]>],
176 [4, 3, 1, 1], [A9_LdBypass]>,
177 InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Pipe1]>,
178 InstrStage<2, [A9_AGU]>],
179 [3, 3, 1, 1], [A9_LdBypass]>,
181 // Scaled register offset with update
182 InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Pipe1]>,
183 InstrStage<1, [A9_AGU]>],
184 [4, 3, 1, 1], [A9_LdBypass]>,
185 InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Pipe1]>,
186 InstrStage<2, [A9_AGU]>],
187 [5, 4, 1, 1], [A9_LdBypass]>,
190 InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe1]>,
191 InstrStage<2, [A9_AGU]>],
195 // Load multiple plus branch
196 InstrItinData<IIC_iLoadmBr , [InstrStage<1, [A9_Pipe1]>,
197 InstrStage<1, [A9_AGU]>,
198 InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
201 // iLoadi + iALUr for t2LDRpci_pic.
202 InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Pipe1]>,
203 InstrStage<1, [A9_AGU]>,
204 InstrStage<1, [A9_Pipe0, A9_Pipe1]>],
207 // Integer store pipeline
210 InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Pipe1]>,
211 InstrStage<1, [A9_AGU]>], [1, 1]>,
212 InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Pipe1]>,
213 InstrStage<2, [A9_AGU]>], [1, 1]>,
214 // FIXME: If address is 64-bit aligned, AGU cycles is 1.
215 InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Pipe1]>,
216 InstrStage<2, [A9_AGU]>], [1, 1]>,
219 InstrItinData<IIC_iStore_r , [InstrStage<1, [ A9_Pipe1]>,
220 InstrStage<1, [A9_AGU]>], [1, 1, 1]>,
221 InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [ A9_Pipe1]>,
222 InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
223 InstrItinData<IIC_iStore_d_r, [InstrStage<1, [ A9_Pipe1]>,
224 InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
226 // Scaled register offset
227 InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Pipe1]>,
228 InstrStage<1, [A9_AGU]>], [1, 1, 1]>,
229 InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Pipe1]>,
230 InstrStage<2, [A9_AGU]>], [1, 1, 1]>,
232 // Immediate offset with update
233 InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Pipe1]>,
234 InstrStage<1, [A9_AGU]>], [2, 1, 1]>,
235 InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Pipe1]>,
236 InstrStage<2, [A9_AGU]>], [3, 1, 1]>,
238 // Register offset with update
239 InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Pipe1]>,
240 InstrStage<1, [A9_AGU]>], [2, 1, 1, 1]>,
241 InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Pipe1]>,
242 InstrStage<2, [A9_AGU]>], [3, 1, 1, 1]>,
243 InstrItinData<IIC_iStore_d_ru,[InstrStage<1, [A9_Pipe1]>,
244 InstrStage<2, [A9_AGU]>], [3, 1, 1, 1]>,
246 // Scaled register offset with update
247 InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Pipe1]>,
248 InstrStage<1, [A9_AGU]>], [2, 1, 1, 1]>,
249 InstrItinData<IIC_iStore_bh_siu,[InstrStage<1, [A9_Pipe1]>,
250 InstrStage<2, [A9_AGU]>], [3, 1, 1, 1]>,
253 InstrItinData<IIC_iStorem , [InstrStage<1, [A9_Pipe1]>,
254 InstrStage<1, [A9_AGU]>]>,
257 // no delay slots, so the latency of a branch is unimportant
258 InstrItinData<IIC_Br , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>,
260 // VFP and NEON share the same register file. This means that every VFP
261 // instruction should wait for full completion of the preceding NEON
262 // instruction and vice versa. We model this behavior with two artificial FUs:
263 // DRegsVFP and DRegsN.
265 // Every VFP instruction:
266 // - Acquires DRegsVFP resource for 1 cycle
267 // - Reserves DRegsN resource for the whole duration (including time to
268 // register file writeback!).
269 // Every NEON instruction does the same but with FUs swapped.
271 // Since the reserved FU cannot be acquired, this models precisely
272 // "cross-domain" stalls.
275 // Issue through integer pipeline, and execute in NEON unit.
277 // FP Special Register to Integer Register File Move
278 InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
279 InstrStage<2, [A9_DRegsN], 0, Reserved>,
280 InstrStage<1, [A9_Pipe1]>,
281 InstrStage<1, [A9_NPipe]>]>,
283 // Single-precision FP Unary
284 InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
285 // Extra latency cycles since wbck is 2 cycles
286 InstrStage<3, [A9_DRegsN], 0, Reserved>,
287 InstrStage<1, [A9_Pipe1]>,
288 InstrStage<1, [A9_NPipe]>], [1, 1]>,
290 // Double-precision FP Unary
291 InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
292 // Extra latency cycles since wbck is 2 cycles
293 InstrStage<3, [A9_DRegsN], 0, Reserved>,
294 InstrStage<1, [A9_Pipe1]>,
295 InstrStage<1, [A9_NPipe]>], [1, 1]>,
298 // Single-precision FP Compare
299 InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
300 // Extra latency cycles since wbck is 4 cycles
301 InstrStage<5, [A9_DRegsN], 0, Reserved>,
302 InstrStage<1, [A9_Pipe1]>,
303 InstrStage<1, [A9_NPipe]>], [1, 1]>,
305 // Double-precision FP Compare
306 InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
307 // Extra latency cycles since wbck is 4 cycles
308 InstrStage<5, [A9_DRegsN], 0, Reserved>,
309 InstrStage<1, [A9_Pipe1]>,
310 InstrStage<1, [A9_NPipe]>], [1, 1]>,
312 // Single to Double FP Convert
313 InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
314 InstrStage<5, [A9_DRegsN], 0, Reserved>,
315 InstrStage<1, [A9_Pipe1]>,
316 InstrStage<1, [A9_NPipe]>], [4, 1]>,
318 // Double to Single FP Convert
319 InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
320 InstrStage<5, [A9_DRegsN], 0, Reserved>,
321 InstrStage<1, [A9_Pipe1]>,
322 InstrStage<1, [A9_NPipe]>], [4, 1]>,
325 // Single to Half FP Convert
326 InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
327 InstrStage<5, [A9_DRegsN], 0, Reserved>,
328 InstrStage<1, [A9_Pipe1]>,
329 InstrStage<1, [A9_NPipe]>], [4, 1]>,
331 // Half to Single FP Convert
332 InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
333 InstrStage<3, [A9_DRegsN], 0, Reserved>,
334 InstrStage<1, [A9_Pipe1]>,
335 InstrStage<1, [A9_NPipe]>], [2, 1]>,
338 // Single-Precision FP to Integer Convert
339 InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
340 InstrStage<5, [A9_DRegsN], 0, Reserved>,
341 InstrStage<1, [A9_Pipe1]>,
342 InstrStage<1, [A9_NPipe]>], [4, 1]>,
344 // Double-Precision FP to Integer Convert
345 InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
346 InstrStage<5, [A9_DRegsN], 0, Reserved>,
347 InstrStage<1, [A9_Pipe1]>,
348 InstrStage<1, [A9_NPipe]>], [4, 1]>,
350 // Integer to Single-Precision FP Convert
351 InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
352 InstrStage<5, [A9_DRegsN], 0, Reserved>,
353 InstrStage<1, [A9_Pipe1]>,
354 InstrStage<1, [A9_NPipe]>], [4, 1]>,
356 // Integer to Double-Precision FP Convert
357 InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
358 InstrStage<5, [A9_DRegsN], 0, Reserved>,
359 InstrStage<1, [A9_Pipe1]>,
360 InstrStage<1, [A9_NPipe]>], [4, 1]>,
362 // Single-precision FP ALU
363 InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
364 InstrStage<5, [A9_DRegsN], 0, Reserved>,
365 InstrStage<1, [A9_Pipe1]>,
366 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
368 // Double-precision FP ALU
369 InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
370 InstrStage<5, [A9_DRegsN], 0, Reserved>,
371 InstrStage<1, [A9_Pipe1]>,
372 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
374 // Single-precision FP Multiply
375 InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
376 InstrStage<6, [A9_DRegsN], 0, Reserved>,
377 InstrStage<1, [A9_Pipe1]>,
378 InstrStage<1, [A9_NPipe]>], [5, 1, 1]>,
380 // Double-precision FP Multiply
381 InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
382 InstrStage<7, [A9_DRegsN], 0, Reserved>,
383 InstrStage<1, [A9_Pipe1]>,
384 InstrStage<2, [A9_NPipe]>], [6, 1, 1]>,
386 // Single-precision FP MAC
387 InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
388 InstrStage<9, [A9_DRegsN], 0, Reserved>,
389 InstrStage<1, [A9_Pipe1]>,
390 InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>,
392 // Double-precision FP MAC
393 InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
394 InstrStage<10, [A9_DRegsN], 0, Reserved>,
395 InstrStage<1, [A9_Pipe1]>,
396 InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>,
398 // Single-precision FP DIV
399 InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
400 InstrStage<16, [A9_DRegsN], 0, Reserved>,
401 InstrStage<1, [A9_Pipe1]>,
402 InstrStage<10, [A9_NPipe]>], [15, 1, 1]>,
404 // Double-precision FP DIV
405 InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>,
406 InstrStage<26, [A9_DRegsN], 0, Reserved>,
407 InstrStage<1, [A9_Pipe1]>,
408 InstrStage<20, [A9_NPipe]>], [25, 1, 1]>,
410 // Single-precision FP SQRT
411 InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
412 InstrStage<18, [A9_DRegsN], 0, Reserved>,
413 InstrStage<1, [A9_Pipe1]>,
414 InstrStage<13, [A9_NPipe]>], [17, 1]>,
416 // Double-precision FP SQRT
417 InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
418 InstrStage<33, [A9_DRegsN], 0, Reserved>,
419 InstrStage<1, [A9_Pipe1]>,
420 InstrStage<28, [A9_NPipe]>], [32, 1]>,
423 // Integer to Single-precision Move
424 InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
425 // Extra 1 latency cycle since wbck is 2 cycles
426 InstrStage<3, [A9_DRegsN], 0, Reserved>,
427 InstrStage<1, [A9_Pipe1]>,
428 InstrStage<1, [A9_NPipe]>], [1, 1]>,
430 // Integer to Double-precision Move
431 InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
432 // Extra 1 latency cycle since wbck is 2 cycles
433 InstrStage<3, [A9_DRegsN], 0, Reserved>,
434 InstrStage<1, [A9_Pipe1]>,
435 InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
437 // Single-precision to Integer Move
438 InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
439 InstrStage<2, [A9_DRegsN], 0, Reserved>,
440 InstrStage<1, [A9_Pipe1]>,
441 InstrStage<1, [A9_NPipe]>], [1, 1]>,
443 // Double-precision to Integer Move
444 InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
445 InstrStage<2, [A9_DRegsN], 0, Reserved>,
446 InstrStage<1, [A9_Pipe1]>,
447 InstrStage<1, [A9_NPipe]>], [1, 1, 1]>,
449 // Single-precision FP Load
450 InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
451 InstrStage<2, [A9_DRegsN], 0, Reserved>,
452 InstrStage<1, [A9_Pipe1], 0>,
453 InstrStage<1, [A9_AGU]>,
454 InstrStage<1, [A9_NPipe]>]>,
456 // Double-precision FP Load
457 InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
458 InstrStage<2, [A9_DRegsN], 0, Reserved>,
459 InstrStage<1, [A9_Pipe1], 0>,
460 InstrStage<1, [A9_AGU]>,
461 InstrStage<1, [A9_NPipe]>]>,
464 InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
465 InstrStage<2, [A9_DRegsN], 0, Reserved>,
466 InstrStage<1, [A9_Pipe1], 0>,
467 InstrStage<1, [A9_AGU]>,
468 InstrStage<1, [A9_NPipe]>]>,
470 // Single-precision FP Store
471 InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
472 InstrStage<2, [A9_DRegsN], 0, Reserved>,
473 InstrStage<1, [A9_Pipe1], 0>,
474 InstrStage<1, [A9_AGU]>,
475 InstrStage<1, [A9_NPipe]>]>,
477 // Double-precision FP Store
478 InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
479 InstrStage<2, [A9_DRegsN], 0, Reserved>,
480 InstrStage<1, [A9_Pipe1], 0>,
481 InstrStage<1, [A9_AGU]>,
482 InstrStage<1, [A9_NPipe]>]>,
485 InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
486 InstrStage<2, [A9_DRegsN], 0, Reserved>,
487 InstrStage<1, [A9_Pipe1], 0>,
488 InstrStage<1, [A9_AGU]>,
489 InstrStage<1, [A9_NPipe]>]>,
491 // Issue through integer pipeline, and execute in NEON unit.
492 // FIXME: Neon pipeline and LdSt unit are multiplexed.
493 // Add some syntactic sugar to model this!
495 // FIXME: We don't model this instruction properly
496 InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>,
497 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
498 InstrStage<1, [A9_Pipe1], 0>,
499 InstrStage<1, [A9_AGU]>,
500 InstrStage<1, [A9_NPipe]>]>,
503 // FIXME: We don't model this instruction properly
504 InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>,
505 // Extra latency cycles since wbck is 6 cycles
506 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
507 InstrStage<1, [A9_Pipe1], 0>,
508 InstrStage<1, [A9_AGU]>,
509 InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
512 // FIXME: We don't model this instruction properly
513 InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>,
514 // Extra latency cycles since wbck is 6 cycles
515 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
516 InstrStage<1, [A9_Pipe1], 0>,
517 InstrStage<1, [A9_AGU]>,
518 InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>,
521 // FIXME: We don't model this instruction properly
522 InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>,
523 // Extra latency cycles since wbck is 6 cycles
524 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
525 InstrStage<1, [A9_Pipe1], 0>,
526 InstrStage<1, [A9_AGU]>,
527 InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>,
530 // FIXME: We don't model this instruction properly
531 InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>,
532 // Extra latency cycles since wbck is 6 cycles
533 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
534 InstrStage<1, [A9_Pipe1], 0>,
535 InstrStage<1, [A9_AGU]>,
536 InstrStage<1, [A9_NPipe]>]>,
538 // Double-register Integer Unary
539 InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
540 // Extra latency cycles since wbck is 6 cycles
541 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
542 InstrStage<1, [A9_Pipe1]>,
543 InstrStage<1, [A9_NPipe]>], [4, 2]>,
545 // Quad-register Integer Unary
546 InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
547 // Extra latency cycles since wbck is 6 cycles
548 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
549 InstrStage<1, [A9_Pipe1]>,
550 InstrStage<1, [A9_NPipe]>], [4, 2]>,
552 // Double-register Integer Q-Unary
553 InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
554 // Extra latency cycles since wbck is 6 cycles
555 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
556 InstrStage<1, [A9_Pipe1]>,
557 InstrStage<1, [A9_NPipe]>], [4, 1]>,
559 // Quad-register Integer Q-Unary
560 InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
561 // Extra latency cycles since wbck is 6 cycles
562 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
563 InstrStage<1, [A9_Pipe1]>,
564 InstrStage<1, [A9_NPipe]>], [4, 1]>,
566 // Double-register Integer Binary
567 InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
568 // Extra latency cycles since wbck is 6 cycles
569 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
570 InstrStage<1, [A9_Pipe1]>,
571 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
573 // Quad-register Integer Binary
574 InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
575 // Extra latency cycles since wbck is 6 cycles
576 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
577 InstrStage<1, [A9_Pipe1]>,
578 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
580 // Double-register Integer Subtract
581 InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
582 // Extra latency cycles since wbck is 6 cycles
583 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
584 InstrStage<1, [A9_Pipe1]>,
585 InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
587 // Quad-register Integer Subtract
588 InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
589 // Extra latency cycles since wbck is 6 cycles
590 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
591 InstrStage<1, [A9_Pipe1]>,
592 InstrStage<1, [A9_NPipe]>], [3, 2, 1]>,
594 // Double-register Integer Shift
595 InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
596 // Extra latency cycles since wbck is 6 cycles
597 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
598 InstrStage<1, [A9_Pipe1]>,
599 InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
601 // Quad-register Integer Shift
602 InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
603 // Extra latency cycles since wbck is 6 cycles
604 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
605 InstrStage<1, [A9_Pipe1]>,
606 InstrStage<1, [A9_NPipe]>], [3, 1, 1]>,
608 // Double-register Integer Shift (4 cycle)
609 InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
610 // Extra latency cycles since wbck is 6 cycles
611 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
612 InstrStage<1, [A9_Pipe1]>,
613 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
615 // Quad-register Integer Shift (4 cycle)
616 InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
617 // Extra latency cycles since wbck is 6 cycles
618 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
619 InstrStage<1, [A9_Pipe1]>,
620 InstrStage<1, [A9_NPipe]>], [4, 1, 1]>,
622 // Double-register Integer Binary (4 cycle)
623 InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
624 // Extra latency cycles since wbck is 6 cycles
625 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
626 InstrStage<1, [A9_Pipe1]>,
627 InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
629 // Quad-register Integer Binary (4 cycle)
630 InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
631 // Extra latency cycles since wbck is 6 cycles
632 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
633 InstrStage<1, [A9_Pipe1]>,
634 InstrStage<1, [A9_NPipe]>], [4, 2, 2]>,
636 // Double-register Integer Subtract (4 cycle); distinct from the 3-cycle IIC_VSUBiD entry
637 InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_DRegsN], 0, Required>,
638 // Extra latency cycles since wbck is 6 cycles
639 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
640 InstrStage<1, [A9_Pipe1]>,
641 InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
643 // Quad-register Integer Subtract (4 cycle); distinct from the 3-cycle IIC_VSUBiQ entry
644 InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
645 // Extra latency cycles since wbck is 6 cycles
646 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
647 InstrStage<1, [A9_Pipe1]>,
648 InstrStage<1, [A9_NPipe]>], [4, 2, 1]>,
651 // Double-register Integer Count
652 InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
653 // Extra latency cycles since wbck is 6 cycles
654 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
655 InstrStage<1, [A9_Pipe1]>,
656 InstrStage<1, [A9_NPipe]>], [3, 2, 2]>,
658 // Quad-register Integer Count
659 // Result written in N3, but that is relative to the last cycle of multicycle,
660 // so we use 4 for those cases
661 InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
662 // Extra latency cycles since wbck is 7 cycles
663 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
664 InstrStage<1, [A9_Pipe1]>,
665 InstrStage<2, [A9_NPipe]>], [4, 2, 2]>,
667 // Double-register Absolute Difference and Accumulate
668 InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>,
669 // Extra latency cycles since wbck is 6 cycles
670 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
671 InstrStage<1, [A9_Pipe1]>,
672 InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>,
674 // Quad-register Absolute Difference and Accumulate
675 InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
676 // Extra latency cycles since wbck is 6 cycles
677 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
678 InstrStage<1, [A9_Pipe1]>,
679 InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
681 // Double-register Integer Pair Add Long
682 InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>,
683 // Extra latency cycles since wbck is 6 cycles
684 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
685 InstrStage<1, [A9_Pipe1]>,
686 InstrStage<1, [A9_NPipe]>], [6, 3, 1]>,
688 // Quad-register Integer Pair Add Long
689 InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
690 // Extra latency cycles since wbck is 6 cycles
691 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
692 InstrStage<1, [A9_Pipe1]>,
693 InstrStage<2, [A9_NPipe]>], [6, 3, 1]>,
696 // Double-register Integer Multiply (.8, .16)
697 InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>,
698 // Extra latency cycles since wbck is 6 cycles
699 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
700 InstrStage<1, [A9_Pipe1]>,
701 InstrStage<1, [A9_NPipe]>], [6, 2, 2]>,
703 // Quad-register Integer Multiply (.8, .16)
704 InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
705 // Extra latency cycles since wbck is 7 cycles
706 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
707 InstrStage<1, [A9_Pipe1]>,
708 InstrStage<2, [A9_NPipe]>], [7, 2, 2]>,
711 // Double-register Integer Multiply (.32)
712 InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>,
713 // Extra latency cycles since wbck is 7 cycles
714 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
715 InstrStage<1, [A9_Pipe1]>,
716 InstrStage<2, [A9_NPipe]>], [7, 2, 1]>,
718 // Quad-register Integer Multiply (.32)
719 InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
720 // Extra latency cycles since wbck is 9 cycles
721 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
722 InstrStage<1, [A9_Pipe1]>,
723 InstrStage<4, [A9_NPipe]>], [9, 2, 1]>,
725 // Double-register Integer Multiply-Accumulate (.8, .16)
726 InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>,
727 // Extra latency cycles since wbck is 6 cycles
728 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
729 InstrStage<1, [A9_Pipe1]>,
730 InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>,
732 // Double-register Integer Multiply-Accumulate (.32)
733 InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>,
734 // Extra latency cycles since wbck is 7 cycles
735 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
736 InstrStage<1, [A9_Pipe1]>,
737 InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>,
739 // Quad-register Integer Multiply-Accumulate (.8, .16)
740 InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
741 // Extra latency cycles since wbck is 7 cycles
742 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
743 InstrStage<1, [A9_Pipe1]>,
744 InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>,
746 // Quad-register Integer Multiply-Accumulate (.32)
747 InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>,
748 // Extra latency cycles since wbck is 9 cycles
749 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
750 InstrStage<1, [A9_Pipe1]>,
751 InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>,
754 InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>,
755 // Extra latency cycles since wbck is 6 cycles
756 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
757 InstrStage<1, [A9_Pipe1]>,
758 InstrStage<1, [A9_NPipe]>], [3]>,
760 // Double-register Permute Move
761 InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>,
762 // FIXME: all latencies are arbitrary, no information is available
763 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
764 InstrStage<1, [A9_Pipe1]>,
765 InstrStage<1, [A9_AGU]>], [2, 1]>,
767 // Quad-register Permute Move
768 // Result written in N2, but that is relative to the last cycle of multicycle,
769 // so we use 3 for those cases
770 InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
771 // FIXME: all latencies are arbitrary, no information is available
772 InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
773 InstrStage<1, [A9_Pipe1]>,
774 InstrStage<2, [A9_NPipe]>], [3, 1]>,
776 // Integer to Single-precision Move
777 InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>,
778 // FIXME: all latencies are arbitrary, no information is available
779 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
780 InstrStage<1, [A9_Pipe1]>,
781 InstrStage<1, [A9_NPipe]>], [2, 1]>,
783 // Integer to Double-precision Move
784 InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>,
785 // FIXME: all latencies are arbitrary, no information is available
786 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
787 InstrStage<1, [A9_Pipe1]>,
788 InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
790 // Single-precision to Integer Move
791 InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>,
792 // FIXME: all latencies are arbitrary, no information is available
793 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
794 InstrStage<1, [A9_Pipe1]>,
795 InstrStage<1, [A9_NPipe]>], [2, 1]>,
797 // Double-precision to Integer Move
798 InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>,
799 // FIXME: all latencies are arbitrary, no information is available
800 InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
801 InstrStage<1, [A9_Pipe1]>,
802 InstrStage<1, [A9_NPipe]>], [2, 2, 1]>,
804 // Integer to Lane Move
805 InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>,
806 // FIXME: all latencies are arbitrary, no information is available
807 InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
808 InstrStage<1, [A9_Pipe1]>,
809 InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
812 // Double-register FP Unary
813 InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>,
814 // Extra latency cycles since wbck is 6 cycles
815 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
816 InstrStage<1, [A9_Pipe1]>,
817 InstrStage<1, [A9_NPipe]>], [5, 2]>,
819 // Quad-register FP Unary
820 // Result written in N5, but that is relative to the last cycle of multicycle,
821 // so we use 6 for those cases
// The quad form holds A9_NPipe for 2 cycles (the double-register form,
// IIC_VUNAD, holds it for 1) and reserves A9_DRegsVFP for 8 cycles, i.e. the
// 7-cycle wbck noted below plus one.
822 InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
823 // Extra latency cycles since wbck is 7 cycles
824 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
825 InstrStage<1, [A9_Pipe1]>,
826 InstrStage<2, [A9_NPipe]>], [6, 2]>,
828 // Double-register FP Binary
829 // FIXME: We're using this itin for many instructions and [2, 2] here is too
// optimistic.
// Stage layout matches IIC_VUNAD (7-cycle A9_DRegsVFP reservation, 1 cycle on
// A9_NPipe), with a second source operand cycle added: [5, 2, 2].
831 InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>,
// NOTE(review): IIC_VUNAD uses the same 7-cycle reservation with a stated wbck
// of 6 (wbck + 1); the "7 cycles" below does not fit that pattern -- confirm
// whether the comment or the stage count is the intended value.
832 // Extra latency cycles since wbck is 7 cycles
833 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
834 InstrStage<1, [A9_Pipe1]>,
835 InstrStage<1, [A9_NPipe]>], [5, 2, 2]>,
837 // Quad-register FP Binary
838 // Result written in N5, but that is relative to the last cycle of multicycle,
839 // so we use 6 for those cases
840 // FIXME: We're using this itin for many instructions and [2, 2] here is too
// optimistic.
// Stage layout matches IIC_VUNAQ (8-cycle A9_DRegsVFP reservation, 2 cycles on
// A9_NPipe), with a second source operand cycle added: [6, 2, 2].
842 InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
// NOTE(review): IIC_VUNAQ uses the same 8-cycle reservation with a stated wbck
// of 7 (wbck + 1); the "8 cycles" below does not fit that pattern -- confirm
// whether the comment or the stage count is the intended value.
843 // Extra latency cycles since wbck is 8 cycles
844 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
845 InstrStage<1, [A9_Pipe1]>,
846 InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
848 // Double-register FP Multiply-Accumulate
// Holds A9_NPipe for 2 cycles and reserves A9_DRegsVFP for 8 (the 7-cycle wbck
// noted below plus one). Four operand cycles are listed, [6, 3, 2, 1], in the
// instruction's operand order.
849 InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>,
850 // Extra latency cycles since wbck is 7 cycles
851 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
852 InstrStage<1, [A9_Pipe1]>,
853 InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>,
855 // Quad-register FP Multiply-Accumulate
856 // Result written in N9, but that is relative to the last cycle of multicycle,
857 // so we use 10 for those cases
// NOTE(review): the text above says 10, but the first operand cycle in the
// list below is 8 (10 is the A9_DRegsVFP reservation length) -- confirm which
// value the result cycle is meant to be.
// Holds A9_NPipe for 4 cycles; A9_DRegsVFP is reserved for wbck + 1 = 10.
858 InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
859 // Extra latency cycles since wbck is 9 cycles
860 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
861 InstrStage<1, [A9_Pipe1]>,
862 InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>,
864 // Double-register Reciprocal Step
// Holds A9_NPipe for 2 cycles and reserves A9_DRegsVFP for 8 (the 7-cycle wbck
// noted below plus one). Operand cycles: [6, 2, 2].
865 InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>,
866 // Extra latency cycles since wbck is 7 cycles
867 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
868 InstrStage<1, [A9_Pipe1]>,
869 InstrStage<2, [A9_NPipe]>], [6, 2, 2]>,
871 // Quad-register Reciprocal Step
// Holds A9_NPipe for 4 cycles (twice the double-register form, IIC_VRECSD) and
// reserves A9_DRegsVFP for 10, i.e. the 9-cycle wbck noted below plus one.
// Operand cycles: [8, 2, 2].
872 InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
873 // Extra latency cycles since wbck is 9 cycles
874 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
875 InstrStage<1, [A9_Pipe1]>,
876 InstrStage<4, [A9_NPipe]>], [8, 2, 2]>,
878 // Double-register Permute
// One cycle on A9_NPipe; A9_DRegsVFP reserved for 7 (the 6-cycle wbck noted
// below plus one). The operand-cycle list [2, 2, 1, 1] starts with two 2s;
// presumably the permute writes two result registers -- confirm against the
// instructions that use this itinerary.
879 InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>,
880 // Extra latency cycles since wbck is 6 cycles
881 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
882 InstrStage<1, [A9_Pipe1]>,
883 InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>,
885 // Quad-register Permute
886 // Result written in N2, but that is relative to the last cycle of multicycle,
887 // so we use 3 for those cases
// Two cycles on A9_NPipe (one more than IIC_VPERMD); A9_DRegsVFP reserved for
// 8, i.e. the 7-cycle wbck noted below plus one. Operand cycles: [3, 3, 1, 1].
888 InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
889 // Extra latency cycles since wbck is 7 cycles
890 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
891 InstrStage<1, [A9_Pipe1]>,
892 InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>,
894 // Quad-register Permute (3 cycle issue)
895 // Result written in N2, but that is relative to the last cycle of multicycle,
896 // so we use 4 for those cases
// The three issue cycles are spent on the NEON pipe (A9_NPipe), like the other
// permute itineraries (IIC_VPERMD, IIC_VPERMQ); the load/store pipe (A9_AGU)
// is not involved in a register permute. A9_DRegsVFP is reserved for 9 cycles,
// i.e. the 8-cycle wbck noted below plus one. Operand cycles: [4, 4, 1, 1].
897 InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_DRegsN], 0, Required>,
898 // Extra latency cycles since wbck is 8 cycles
899 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
900 InstrStage<1, [A9_Pipe1]>,
901 InstrStage<3, [A9_NPipe]>], [4, 4, 1, 1]>,
904 // Double-register VEXT
// One cycle on A9_NPipe, 7-cycle A9_DRegsVFP reservation.
// Operand cycles: [2, 1, 1].
905 InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>,
// NOTE(review): most entries in this table reserve A9_DRegsVFP for wbck + 1
// cycles (e.g. IIC_VPERMD: wbck 6, reservation 7); here the stated wbck equals
// the reservation -- confirm which number is intended.
906 // Extra latency cycles since wbck is 7 cycles
907 InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
908 InstrStage<1, [A9_Pipe1]>,
909 InstrStage<1, [A9_NPipe]>], [2, 1, 1]>,
911 // Quad-register VEXT
// Two cycles on A9_NPipe, 8-cycle A9_DRegsVFP reservation.
// Operand cycles: [3, 1, 1].
912 InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>,
// NOTE(review): the stated wbck (9) is larger than the 8-cycle reservation
// below, whereas most entries reserve wbck + 1 cycles (e.g. IIC_VPERMQ: wbck 7,
// reservation 8) -- confirm which number is intended.
913 // Extra latency cycles since wbck is 9 cycles
914 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
915 InstrStage<1, [A9_Pipe1]>,
916 InstrStage<2, [A9_NPipe]>], [3, 1, 1]>,
// IIC_VTB1: presumably the table-lookup itinerary with one table register
// (name-based; confirm against the instruction definitions). Two cycles on
// A9_NPipe; A9_DRegsVFP reserved for 8, i.e. the 7-cycle wbck noted below plus
// one. Operand cycles: [3, 2, 1].
919 InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>,
920 // Extra latency cycles since wbck is 7 cycles
921 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
922 InstrStage<1, [A9_Pipe1]>,
923 InstrStage<2, [A9_NPipe]>], [3, 2, 1]>,
// IIC_VTB2: same pipe usage as IIC_VTB1 (2 cycles on A9_NPipe, 8-cycle
// A9_DRegsVFP reservation) with one more operand cycle: [3, 2, 2, 1].
// NOTE(review): A9_DRegsN is required for 2 cycles here, while IIC_VTB1,
// IIC_VTB4 and the IIC_VTBX* entries require it for 1 -- confirm this is
// intentional.
924 InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>,
925 // Extra latency cycles since wbck is 7 cycles
926 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
927 InstrStage<1, [A9_Pipe1]>,
928 InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>,
// IIC_VTB3: three cycles on A9_NPipe; A9_DRegsVFP reserved for 9, i.e. the
// 8-cycle wbck noted below plus one. Operand cycles: [4, 2, 2, 3, 1].
// NOTE(review): A9_DRegsN is required for 2 cycles here, while IIC_VTB1,
// IIC_VTB4 and the IIC_VTBX* entries require it for 1 -- confirm this is
// intentional.
929 InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>,
930 // Extra latency cycles since wbck is 8 cycles
931 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
932 InstrStage<1, [A9_Pipe1]>,
933 InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>,
// IIC_VTB4: same pipe usage as IIC_VTB3 (3 cycles on A9_NPipe, 9-cycle
// A9_DRegsVFP reservation) with one more operand cycle: [4, 2, 2, 3, 3, 1].
934 InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>,
935 // Extra latency cycles since wbck is 8 cycles
936 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
937 InstrStage<1, [A9_Pipe1]>,
938 InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>,
// IIC_VTBX1: same stages as IIC_VTB1 (2 cycles on A9_NPipe, 8-cycle
// A9_DRegsVFP reservation). The operand-cycle list [3, 1, 2, 1] has one more
// entry than IIC_VTB1's; presumably the extra "1" is the destination being read
// as a source -- confirm against the instruction's operand list.
941 InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>,
942 // Extra latency cycles since wbck is 7 cycles
943 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
944 InstrStage<1, [A9_Pipe1]>,
945 InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>,
// IIC_VTBX2: same pipe usage as IIC_VTBX1 (2 cycles on A9_NPipe, 8-cycle
// A9_DRegsVFP reservation) with one more operand cycle: [3, 1, 2, 2, 1].
946 InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>,
947 // Extra latency cycles since wbck is 7 cycles
948 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
949 InstrStage<1, [A9_Pipe1]>,
950 InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>,
// IIC_VTBX3: three cycles on A9_NPipe; A9_DRegsVFP reserved for 9, i.e. the
// 8-cycle wbck noted below plus one. Operand cycles: [4, 1, 2, 2, 3, 1].
951 InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>,
952 // Extra latency cycles since wbck is 8 cycles
953 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
954 InstrStage<1, [A9_Pipe1]>,
955 InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>,
// IIC_VTBX4: three cycles on A9_NPipe, matching IIC_VTB4 and IIC_VTBX3, which
// share this entry's result cycle (4) and 9-cycle A9_DRegsVFP reservation
// (the 8-cycle wbck noted below plus one).
// Operand cycles: [4, 1, 2, 2, 3, 3, 1].
956 InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>,
957 // Extra latency cycles since wbck is 8 cycles
958 InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
959 InstrStage<1, [A9_Pipe1]>,
960 InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]>