//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
14 #include "AArch64ISelLowering.h"
15 #include "AArch64CallingConvention.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64Subtarget.h"
19 #include "AArch64TargetMachine.h"
20 #include "AArch64TargetObjectFile.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/Intrinsics.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/CommandLine.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/ErrorHandling.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Target/TargetOptions.h"
37 #define DEBUG_TYPE "aarch64-lower"
39 STATISTIC(NumTailCalls, "Number of tail calls");
40 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
49 static cl::opt<AlignMode>
50 Align(cl::desc("Load/store alignment support"),
51 cl::Hidden, cl::init(NoStrictAlign),
53 clEnumValN(StrictAlign, "aarch64-strict-align",
54 "Disallow all unaligned memory accesses"),
55 clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
56 "Allow unaligned memory accesses"),
// Placeholder until extr generation is tested fully.
61 EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
62 cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
66 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
67 cl::desc("Allow AArch64 SLI/SRI formation"),
70 // FIXME: The necessary dtprel relocations don't seem to be supported
71 // well in the GNU bfd and gold linkers at the moment. Therefore, by
72 // default, for now, fall back to GeneralDynamic code generation.
73 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
74 "aarch64-elf-ldtls-generation", cl::Hidden,
75 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
78 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
79 const AArch64Subtarget &STI)
80 : TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or a setcc instruction, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
84 setBooleanContents(ZeroOrOneBooleanContent);
85 // When comparing vectors the result sets the different elements in the
86 // vector to all-one or all-zero.
87 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
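// For example, a scalar i32 setcc materialises 0 or 1 in a GPR (typically via
// CSINC/CSET), whereas a v4i32 compare such as CMEQ yields all-ones or
// all-zeros in each lane.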
89 // Set up the register classes.
90 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
91 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
93 if (Subtarget->hasFPARMv8()) {
94 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
95 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
96 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
97 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
100 if (Subtarget->hasNEON()) {
101 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
102 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
103 // Someone set us up the NEON.
104 addDRTypeForNEON(MVT::v2f32);
105 addDRTypeForNEON(MVT::v8i8);
106 addDRTypeForNEON(MVT::v4i16);
107 addDRTypeForNEON(MVT::v2i32);
108 addDRTypeForNEON(MVT::v1i64);
109 addDRTypeForNEON(MVT::v1f64);
110 addDRTypeForNEON(MVT::v4f16);
112 addQRTypeForNEON(MVT::v4f32);
113 addQRTypeForNEON(MVT::v2f64);
114 addQRTypeForNEON(MVT::v16i8);
115 addQRTypeForNEON(MVT::v8i16);
116 addQRTypeForNEON(MVT::v4i32);
117 addQRTypeForNEON(MVT::v2i64);
118 addQRTypeForNEON(MVT::v8f16);
121 // Compute derived properties from the register classes
122 computeRegisterProperties(Subtarget->getRegisterInfo());
124 // Provide all sorts of operation actions
125 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
126 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
127 setOperationAction(ISD::SETCC, MVT::i32, Custom);
128 setOperationAction(ISD::SETCC, MVT::i64, Custom);
129 setOperationAction(ISD::SETCC, MVT::f32, Custom);
130 setOperationAction(ISD::SETCC, MVT::f64, Custom);
131 setOperationAction(ISD::BRCOND, MVT::Other, Expand);
132 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
133 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
134 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
135 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
136 setOperationAction(ISD::SELECT, MVT::i32, Custom);
137 setOperationAction(ISD::SELECT, MVT::i64, Custom);
138 setOperationAction(ISD::SELECT, MVT::f32, Custom);
139 setOperationAction(ISD::SELECT, MVT::f64, Custom);
140 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
141 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
142 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
143 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
144 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
145 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
147 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
148 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
149 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
151 setOperationAction(ISD::FREM, MVT::f32, Expand);
152 setOperationAction(ISD::FREM, MVT::f64, Expand);
153 setOperationAction(ISD::FREM, MVT::f80, Expand);
155 // Custom lowering hooks are needed for XOR
156 // to fold it into CSINC/CSINV.
157 setOperationAction(ISD::XOR, MVT::i32, Custom);
158 setOperationAction(ISD::XOR, MVT::i64, Custom);
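// Illustrative sketch of the fold performed by LowerXOR below:
//   (xor (select_cc a, b, cc, 0, -1), x)
//     --> (csel x, (xor x, -1), cc')
// which the selector then matches as a CSINV.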
160 // Virtually no operation on f128 is legal, but LLVM can't expand them when
161 // there's a valid register class, so we need custom operations in most cases.
162 setOperationAction(ISD::FABS, MVT::f128, Expand);
163 setOperationAction(ISD::FADD, MVT::f128, Custom);
164 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
165 setOperationAction(ISD::FCOS, MVT::f128, Expand);
166 setOperationAction(ISD::FDIV, MVT::f128, Custom);
167 setOperationAction(ISD::FMA, MVT::f128, Expand);
168 setOperationAction(ISD::FMUL, MVT::f128, Custom);
169 setOperationAction(ISD::FNEG, MVT::f128, Expand);
170 setOperationAction(ISD::FPOW, MVT::f128, Expand);
171 setOperationAction(ISD::FREM, MVT::f128, Expand);
172 setOperationAction(ISD::FRINT, MVT::f128, Expand);
173 setOperationAction(ISD::FSIN, MVT::f128, Expand);
174 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
175 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
176 setOperationAction(ISD::FSUB, MVT::f128, Custom);
177 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
178 setOperationAction(ISD::SETCC, MVT::f128, Custom);
179 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
180 setOperationAction(ISD::SELECT, MVT::f128, Custom);
181 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
182 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
184 // Lowering for many of the conversions is actually specified by the non-f128
185 // type. The LowerXXX function will be trivial when f128 isn't involved.
186 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
187 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
188 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
189 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
190 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
191 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
192 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
193 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
194 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
195 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
196 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
197 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
198 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
199 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
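// Illustrative: an fptosi from f128 to i32 therefore becomes a call to the
// compiler-rt routine __fixtfsi, while the same conversion from f64 remains a
// single FCVTZS instruction.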
201 // Variable arguments.
202 setOperationAction(ISD::VASTART, MVT::Other, Custom);
203 setOperationAction(ISD::VAARG, MVT::Other, Custom);
204 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
205 setOperationAction(ISD::VAEND, MVT::Other, Expand);
207 // Variable-sized objects.
208 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
209 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
210 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
212 // Exception handling.
213 // FIXME: These are guesses. Has this been defined yet?
214 setExceptionPointerRegister(AArch64::X0);
215 setExceptionSelectorRegister(AArch64::X1);
217 // Constant pool entries
218 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
221 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
223 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
224 setOperationAction(ISD::ADDC, MVT::i32, Custom);
225 setOperationAction(ISD::ADDE, MVT::i32, Custom);
226 setOperationAction(ISD::SUBC, MVT::i32, Custom);
227 setOperationAction(ISD::SUBE, MVT::i32, Custom);
228 setOperationAction(ISD::ADDC, MVT::i64, Custom);
229 setOperationAction(ISD::ADDE, MVT::i64, Custom);
230 setOperationAction(ISD::SUBC, MVT::i64, Custom);
231 setOperationAction(ISD::SUBE, MVT::i64, Custom);
233 // AArch64 lacks both left-rotate and popcount instructions.
234 setOperationAction(ISD::ROTL, MVT::i32, Expand);
235 setOperationAction(ISD::ROTL, MVT::i64, Expand);
237 // AArch64 doesn't have {U|S}MUL_LOHI.
238 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
239 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
// Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
// counterparts, which AArch64 supports directly.
244 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
245 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
246 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
247 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
249 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
250 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
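// Roughly, the custom CTPOP lowering moves the value into a NEON register,
// counts the set bits per byte with CNT, and then sums the bytes with an
// add-across-vector reduction, since there is no GPR popcount instruction.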
252 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
253 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
254 setOperationAction(ISD::SREM, MVT::i32, Expand);
255 setOperationAction(ISD::SREM, MVT::i64, Expand);
256 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
257 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
258 setOperationAction(ISD::UREM, MVT::i32, Expand);
259 setOperationAction(ISD::UREM, MVT::i64, Expand);
261 // Custom lower Add/Sub/Mul with overflow.
262 setOperationAction(ISD::SADDO, MVT::i32, Custom);
263 setOperationAction(ISD::SADDO, MVT::i64, Custom);
264 setOperationAction(ISD::UADDO, MVT::i32, Custom);
265 setOperationAction(ISD::UADDO, MVT::i64, Custom);
266 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
267 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
268 setOperationAction(ISD::USUBO, MVT::i32, Custom);
269 setOperationAction(ISD::USUBO, MVT::i64, Custom);
270 setOperationAction(ISD::SMULO, MVT::i32, Custom);
271 setOperationAction(ISD::SMULO, MVT::i64, Custom);
272 setOperationAction(ISD::UMULO, MVT::i32, Custom);
273 setOperationAction(ISD::UMULO, MVT::i64, Custom);
275 setOperationAction(ISD::FSIN, MVT::f32, Expand);
276 setOperationAction(ISD::FSIN, MVT::f64, Expand);
277 setOperationAction(ISD::FCOS, MVT::f32, Expand);
278 setOperationAction(ISD::FCOS, MVT::f64, Expand);
279 setOperationAction(ISD::FPOW, MVT::f32, Expand);
280 setOperationAction(ISD::FPOW, MVT::f64, Expand);
281 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
282 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
284 // f16 is storage-only, so we promote operations to f32 if we know this is
285 // valid, and ignore them otherwise. The operations not mentioned here will
286 // fail to select, but this is not a major problem as no source language
287 // should be emitting native f16 operations yet.
288 setOperationAction(ISD::FADD, MVT::f16, Promote);
289 setOperationAction(ISD::FDIV, MVT::f16, Promote);
290 setOperationAction(ISD::FMUL, MVT::f16, Promote);
291 setOperationAction(ISD::FSUB, MVT::f16, Promote);
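// Under the Promote action an f16 FADD, for example, is performed as an FCVT
// to f32, a single-precision FADD, and an FCVT back to f16.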
// v4f16 is also a storage-only type, so promote it to v4f32 when that is
// known to be safe.
295 setOperationAction(ISD::FADD, MVT::v4f16, Promote);
296 setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
297 setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
298 setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
299 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
300 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
301 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
302 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
303 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
304 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
305 AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
306 AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
308 // Expand all other v4f16 operations.
// FIXME: We could generate better code by promoting some operations to
// v4f32, as is done for the arithmetic operations above.
311 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
312 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
313 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
314 setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
315 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
316 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
317 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
318 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
319 setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
320 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
321 setOperationAction(ISD::FREM, MVT::v4f16, Expand);
322 setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
323 setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
324 setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
325 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
326 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
327 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
328 setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
329 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
330 setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
331 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
332 setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
333 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
334 setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
335 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
336 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
339 // v8f16 is also a storage-only type, so expand it.
340 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
341 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
342 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
343 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
344 setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
345 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
346 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
347 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
348 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
349 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
350 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
351 setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
352 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
353 setOperationAction(ISD::FREM, MVT::v8f16, Expand);
354 setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
355 setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
356 setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
357 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
358 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
359 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
360 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
361 setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
362 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
363 setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
364 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
365 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
366 setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
367 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
368 setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
369 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
370 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
372 // AArch64 has implementations of a lot of rounding-like FP operations.
373 for (MVT Ty : {MVT::f32, MVT::f64}) {
374 setOperationAction(ISD::FFLOOR, Ty, Legal);
375 setOperationAction(ISD::FNEARBYINT, Ty, Legal);
376 setOperationAction(ISD::FCEIL, Ty, Legal);
377 setOperationAction(ISD::FRINT, Ty, Legal);
378 setOperationAction(ISD::FTRUNC, Ty, Legal);
379 setOperationAction(ISD::FROUND, Ty, Legal);
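// These map directly onto the FRINT family: FFLOOR -> FRINTM, FCEIL -> FRINTP,
// FTRUNC -> FRINTZ, FRINT -> FRINTX, FNEARBYINT -> FRINTI, FROUND -> FRINTA.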
382 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
384 if (Subtarget->isTargetMachO()) {
// For iOS, we don't want the normal expansion of a libcall to sincos. We
// want to issue a libcall to __sincos_stret to avoid memory traffic.
388 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
389 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
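// Roughly, a sin/cos pair on the same argument then becomes a single call to
// __sincos_stret (__sincosf_stret for f32), which returns both results in
// registers rather than through pointer out-parameters.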
391 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
392 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
395 // Make floating-point constants legal for the large code model, so they don't
396 // become loads from the constant pool.
397 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
398 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
399 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
402 // AArch64 does not have floating-point extending loads, i1 sign-extending
403 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
404 for (MVT VT : MVT::fp_valuetypes()) {
405 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
406 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
407 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
408 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
410 for (MVT VT : MVT::integer_valuetypes())
411 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
413 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
414 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
415 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
416 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
417 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
418 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
419 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
421 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
422 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
424 // Indexed loads and stores are supported.
425 for (unsigned im = (unsigned)ISD::PRE_INC;
426 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
427 setIndexedLoadAction(im, MVT::i8, Legal);
428 setIndexedLoadAction(im, MVT::i16, Legal);
429 setIndexedLoadAction(im, MVT::i32, Legal);
430 setIndexedLoadAction(im, MVT::i64, Legal);
431 setIndexedLoadAction(im, MVT::f64, Legal);
432 setIndexedLoadAction(im, MVT::f32, Legal);
433 setIndexedStoreAction(im, MVT::i8, Legal);
434 setIndexedStoreAction(im, MVT::i16, Legal);
435 setIndexedStoreAction(im, MVT::i32, Legal);
436 setIndexedStoreAction(im, MVT::i64, Legal);
437 setIndexedStoreAction(im, MVT::f64, Legal);
438 setIndexedStoreAction(im, MVT::f32, Legal);
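// This enables the pre/post-indexed addressing forms, e.g.
//   pre-indexed:  ldr x0, [x1, #8]!   (base updated before the access)
//   post-indexed: ldr x0, [x1], #8    (base updated after the access)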
442 setOperationAction(ISD::TRAP, MVT::Other, Legal);
444 // We combine OR nodes for bitfield operations.
445 setTargetDAGCombine(ISD::OR);
447 // Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
449 setTargetDAGCombine(ISD::ADD);
450 setTargetDAGCombine(ISD::SUB);
452 setTargetDAGCombine(ISD::XOR);
453 setTargetDAGCombine(ISD::SINT_TO_FP);
454 setTargetDAGCombine(ISD::UINT_TO_FP);
456 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
458 setTargetDAGCombine(ISD::ANY_EXTEND);
459 setTargetDAGCombine(ISD::ZERO_EXTEND);
460 setTargetDAGCombine(ISD::SIGN_EXTEND);
461 setTargetDAGCombine(ISD::BITCAST);
462 setTargetDAGCombine(ISD::CONCAT_VECTORS);
463 setTargetDAGCombine(ISD::STORE);
465 setTargetDAGCombine(ISD::MUL);
467 setTargetDAGCombine(ISD::SELECT);
468 setTargetDAGCombine(ISD::VSELECT);
470 setTargetDAGCombine(ISD::INTRINSIC_VOID);
471 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
472 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
474 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
475 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
476 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
478 setStackPointerRegisterToSaveRestore(AArch64::SP);
480 setSchedulingPreference(Sched::Hybrid);
483 MaskAndBranchFoldingIsLegal = true;
485 setMinFunctionAlignment(2);
487 RequireStrictAlign = (Align == StrictAlign);
489 setHasExtractBitsInsn(true);
491 if (Subtarget->hasNEON()) {
492 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
493 // silliness like this:
494 setOperationAction(ISD::FABS, MVT::v1f64, Expand);
495 setOperationAction(ISD::FADD, MVT::v1f64, Expand);
496 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
497 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
498 setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
499 setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
500 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
501 setOperationAction(ISD::FMA, MVT::v1f64, Expand);
502 setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
503 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
504 setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
505 setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
506 setOperationAction(ISD::FREM, MVT::v1f64, Expand);
507 setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
508 setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
509 setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
510 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
511 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
512 setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
513 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
514 setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
515 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
516 setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
517 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
518 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
520 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
521 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
522 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
523 setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
524 setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
526 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector ->f32 conversion instructions for
529 // elements smaller than i32, so promote the input to i32 first.
530 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
531 setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
532 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
533 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
534 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
535 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
536 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
537 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
538 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
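// For example, a v4i16 -> v4f32 uitofp is widened to a v4i32 -> v4f32 UCVTF,
// and a v2i32 -> v2f64 sitofp is custom lowered by sign-extending the input
// to v2i64 first.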
540 // AArch64 doesn't have MUL.2d:
541 setOperationAction(ISD::MUL, MVT::v2i64, Expand);
542 // Custom handling for some quad-vector types to detect MULL.
543 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
544 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
545 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
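// Illustrative: when both operands are sign extensions of a half-width
// vector, e.g. (mul (sext v4i16), (sext v4i16)) producing v4i32, the custom
// lowering can select an SMULL instead of widening both inputs.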
547 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
548 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
551 for (MVT VT : MVT::vector_valuetypes()) {
552 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
554 setOperationAction(ISD::MULHS, VT, Expand);
555 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
556 setOperationAction(ISD::MULHU, VT, Expand);
557 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
559 setOperationAction(ISD::BSWAP, VT, Expand);
561 for (MVT InnerVT : MVT::vector_valuetypes()) {
562 setTruncStoreAction(VT, InnerVT, Expand);
563 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
564 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
565 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
569 // AArch64 has implementations of a lot of rounding-like FP operations.
570 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
571 setOperationAction(ISD::FFLOOR, Ty, Legal);
572 setOperationAction(ISD::FNEARBYINT, Ty, Legal);
573 setOperationAction(ISD::FCEIL, Ty, Legal);
574 setOperationAction(ISD::FRINT, Ty, Legal);
575 setOperationAction(ISD::FTRUNC, Ty, Legal);
576 setOperationAction(ISD::FROUND, Ty, Legal);
580 // Prefer likely predicted branches to selects on out-of-order cores.
581 if (Subtarget->isCortexA57())
582 PredictableSelectIsExpensive = true;
585 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
586 if (VT == MVT::v2f32 || VT == MVT::v4f16) {
587 setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
588 AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
590 setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
591 AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
592 } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
593 setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
594 AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
596 setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
597 AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
600 // Mark vector float intrinsics as expand.
601 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
602 setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
603 setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
604 setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
605 setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
606 setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
607 setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
608 setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
609 setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
610 setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
613 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
614 setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
615 setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
616 setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
617 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
618 setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
619 setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
620 setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
621 setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
622 setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
623 setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
624 setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
626 setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
627 setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
628 setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
629 for (MVT InnerVT : MVT::all_valuetypes())
630 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
632 // CNT supports only B element sizes.
633 if (VT != MVT::v8i8 && VT != MVT::v16i8)
634 setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
636 setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
637 setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
638 setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
639 setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
640 setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
642 setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
643 setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
645 if (Subtarget->isLittleEndian()) {
646 for (unsigned im = (unsigned)ISD::PRE_INC;
647 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
648 setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
649 setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
654 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
655 addRegisterClass(VT, &AArch64::FPR64RegClass);
656 addTypeForNEON(VT, MVT::v2i32);
659 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
660 addRegisterClass(VT, &AArch64::FPR128RegClass);
661 addTypeForNEON(VT, MVT::v4i32);
664 EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
667 return VT.changeVectorElementTypeToInteger();
670 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
671 /// Mask are known to be either zero or one and return them in the
672 /// KnownZero/KnownOne bitsets.
673 void AArch64TargetLowering::computeKnownBitsForTargetNode(
674 const SDValue Op, APInt &KnownZero, APInt &KnownOne,
675 const SelectionDAG &DAG, unsigned Depth) const {
676 switch (Op.getOpcode()) {
679 case AArch64ISD::CSEL: {
680 APInt KnownZero2, KnownOne2;
681 DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
682 DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
683 KnownZero &= KnownZero2;
684 KnownOne &= KnownOne2;
687 case ISD::INTRINSIC_W_CHAIN: {
688 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
689 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
692 case Intrinsic::aarch64_ldaxr:
693 case Intrinsic::aarch64_ldxr: {
694 unsigned BitWidth = KnownOne.getBitWidth();
695 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
696 unsigned MemBits = VT.getScalarType().getSizeInBits();
697 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
703 case ISD::INTRINSIC_WO_CHAIN:
704 case ISD::INTRINSIC_VOID: {
705 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
709 case Intrinsic::aarch64_neon_umaxv:
710 case Intrinsic::aarch64_neon_uminv: {
711 // Figure out the datatype of the vector operand. The UMINV instruction
712 // will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larger doesn't need
714 // this as those are legal types and will be handled by isel directly.
715 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
716 unsigned BitWidth = KnownZero.getBitWidth();
717 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
718 assert(BitWidth >= 8 && "Unexpected width!");
719 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
721 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
722 assert(BitWidth >= 16 && "Unexpected width!");
723 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
733 MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
738 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
739 const TargetLibraryInfo *libInfo) const {
740 return AArch64::createFastISel(funcInfo, libInfo);
743 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
747 case AArch64ISD::CALL: return "AArch64ISD::CALL";
748 case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
749 case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
750 case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
751 case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
752 case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
753 case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
754 case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
755 case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
756 case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
757 case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
758 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
759 case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
760 case AArch64ISD::ADC: return "AArch64ISD::ADC";
761 case AArch64ISD::SBC: return "AArch64ISD::SBC";
762 case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
763 case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
764 case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
765 case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
766 case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
767 case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
768 case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
769 case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
770 case AArch64ISD::DUP: return "AArch64ISD::DUP";
771 case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
772 case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
773 case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
774 case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
775 case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
776 case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
777 case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
778 case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
779 case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
780 case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
781 case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
782 case AArch64ISD::BICi: return "AArch64ISD::BICi";
783 case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
784 case AArch64ISD::BSL: return "AArch64ISD::BSL";
785 case AArch64ISD::NEG: return "AArch64ISD::NEG";
786 case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
787 case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
788 case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
789 case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
790 case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
791 case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
792 case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
793 case AArch64ISD::REV16: return "AArch64ISD::REV16";
794 case AArch64ISD::REV32: return "AArch64ISD::REV32";
795 case AArch64ISD::REV64: return "AArch64ISD::REV64";
796 case AArch64ISD::EXT: return "AArch64ISD::EXT";
797 case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
798 case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
799 case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
800 case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
801 case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
802 case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
803 case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
804 case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
805 case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
806 case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
807 case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
808 case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
809 case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
810 case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
811 case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
812 case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
813 case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
814 case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
815 case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
816 case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
817 case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
818 case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
819 case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
820 case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
821 case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
822 case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
823 case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
824 case AArch64ISD::NOT: return "AArch64ISD::NOT";
825 case AArch64ISD::BIT: return "AArch64ISD::BIT";
826 case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
827 case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
828 case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
829 case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
830 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
831 case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
832 case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
833 case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
834 case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
835 case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
836 case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
837 case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
838 case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
839 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
840 case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
841 case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
842 case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
843 case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
844 case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
845 case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
846 case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
847 case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
848 case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
849 case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
850 case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
851 case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
852 case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
853 case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
854 case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
855 case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
856 case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
857 case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
858 case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
859 case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
860 case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
861 case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
862 case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
863 case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
864 case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
869 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
870 MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
875 // [... previous instrs leading to comparison ...]
881 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
883 MachineFunction *MF = MBB->getParent();
884 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
885 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
886 DebugLoc DL = MI->getDebugLoc();
887 MachineFunction::iterator It = MBB;
890 unsigned DestReg = MI->getOperand(0).getReg();
891 unsigned IfTrueReg = MI->getOperand(1).getReg();
892 unsigned IfFalseReg = MI->getOperand(2).getReg();
893 unsigned CondCode = MI->getOperand(3).getImm();
894 bool NZCVKilled = MI->getOperand(4).isKill();
896 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
897 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
898 MF->insert(It, TrueBB);
899 MF->insert(It, EndBB);
901 // Transfer rest of current basic-block to EndBB
902 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
904 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
906 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
907 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
908 MBB->addSuccessor(TrueBB);
909 MBB->addSuccessor(EndBB);
911 // TrueBB falls through to the end.
912 TrueBB->addSuccessor(EndBB);
915 TrueBB->addLiveIn(AArch64::NZCV);
916 EndBB->addLiveIn(AArch64::NZCV);
919 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
925 MI->eraseFromParent();
930 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
931 MachineBasicBlock *BB) const {
932 switch (MI->getOpcode()) {
937 llvm_unreachable("Unexpected instruction for custom inserter!");
939 case AArch64::F128CSEL:
940 return EmitF128CSEL(MI, BB);
942 case TargetOpcode::STACKMAP:
943 case TargetOpcode::PATCHPOINT:
944 return emitPatchPoint(MI, BB);
948 //===----------------------------------------------------------------------===//
949 // AArch64 Lowering private implementation.
950 //===----------------------------------------------------------------------===//
952 //===----------------------------------------------------------------------===//
954 //===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
958 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
961 llvm_unreachable("Unknown condition code!");
963 return AArch64CC::NE;
965 return AArch64CC::EQ;
967 return AArch64CC::GT;
969 return AArch64CC::GE;
971 return AArch64CC::LT;
973 return AArch64CC::LE;
975 return AArch64CC::HI;
977 return AArch64CC::HS;
979 return AArch64CC::LO;
981 return AArch64CC::LS;
985 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
986 static void changeFPCCToAArch64CC(ISD::CondCode CC,
987 AArch64CC::CondCode &CondCode,
988 AArch64CC::CondCode &CondCode2) {
989 CondCode2 = AArch64CC::AL;
992 llvm_unreachable("Unknown FP condition!");
995 CondCode = AArch64CC::EQ;
999 CondCode = AArch64CC::GT;
1003 CondCode = AArch64CC::GE;
1006 CondCode = AArch64CC::MI;
1009 CondCode = AArch64CC::LS;
1012 CondCode = AArch64CC::MI;
1013 CondCode2 = AArch64CC::GT;
1016 CondCode = AArch64CC::VC;
1019 CondCode = AArch64CC::VS;
1022 CondCode = AArch64CC::EQ;
1023 CondCode2 = AArch64CC::VS;
1026 CondCode = AArch64CC::HI;
1029 CondCode = AArch64CC::PL;
1033 CondCode = AArch64CC::LT;
1037 CondCode = AArch64CC::LE;
1041 CondCode = AArch64CC::NE;
1046 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1047 /// CC usable with the vector instructions. Fewer operations are available
1048 /// without a real NZCV register, so we have to use less efficient combinations
1049 /// to get the same effect.
1050 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1051 AArch64CC::CondCode &CondCode,
1052 AArch64CC::CondCode &CondCode2,
1057 // Mostly the scalar mappings work fine.
1058 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1061 Invert = true; // Fallthrough
1063 CondCode = AArch64CC::MI;
1064 CondCode2 = AArch64CC::GE;
1071 // All of the compare-mask comparisons are ordered, but we can switch
1072 // between the two by a double inversion. E.g. ULE == !OGT.
1074 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
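// For example, SETULE is lowered by emitting the mask compare for its ordered
// inverse SETOGT and then bitwise-inverting the resulting mask.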
1079 static bool isLegalArithImmed(uint64_t C) {
1080 // Matches AArch64DAGToDAGISel::SelectArithImmed().
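// For example, 4095 (0xfff) and 0xfff000 (0xfff << 12) are encodable, while
// 0x1001 is not.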
1081 return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1084 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1085 SDLoc dl, SelectionDAG &DAG) {
1086 EVT VT = LHS.getValueType();
1088 if (VT.isFloatingPoint())
1089 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1091 // The CMP instruction is just an alias for SUBS, and representing it as
1092 // SUBS means that it's possible to get CSE with subtract operations.
1093 // A later phase can perform the optimization of setting the destination
1094 // register to WZR/XZR if it ends up being unused.
1095 unsigned Opcode = AArch64ISD::SUBS;
1097 if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
1098 cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
1099 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
1101 // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
1102 // can be set differently by this operation. It comes down to whether
1103 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1104 // everything is fine. If not then the optimization is wrong. Thus general
1105 // comparisons are only valid if op2 != 0.
1107 // So, finally, the only LLVM-native comparisons that don't mention C and V
1108 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1109 // the absence of information about op2.
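// Illustrative: (seteq x, (sub 0, y)) is emitted as ADDS(x, y), i.e. "cmn",
// which is safe because EQ/NE only consume the Z flag.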
1110 Opcode = AArch64ISD::ADDS;
1111 RHS = RHS.getOperand(1);
1112 } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
1113 cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
1114 !isUnsignedIntSetCC(CC)) {
1115 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1116 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1117 // of the signed comparisons.
1118 Opcode = AArch64ISD::ANDS;
1119 RHS = LHS.getOperand(1);
1120 LHS = LHS.getOperand(0);
1123 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
1127 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1128 SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
1130 AArch64CC::CondCode AArch64CC;
1131 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1132 EVT VT = RHS.getValueType();
1133 uint64_t C = RHSC->getZExtValue();
1134 if (!isLegalArithImmed(C)) {
1135 // Constant does not fit, try adjusting it by one?
1141 if ((VT == MVT::i32 && C != 0x80000000 &&
1142 isLegalArithImmed((uint32_t)(C - 1))) ||
1143 (VT == MVT::i64 && C != 0x80000000ULL &&
1144 isLegalArithImmed(C - 1ULL))) {
1145 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1146 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1147 RHS = DAG.getConstant(C, VT);
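// Illustrative: "x < 4097" (4097 is not a legal arithmetic immediate) is
// rewritten here as "x <= 4096", which is encodable as 1 shifted left by 12.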
1152 if ((VT == MVT::i32 && C != 0 &&
1153 isLegalArithImmed((uint32_t)(C - 1))) ||
1154 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1155 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1156 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1157 RHS = DAG.getConstant(C, VT);
1162 if ((VT == MVT::i32 && C != INT32_MAX &&
1163 isLegalArithImmed((uint32_t)(C + 1))) ||
1164 (VT == MVT::i64 && C != INT64_MAX &&
1165 isLegalArithImmed(C + 1ULL))) {
1166 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1167 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1168 RHS = DAG.getConstant(C, VT);
1173 if ((VT == MVT::i32 && C != UINT32_MAX &&
1174 isLegalArithImmed((uint32_t)(C + 1))) ||
1175 (VT == MVT::i64 && C != UINT64_MAX &&
1176 isLegalArithImmed(C + 1ULL))) {
1177 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1178 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1179 RHS = DAG.getConstant(C, VT);
1185 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1186 // For the i8 operand, the largest immediate is 255, so this can be easily
1187 // encoded in the compare instruction. For the i16 operand, however, the
1188 // largest immediate cannot be encoded in the compare.
1189 // Therefore, use a sign extending load and cmn to avoid materializing the -1
1190 // constant. For example,
1192 // ldrh w0, [x0, #0]
1195 // ldrsh w0, [x0, #0]
// Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
// if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
// both the LHS and RHS are truly zero extended and to make sure the
1200 // transformation is profitable.
1201 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1202 if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
1203 isa<LoadSDNode>(LHS)) {
1204 if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1205 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1206 LHS.getNode()->hasNUsesOfValue(1, 0)) {
1207 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1208 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1210 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1211 DAG.getValueType(MVT::i16));
1212 Cmp = emitComparison(SExt,
1213 DAG.getConstant(ValueofRHS, RHS.getValueType()),
1215 AArch64CC = changeIntCCToAArch64CC(CC);
1216 AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
1222 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1223 AArch64CC = changeIntCCToAArch64CC(CC);
1224 AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
1228 static std::pair<SDValue, SDValue>
1229 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
1230 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1231 "Unsupported value type");
1232 SDValue Value, Overflow;
1234 SDValue LHS = Op.getOperand(0);
1235 SDValue RHS = Op.getOperand(1);
1237 switch (Op.getOpcode()) {
1239 llvm_unreachable("Unknown overflow instruction!");
1241 Opc = AArch64ISD::ADDS;
1245 Opc = AArch64ISD::ADDS;
1249 Opc = AArch64ISD::SUBS;
1253 Opc = AArch64ISD::SUBS;
// Multiply needs a little bit of extra work.
1260 bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
1261 if (Op.getValueType() == MVT::i32) {
1262 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1263 // For a 32 bit multiply with overflow check we want the instruction
1264 // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1265 // need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
1267 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1268 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1269 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1270 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1271 DAG.getConstant(0, MVT::i64));
1272 // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1273 // operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// no-op.
1276 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1278 // The signed overflow check requires more than just a simple check for
1279 // any bit set in the upper 32 bits of the result. These bits could be
1280 // just the sign bits of a negative number. To perform the overflow
1281 // check we have to arithmetic shift right the 32nd bit of the result by
1282 // 31 bits. Then we compare the result to the upper 32 bits.
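// Worked example: for %a = 0x40000000 and %b = 4 the widened product is
// 0x100000000; the truncated result is 0, its sign replication (0 >> 31) is
// 0, but the upper 32 bits are 1, so the comparison fails and overflow is
// reported.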
1283 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1284 DAG.getConstant(32, MVT::i64));
1285 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1286 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1287 DAG.getConstant(31, MVT::i64));
1288 // It is important that LowerBits is last, otherwise the arithmetic
1289 // shift will not be folded into the compare (SUBS).
1290 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1291 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1294 // The overflow check for unsigned multiply is easy. We only need to
1295 // check if any of the upper 32 bits are set. This can be done with a
1296 // CMP (shifted register). For that we need to generate the following
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
1299 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1300 DAG.getConstant(32, MVT::i64));
1301 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1303 DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
1304 UpperBits).getValue(1);
1308 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1309 // For the 64 bit multiply
1310 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1312 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1313 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1314 DAG.getConstant(63, MVT::i64));
1315 // It is important that LowerBits is last, otherwise the arithmetic
1316 // shift will not be folded into the compare (SUBS).
1317 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1318 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1321 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1322 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1324 DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
1325 UpperBits).getValue(1);
1332 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1334 // Emit the AArch64 operation with overflow check.
1335 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1336 Overflow = Value.getValue(1);
1338 return std::make_pair(Value, Overflow);
1341 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1342 RTLIB::Libcall Call) const {
1343 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1344 return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
1348 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
1349 SDValue Sel = Op.getOperand(0);
1350 SDValue Other = Op.getOperand(1);
1352 // If neither operand is a SELECT_CC, give up.
1353 if (Sel.getOpcode() != ISD::SELECT_CC)
1354 std::swap(Sel, Other);
1355 if (Sel.getOpcode() != ISD::SELECT_CC)
1358 // The folding we want to perform is:
// (xor x, (select_cc a, b, cc, 0, -1))
// -->
// (csel x, (xor x, -1), cc ...)
1363 // The latter will get matched to a CSINV instruction.
1365 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
1366 SDValue LHS = Sel.getOperand(0);
1367 SDValue RHS = Sel.getOperand(1);
1368 SDValue TVal = Sel.getOperand(2);
1369 SDValue FVal = Sel.getOperand(3);
1372 // FIXME: This could be generalized to non-integer comparisons.
1373 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
1376 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
1377 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
// If the values aren't constants, this isn't the pattern we're looking for.
1380 if (!CFVal || !CTVal)
1383 // We can commute the SELECT_CC by inverting the condition. This
1384 // might be needed to make this fit into a CSINV pattern.
1385 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
1386 std::swap(TVal, FVal);
1387 std::swap(CTVal, CFVal);
1388 CC = ISD::getSetCCInverse(CC, true);
1391 // If the constants line up, perform the transform!
1392 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
1394 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
1397 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
1398 DAG.getConstant(-1ULL, Other.getValueType()));
1400 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
1407 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
1408 EVT VT = Op.getValueType();
1410 // Let legalize expand this if it isn't a legal type yet.
1411 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
1414 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
1417 bool ExtraOp = false;
1418 switch (Op.getOpcode()) {
1420 llvm_unreachable("Invalid code");
1422 Opc = AArch64ISD::ADDS;
1425 Opc = AArch64ISD::SUBS;
1428 Opc = AArch64ISD::ADCS;
1432 Opc = AArch64ISD::SBCS;
1438 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
1439 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
1443 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
1444 // Let legalize expand this if it isn't a legal type yet.
1445 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
1448 AArch64CC::CondCode CC;
1449 // The actual operation that sets the overflow or carry flag.
1450 SDValue Value, Overflow;
1451 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
1453 // We use 0 and 1 as false and true values.
1454 SDValue TVal = DAG.getConstant(1, MVT::i32);
1455 SDValue FVal = DAG.getConstant(0, MVT::i32);
1457 // We use an inverted condition, because the conditional select is inverted
1458 // too. This will allow it to be selected to a single instruction:
1459 // CSINC Wd, WZR, WZR, invert(cond).
1460 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
1461 Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
1464 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
1465 return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
1468 // Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
1472 // 4: bool isDataCache
1473 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
1475 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
1476 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
1477 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
1479 bool IsStream = !Locality;
// When the locality number is set
if (Locality) {
  // The front-end should have filtered out the out-of-range values
  assert(Locality <= 3 && "Prefetch locality out-of-range");
  // The locality degree is the opposite of the cache speed.
  // Put the number the other way around.
  // The encoding starts at 0 for level 1
  Locality = 3 - Locality;
}
1490 // Build the mask value encoding the expected behavior.
1491 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1492 (!IsData << 3) | // IsDataCache bit
1493 (Locality << 1) | // Cache level bits
1494 (unsigned)IsStream; // Stream bit
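// Worked example (illustrative): a default __builtin_prefetch(p) reaches us
// as IsWrite=0, Locality=3, IsData=1, so Locality becomes 3 - 3 = 0 and
// PrfOp = (0<<4) | (0<<3) | (0<<1) | 0 = 0b00000, i.e. the PLDL1KEEP hint.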
1495 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
1496 DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
1499 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
1500 SelectionDAG &DAG) const {
1501 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
1504 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
1506 return LowerF128Call(Op, DAG, LC);
1509 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
1510 SelectionDAG &DAG) const {
1511 if (Op.getOperand(0).getValueType() != MVT::f128) {
1512 // It's legal except when f128 is involved
1517 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
1519 // FP_ROUND node has a second operand indicating whether it is known to be
1520 // precise. That doesn't take part in the LibCall so we can't directly use
1521 // LowerF128Call.
1522 SDValue SrcVal = Op.getOperand(0);
1523 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
1524 /*isSigned*/ false, SDLoc(Op)).first;
1527 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
1528 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
1529 // Any additional optimization in this function should be recorded
1530 // in the cost tables.
1531 EVT InVT = Op.getOperand(0).getValueType();
1532 EVT VT = Op.getValueType();
1534 if (VT.getSizeInBits() < InVT.getSizeInBits()) {
1537 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
1539 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
1542 if (VT.getSizeInBits() > InVT.getSizeInBits()) {
1545 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
1546 VT.getVectorNumElements());
1547 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
1548 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
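// Illustrative examples of the two cases above (not exhaustive):
//   fptosi v2f64 -> v2i32 becomes (trunc (fptosi v2f64 -> v2i64)), and
//   fptosi v2f32 -> v2i64 becomes (fptosi (fpext v2f32 -> v2f64)).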
1551 // Type changing conversions are illegal.
1555 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
1556 SelectionDAG &DAG) const {
1557 if (Op.getOperand(0).getValueType().isVector())
1558 return LowerVectorFP_TO_INT(Op, DAG);
1560 if (Op.getOperand(0).getValueType() != MVT::f128) {
1561 // It's legal except when f128 is involved
1566 if (Op.getOpcode() == ISD::FP_TO_SINT)
1567 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
1569 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
1571 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1572 return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
1576 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
1577 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
1578 // Any additional optimization in this function should be recorded
1579 // in the cost tables.
1580 EVT VT = Op.getValueType();
1582 SDValue In = Op.getOperand(0);
1583 EVT InVT = In.getValueType();
1585 if (VT.getSizeInBits() < InVT.getSizeInBits()) {
1587 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
1588 InVT.getVectorNumElements());
1589 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
1590 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
1593 if (VT.getSizeInBits() > InVT.getSizeInBits()) {
1595 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1596 EVT CastVT = VT.changeVectorElementTypeToInteger();
1597 In = DAG.getNode(CastOpc, dl, CastVT, In);
1598 return DAG.getNode(Op.getOpcode(), dl, VT, In);
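// Illustrative examples of the two cases above (not exhaustive):
//   sitofp v2i64 -> v2f32 becomes (fp_round (sitofp v2i64 -> v2f64)), and
//   sitofp v2i32 -> v2f64 becomes (sitofp (sext v2i32 -> v2i64)).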
1604 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
1605 SelectionDAG &DAG) const {
1606 if (Op.getValueType().isVector())
1607 return LowerVectorINT_TO_FP(Op, DAG);
1609 // i128 conversions are libcalls.
1610 if (Op.getOperand(0).getValueType() == MVT::i128)
1613 // Other conversions are legal, unless it's to the completely software-based
1614 // type of f128.
1615 if (Op.getValueType() != MVT::f128)
1619 if (Op.getOpcode() == ISD::SINT_TO_FP)
1620 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
1622 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
1624 return LowerF128Call(Op, DAG, LC);
1627 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
1628 SelectionDAG &DAG) const {
1629 // For iOS, we want to call an alternative entry point: __sincos_stret,
1630 // which returns the values in two S / D registers.
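// Sketch of the resulting call for a double argument (illustrative):
//   { double, double } __sincos_stret(double x)
// with the sine expected in d0 and the cosine in d1; the float variant
// (__sincosf_stret) uses s0/s1 instead.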
1632 SDValue Arg = Op.getOperand(0);
1633 EVT ArgVT = Arg.getValueType();
1634 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
1641 Entry.isSExt = false;
1642 Entry.isZExt = false;
1643 Args.push_back(Entry);
1645 const char *LibcallName =
1646 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
1647 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
1649 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
1650 TargetLowering::CallLoweringInfo CLI(DAG);
1651 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
1652 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
1654 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
1655 return CallResult.first;
1658 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
1659 if (Op.getValueType() != MVT::f16)
1662 assert(Op.getOperand(0).getValueType() == MVT::i16);
1665 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
1666 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
1668 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
1669 DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
1673 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
1674 if (OrigVT.getSizeInBits() >= 64)
1677 assert(OrigVT.isSimple() && "Expecting a simple value type");
1679 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
1680 switch (OrigSimpleTy) {
1681 default: llvm_unreachable("Unexpected Vector Type");
1690 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
1693 unsigned ExtOpcode) {
1694 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
1695 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
1696 // 64-bits we need to insert a new extension so that it will be 64-bits.
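// Example (illustrative): for (v4i32 (sext (v4i8 X))) the v4i8 operand is
// only 32 bits wide, so it is re-extended to v4i16 here; a v4i16 x v4i16
// SMULL can then produce the desired v4i32 result.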
1697 assert(ExtTy.is128BitVector() && "Unexpected extension size");
1698 if (OrigTy.getSizeInBits() >= 64)
1701 // Must extend size to at least 64 bits to be used as an operand for VMULL.
1702 EVT NewVT = getExtensionTo64Bits(OrigTy);
1704 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
1707 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
1709 EVT VT = N->getValueType(0);
1711 if (N->getOpcode() != ISD::BUILD_VECTOR)
1714 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1715 SDNode *Elt = N->getOperand(i).getNode();
1716 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
1717 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1718 unsigned HalfSize = EltSize / 2;
1720 if (!isIntN(HalfSize, C->getSExtValue()))
1723 if (!isUIntN(HalfSize, C->getZExtValue()))
1734 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
1735 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
1736 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
1737 N->getOperand(0)->getValueType(0),
1741 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
1742 EVT VT = N->getValueType(0);
1743 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
1744 unsigned NumElts = VT.getVectorNumElements();
1745 MVT TruncVT = MVT::getIntegerVT(EltSize);
1746 SmallVector<SDValue, 8> Ops;
1747 for (unsigned i = 0; i != NumElts; ++i) {
1748 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
1749 const APInt &CInt = C->getAPIntValue();
1750 // Element types smaller than 32 bits are not legal, so use i32 elements.
1751 // The values are implicitly truncated so sext vs. zext doesn't matter.
1752 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
1754 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
1755 MVT::getVectorVT(TruncVT, NumElts), Ops);
1758 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
1759 if (N->getOpcode() == ISD::SIGN_EXTEND)
1761 if (isExtendedBUILD_VECTOR(N, DAG, true))
1766 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
1767 if (N->getOpcode() == ISD::ZERO_EXTEND)
1769 if (isExtendedBUILD_VECTOR(N, DAG, false))
1774 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
1775 unsigned Opcode = N->getOpcode();
1776 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
1777 SDNode *N0 = N->getOperand(0).getNode();
1778 SDNode *N1 = N->getOperand(1).getNode();
1779 return N0->hasOneUse() && N1->hasOneUse() &&
1780 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
1785 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
1786 unsigned Opcode = N->getOpcode();
1787 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
1788 SDNode *N0 = N->getOperand(0).getNode();
1789 SDNode *N1 = N->getOperand(1).getNode();
1790 return N0->hasOneUse() && N1->hasOneUse() &&
1791 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
1796 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
1797 // Multiplications are only custom-lowered for 128-bit vectors so that
1798 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
1799 EVT VT = Op.getValueType();
1800 assert(VT.is128BitVector() && VT.isInteger() &&
1801 "unexpected type for custom-lowering ISD::MUL");
1802 SDNode *N0 = Op.getOperand(0).getNode();
1803 SDNode *N1 = Op.getOperand(1).getNode();
1804 unsigned NewOpc = 0;
1806 bool isN0SExt = isSignExtended(N0, DAG);
1807 bool isN1SExt = isSignExtended(N1, DAG);
1808 if (isN0SExt && isN1SExt)
1809 NewOpc = AArch64ISD::SMULL;
1811 bool isN0ZExt = isZeroExtended(N0, DAG);
1812 bool isN1ZExt = isZeroExtended(N1, DAG);
1813 if (isN0ZExt && isN1ZExt)
1814 NewOpc = AArch64ISD::UMULL;
1815 else if (isN1SExt || isN1ZExt) {
1816 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
1817 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
1818 if (isN1SExt && isAddSubSExt(N0, DAG)) {
1819 NewOpc = AArch64ISD::SMULL;
1821 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
1822 NewOpc = AArch64ISD::UMULL;
1824 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
1826 NewOpc = AArch64ISD::UMULL;
1832 if (VT == MVT::v2i64)
1833 // Fall through to expand this. It is not legal.
1836 // Other vector multiplications are legal.
1841 // Legalize to an S/UMULL instruction
1844 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
1846 Op0 = skipExtensionForVectorMULL(N0, DAG);
1847 assert(Op0.getValueType().is64BitVector() &&
1848 Op1.getValueType().is64BitVector() &&
1849 "unexpected types for extended operands to VMULL");
1850 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
1852 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
1853 // isel lowering, to take advantage of no-stall back-to-back s/umul + s/umla.
1854 // This is beneficial for CPUs with accumulate forwarding such as Cortex-A53/A57.
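// Illustrative result for ((zext A) + (zext B)) * (zext C) with v8i8 inputs
// (a sketch; the exact instructions depend on later selection):
//   umull v0.8h, vA.8b, vC.8b
//   umlal v0.8h, vB.8b, vC.8b
// i.e. the ADD of the two UMULL nodes built below is expected to be matched
// to an accumulating multiply.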
1855 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
1856 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
1857 EVT Op1VT = Op1.getValueType();
1858 return DAG.getNode(N0->getOpcode(), DL, VT,
1859 DAG.getNode(NewOpc, DL, VT,
1860 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
1861 DAG.getNode(NewOpc, DL, VT,
1862 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
1865 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
1866 SelectionDAG &DAG) const {
1867 switch (Op.getOpcode()) {
1869 llvm_unreachable("unimplemented operand");
1872 return LowerBITCAST(Op, DAG);
1873 case ISD::GlobalAddress:
1874 return LowerGlobalAddress(Op, DAG);
1875 case ISD::GlobalTLSAddress:
1876 return LowerGlobalTLSAddress(Op, DAG);
1878 return LowerSETCC(Op, DAG);
1880 return LowerBR_CC(Op, DAG);
1882 return LowerSELECT(Op, DAG);
1883 case ISD::SELECT_CC:
1884 return LowerSELECT_CC(Op, DAG);
1885 case ISD::JumpTable:
1886 return LowerJumpTable(Op, DAG);
1887 case ISD::ConstantPool:
1888 return LowerConstantPool(Op, DAG);
1889 case ISD::BlockAddress:
1890 return LowerBlockAddress(Op, DAG);
1892 return LowerVASTART(Op, DAG);
1894 return LowerVACOPY(Op, DAG);
1896 return LowerVAARG(Op, DAG);
1901 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
1908 return LowerXALUO(Op, DAG);
1910 return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
1912 return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
1914 return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
1916 return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
1918 return LowerFP_ROUND(Op, DAG);
1919 case ISD::FP_EXTEND:
1920 return LowerFP_EXTEND(Op, DAG);
1921 case ISD::FRAMEADDR:
1922 return LowerFRAMEADDR(Op, DAG);
1923 case ISD::RETURNADDR:
1924 return LowerRETURNADDR(Op, DAG);
1925 case ISD::INSERT_VECTOR_ELT:
1926 return LowerINSERT_VECTOR_ELT(Op, DAG);
1927 case ISD::EXTRACT_VECTOR_ELT:
1928 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
1929 case ISD::BUILD_VECTOR:
1930 return LowerBUILD_VECTOR(Op, DAG);
1931 case ISD::VECTOR_SHUFFLE:
1932 return LowerVECTOR_SHUFFLE(Op, DAG);
1933 case ISD::EXTRACT_SUBVECTOR:
1934 return LowerEXTRACT_SUBVECTOR(Op, DAG);
1938 return LowerVectorSRA_SRL_SHL(Op, DAG);
1939 case ISD::SHL_PARTS:
1940 return LowerShiftLeftParts(Op, DAG);
1941 case ISD::SRL_PARTS:
1942 case ISD::SRA_PARTS:
1943 return LowerShiftRightParts(Op, DAG);
1945 return LowerCTPOP(Op, DAG);
1946 case ISD::FCOPYSIGN:
1947 return LowerFCOPYSIGN(Op, DAG);
1949 return LowerVectorAND(Op, DAG);
1951 return LowerVectorOR(Op, DAG);
1953 return LowerXOR(Op, DAG);
1955 return LowerPREFETCH(Op, DAG);
1956 case ISD::SINT_TO_FP:
1957 case ISD::UINT_TO_FP:
1958 return LowerINT_TO_FP(Op, DAG);
1959 case ISD::FP_TO_SINT:
1960 case ISD::FP_TO_UINT:
1961 return LowerFP_TO_INT(Op, DAG);
1963 return LowerFSINCOS(Op, DAG);
1965 return LowerMUL(Op, DAG);
1969 /// getFunctionAlignment - Return the Log2 alignment of this function.
1970 unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
1974 //===----------------------------------------------------------------------===//
1975 // Calling Convention Implementation
1976 //===----------------------------------------------------------------------===//
1978 #include "AArch64GenCallingConv.inc"
1980 /// Selects the correct CCAssignFn for a given CallingConvention value.
1981 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1982 bool IsVarArg) const {
1985 llvm_unreachable("Unsupported calling convention.");
1986 case CallingConv::WebKit_JS:
1987 return CC_AArch64_WebKit_JS;
1988 case CallingConv::GHC:
1989 return CC_AArch64_GHC;
1990 case CallingConv::C:
1991 case CallingConv::Fast:
1992 if (!Subtarget->isTargetDarwin())
1993 return CC_AArch64_AAPCS;
1994 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
1998 SDValue AArch64TargetLowering::LowerFormalArguments(
1999 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2000 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
2001 SmallVectorImpl<SDValue> &InVals) const {
2002 MachineFunction &MF = DAG.getMachineFunction();
2003 MachineFrameInfo *MFI = MF.getFrameInfo();
2005 // Assign locations to all of the incoming arguments.
2006 SmallVector<CCValAssign, 16> ArgLocs;
2007 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2010 // At this point, Ins[].VT may already be promoted to i32. To correctly
2011 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2012 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2013 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
2014 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
2015 // LocVT.
2016 unsigned NumArgs = Ins.size();
2017 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
2018 unsigned CurArgIdx = 0;
2019 for (unsigned i = 0; i != NumArgs; ++i) {
2020 MVT ValVT = Ins[i].VT;
2021 if (Ins[i].isOrigArg()) {
2022 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
2023 CurArgIdx = Ins[i].getOrigArgIndex();
2025 // Get type of the original argument.
2026 EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
2027 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
2028 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2029 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2031 else if (ActualMVT == MVT::i16)
2034 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2036 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
2037 assert(!Res && "Call operand has unhandled type");
2040 assert(ArgLocs.size() == Ins.size());
2041 SmallVector<SDValue, 16> ArgValues;
2042 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2043 CCValAssign &VA = ArgLocs[i];
2045 if (Ins[i].Flags.isByVal()) {
2046 // Byval is used for HFAs in the PCS, but the system should work in a
2047 // non-compliant manner for larger structs.
2048 EVT PtrTy = getPointerTy();
2049 int Size = Ins[i].Flags.getByValSize();
2050 unsigned NumRegs = (Size + 7) / 8;
2052 // FIXME: This works on big-endian for composite byvals, which are the common
2053 // case. It should also work for fundamental types.
2055 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
2056 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
2057 InVals.push_back(FrameIdxN);
2062 if (VA.isRegLoc()) {
2063 // Arguments stored in registers.
2064 EVT RegVT = VA.getLocVT();
2067 const TargetRegisterClass *RC;
2069 if (RegVT == MVT::i32)
2070 RC = &AArch64::GPR32RegClass;
2071 else if (RegVT == MVT::i64)
2072 RC = &AArch64::GPR64RegClass;
2073 else if (RegVT == MVT::f16)
2074 RC = &AArch64::FPR16RegClass;
2075 else if (RegVT == MVT::f32)
2076 RC = &AArch64::FPR32RegClass;
2077 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
2078 RC = &AArch64::FPR64RegClass;
2079 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
2080 RC = &AArch64::FPR128RegClass;
2082 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2084 // Transform the arguments in physical registers into virtual ones.
2085 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2086 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
2088 // If this is an 8, 16 or 32-bit value, it is really passed promoted
2089 // to 64 bits. Insert an assert[sz]ext to capture this, then
2090 // truncate to the right size.
2091 switch (VA.getLocInfo()) {
2093 llvm_unreachable("Unknown loc info!");
2094 case CCValAssign::Full:
2096 case CCValAssign::BCvt:
2097 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
2099 case CCValAssign::AExt:
2100 case CCValAssign::SExt:
2101 case CCValAssign::ZExt:
2102 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
2103 // nodes after our lowering.
2104 assert(RegVT == Ins[i].VT && "incorrect register location selected");
2108 InVals.push_back(ArgValue);
2110 } else { // VA.isRegLoc()
2111 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
2112 unsigned ArgOffset = VA.getLocMemOffset();
2113 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
2115 uint32_t BEAlign = 0;
2116 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
2117 !Ins[i].Flags.isInConsecutiveRegs())
2118 BEAlign = 8 - ArgSize;
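// Example (illustrative): a 4-byte integer passed on the stack on a
// big-endian target occupies the high half of its 8-byte slot, so
// BEAlign = 8 - 4 = 4 and the fixed object is created 4 bytes into the slot.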
2120 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
2122 // Create load nodes to retrieve arguments from the stack.
2123 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2126 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2127 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2128 MVT MemVT = VA.getValVT();
2130 switch (VA.getLocInfo()) {
2133 case CCValAssign::BCvt:
2134 MemVT = VA.getLocVT();
2136 case CCValAssign::SExt:
2137 ExtType = ISD::SEXTLOAD;
2139 case CCValAssign::ZExt:
2140 ExtType = ISD::ZEXTLOAD;
2142 case CCValAssign::AExt:
2143 ExtType = ISD::EXTLOAD;
2147 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
2148 MachinePointerInfo::getFixedStack(FI),
2149 MemVT, false, false, false, 0);
2151 InVals.push_back(ArgValue);
2157 if (!Subtarget->isTargetDarwin()) {
2158 // The AAPCS variadic function ABI is identical to the non-variadic
2159 // one. As a result there may be more arguments in registers and we should
2160 // save them for future reference.
2161 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
2164 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2165 // This will point to the next argument passed via stack.
2166 unsigned StackOffset = CCInfo.getNextStackOffset();
2167 // We currently pass all varargs at 8-byte alignment.
2168 StackOffset = ((StackOffset + 7) & ~7);
2169 AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
2172 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2173 unsigned StackArgSize = CCInfo.getNextStackOffset();
2174 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2175 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
2176 // This is a non-standard ABI so by fiat I say we're allowed to make full
2177 // use of the stack area to be popped, which must be aligned to 16 bytes in
2178 // any case:
2179 StackArgSize = RoundUpToAlignment(StackArgSize, 16);
2181 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
2182 // a multiple of 16.
2183 FuncInfo->setArgumentStackToRestore(StackArgSize);
2185 // This realignment carries over to the available bytes below. Our own
2186 // callers will guarantee the space is free by giving an aligned value to
2187 // CALLSEQ_START.
2188 }
2189 // Even if we're not expected to free up the space, it's useful to know how
2190 // much is there while considering tail calls (because we can reuse it).
2191 FuncInfo->setBytesInStackArgArea(StackArgSize);
2196 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
2197 SelectionDAG &DAG, SDLoc DL,
2198 SDValue &Chain) const {
2199 MachineFunction &MF = DAG.getMachineFunction();
2200 MachineFrameInfo *MFI = MF.getFrameInfo();
2201 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2203 SmallVector<SDValue, 8> MemOps;
2205 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
2206 AArch64::X3, AArch64::X4, AArch64::X5,
2207 AArch64::X6, AArch64::X7 };
2208 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
2209 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
2211 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
2213 if (GPRSaveSize != 0) {
2214 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
2216 SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
2218 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
2219 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
2220 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
2222 DAG.getStore(Val.getValue(1), DL, Val, FIN,
2223 MachinePointerInfo::getStack(i * 8), false, false, 0);
2224 MemOps.push_back(Store);
2225 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
2226 DAG.getConstant(8, getPointerTy()));
2229 FuncInfo->setVarArgsGPRIndex(GPRIdx);
2230 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
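// Example (illustrative): a variadic callee whose fixed arguments consume
// X0-X2 has FirstVariadicGPR = 3, so GPRSaveSize = 8 * 5 = 40 and X3-X7 are
// spilled to the save area for later retrieval via va_arg.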
2232 if (Subtarget->hasFPARMv8()) {
2233 static const MCPhysReg FPRArgRegs[] = {
2234 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
2235 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
2236 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
2237 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
2239 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
2241 if (FPRSaveSize != 0) {
2242 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
2244 SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
2246 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
2247 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
2248 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
2251 DAG.getStore(Val.getValue(1), DL, Val, FIN,
2252 MachinePointerInfo::getStack(i * 16), false, false, 0);
2253 MemOps.push_back(Store);
2254 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
2255 DAG.getConstant(16, getPointerTy()));
2258 FuncInfo->setVarArgsFPRIndex(FPRIdx);
2259 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
2262 if (!MemOps.empty()) {
2263 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
2267 /// LowerCallResult - Lower the result values of a call into the
2268 /// appropriate copies out of appropriate physical registers.
2269 SDValue AArch64TargetLowering::LowerCallResult(
2270 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2271 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
2272 SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2273 SDValue ThisVal) const {
2274 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
2275 ? RetCC_AArch64_WebKit_JS
2276 : RetCC_AArch64_AAPCS;
2277 // Assign locations to each value returned by this call.
2278 SmallVector<CCValAssign, 16> RVLocs;
2279 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2281 CCInfo.AnalyzeCallResult(Ins, RetCC);
2283 // Copy all of the result registers out of their specified physreg.
2284 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2285 CCValAssign VA = RVLocs[i];
2287 // Pass 'this' value directly from the argument to return value, to avoid
2288 // reg unit interference
2289 if (i == 0 && isThisReturn) {
2290 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
2291 "unexpected return calling convention register assignment");
2292 InVals.push_back(ThisVal);
2297 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2298 Chain = Val.getValue(1);
2299 InFlag = Val.getValue(2);
2301 switch (VA.getLocInfo()) {
2303 llvm_unreachable("Unknown loc info!");
2304 case CCValAssign::Full:
2306 case CCValAssign::BCvt:
2307 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2311 InVals.push_back(Val);
2317 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
2318 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2319 bool isCalleeStructRet, bool isCallerStructRet,
2320 const SmallVectorImpl<ISD::OutputArg> &Outs,
2321 const SmallVectorImpl<SDValue> &OutVals,
2322 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2323 // For CallingConv::C this function knows whether the ABI needs
2324 // changing. That's not true for other conventions so they will have to opt in
2325 // manually.
2326 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
2329 const MachineFunction &MF = DAG.getMachineFunction();
2330 const Function *CallerF = MF.getFunction();
2331 CallingConv::ID CallerCC = CallerF->getCallingConv();
2332 bool CCMatch = CallerCC == CalleeCC;
2334 // Byval parameters hand the function a pointer directly into the stack area
2335 // we want to reuse during a tail call. Working around this *is* possible (see
2336 // X86) but less efficient and uglier in LowerCall.
2337 for (Function::const_arg_iterator i = CallerF->arg_begin(),
2338 e = CallerF->arg_end();
2340 if (i->hasByValAttr())
2343 if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2344 if (IsTailCallConvention(CalleeCC) && CCMatch)
2349 // Externally-defined functions with weak linkage should not be
2350 // tail-called on AArch64 when the OS does not support dynamic
2351 // pre-emption of symbols, as the AAELF spec requires normal calls
2352 // to undefined weak functions to be replaced with a NOP or jump to the
2353 // next instruction. The behaviour of branch instructions in this
2354 // situation (as used for tail calls) is implementation-defined, so we
2355 // cannot rely on the linker replacing the tail call with a return.
2356 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2357 const GlobalValue *GV = G->getGlobal();
2358 const Triple TT(getTargetMachine().getTargetTriple());
2359 if (GV->hasExternalWeakLinkage() &&
2360 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2364 // Now we search for cases where we can use a tail call without changing the
2365 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
2366 // type of call.
2368 // I want anyone implementing a new calling convention to think long and hard
2369 // about this assert.
2370 assert((!isVarArg || CalleeCC == CallingConv::C) &&
2371 "Unexpected variadic calling convention");
2373 if (isVarArg && !Outs.empty()) {
2374 // At least two cases here: if caller is fastcc then we can't have any
2375 // memory arguments (we'd be expected to clean up the stack afterwards). If
2376 // caller is C then we could potentially use its argument area.
2378 // FIXME: for now we take the most conservative of these in both cases:
2379 // disallow all variadic memory operands.
2380 SmallVector<CCValAssign, 16> ArgLocs;
2381 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
2384 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
2385 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2386 if (!ArgLocs[i].isRegLoc())
2390 // If the calling conventions do not match, then we'd better make sure the
2391 // results are returned in the same way as what the caller expects.
2393 SmallVector<CCValAssign, 16> RVLocs1;
2394 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
2396 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
2398 SmallVector<CCValAssign, 16> RVLocs2;
2399 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
2401 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
2403 if (RVLocs1.size() != RVLocs2.size())
2405 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2406 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2408 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2410 if (RVLocs1[i].isRegLoc()) {
2411 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2414 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2420 // Nothing more to check if the callee is taking no arguments.
2421 if (Outs.empty())
2422 return true;
2424 SmallVector<CCValAssign, 16> ArgLocs;
2425 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
2428 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2430 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2432 // If the stack arguments for this call would fit into our own save area then
2433 // the call can be made tail.
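// Illustrative numbers: if the callee needs 16 bytes of stack-passed
// arguments and our own incoming argument area was 32 bytes, the tail call
// is allowed (16 <= 32); a callee needing 48 bytes would be rejected here.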
2434 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
2437 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
2439 MachineFrameInfo *MFI,
2440 int ClobberedFI) const {
2441 SmallVector<SDValue, 8> ArgChains;
2442 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
2443 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
2445 // Include the original chain at the beginning of the list. When this is
2446 // used by target LowerCall hooks, this helps legalize find the
2447 // CALLSEQ_BEGIN node.
2448 ArgChains.push_back(Chain);
2450 // Add a chain value for each stack-argument load that overlaps the clobbered slot.
2451 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
2452 UE = DAG.getEntryNode().getNode()->use_end();
2454 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
2455 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
2456 if (FI->getIndex() < 0) {
2457 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
2458 int64_t InLastByte = InFirstByte;
2459 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
2461 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
2462 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
2463 ArgChains.push_back(SDValue(L, 1));
2466 // Build a tokenfactor for all the chains.
2467 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
2470 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
2471 bool TailCallOpt) const {
2472 return CallCC == CallingConv::Fast && TailCallOpt;
2475 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
2476 return CallCC == CallingConv::Fast;
2479 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
2480 /// and add input and output parameter nodes.
2482 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
2483 SmallVectorImpl<SDValue> &InVals) const {
2484 SelectionDAG &DAG = CLI.DAG;
2486 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2487 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2488 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2489 SDValue Chain = CLI.Chain;
2490 SDValue Callee = CLI.Callee;
2491 bool &IsTailCall = CLI.IsTailCall;
2492 CallingConv::ID CallConv = CLI.CallConv;
2493 bool IsVarArg = CLI.IsVarArg;
2495 MachineFunction &MF = DAG.getMachineFunction();
2496 bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2497 bool IsThisReturn = false;
2499 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2500 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2501 bool IsSibCall = false;
2504 // Check if it's really possible to do a tail call.
2505 IsTailCall = isEligibleForTailCallOptimization(
2506 Callee, CallConv, IsVarArg, IsStructRet,
2507 MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
2508 if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
2509 report_fatal_error("failed to perform tail call elimination on a call "
2510 "site marked musttail");
2512 // A sibling call is one where we're under the usual C ABI and not planning
2513 // to change that but can still do a tail call:
2514 if (!TailCallOpt && IsTailCall)
2521 // Analyze operands of the call, assigning locations to each operand.
2522 SmallVector<CCValAssign, 16> ArgLocs;
2523 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
2527 // Handle fixed and variable vector arguments differently.
2528 // Variable vector arguments always go into memory.
2529 unsigned NumArgs = Outs.size();
2531 for (unsigned i = 0; i != NumArgs; ++i) {
2532 MVT ArgVT = Outs[i].VT;
2533 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
2534 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
2535 /*IsVarArg=*/ !Outs[i].IsFixed);
2536 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
2537 assert(!Res && "Call operand has unhandled type");
2541 // At this point, Outs[].VT may already be promoted to i32. To correctly
2542 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2543 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2544 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
2545 // we use a special version of AnalyzeCallOperands to pass in ValVT and
2546 // LocVT.
2547 unsigned NumArgs = Outs.size();
2548 for (unsigned i = 0; i != NumArgs; ++i) {
2549 MVT ValVT = Outs[i].VT;
2550 // Get type of the original argument.
2551 EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
2552 /*AllowUnknown*/ true);
2553 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
2554 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
2555 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2556 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2558 else if (ActualMVT == MVT::i16)
2561 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2562 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
2563 assert(!Res && "Call operand has unhandled type");
2568 // Get a count of how many bytes are to be pushed on the stack.
2569 unsigned NumBytes = CCInfo.getNextStackOffset();
2572 // Since we're not changing the ABI to make this a tail call, the memory
2573 // operands are already available in the caller's incoming argument space.
2577 // FPDiff is the byte offset of the call's argument area from the callee's.
2578 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2579 // by this amount for a tail call. In a sibling call it must be 0 because the
2580 // caller will deallocate the entire stack and the callee still expects its
2581 // arguments to begin at SP+0. Completely unused for non-tail calls.
2584 if (IsTailCall && !IsSibCall) {
2585 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
2587 // Since callee will pop argument stack as a tail call, we must keep the
2588 // popped size 16-byte aligned.
2589 NumBytes = RoundUpToAlignment(NumBytes, 16);
2591 // FPDiff will be negative if this tail call requires more space than we
2592 // would automatically have in our incoming argument space. Positive if we
2593 // can actually shrink the stack.
2594 FPDiff = NumReusableBytes - NumBytes;
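// Illustrative numbers: with NumReusableBytes = 32 and NumBytes = 48,
// FPDiff = -16, so each outgoing argument store below is placed 16 bytes
// lower than the corresponding incoming-argument offset.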
2596 // The stack pointer must be 16-byte aligned at all times it's used for a
2597 // memory operation, which in practice means at *all* times and in
2598 // particular across call boundaries. Therefore our own arguments started at
2599 // a 16-byte aligned SP and the delta applied for the tail call should
2600 // satisfy the same constraint.
2601 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
2604 // Adjust the stack pointer for the new arguments...
2605 // These operations are automatically eliminated by the prolog/epilog pass
2608 DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
2610 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
2612 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2613 SmallVector<SDValue, 8> MemOpChains;
2615 // Walk the register/memloc assignments, inserting copies/loads.
2616 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2617 ++i, ++realArgIdx) {
2618 CCValAssign &VA = ArgLocs[i];
2619 SDValue Arg = OutVals[realArgIdx];
2620 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2622 // Promote the value if needed.
2623 switch (VA.getLocInfo()) {
2625 llvm_unreachable("Unknown loc info!");
2626 case CCValAssign::Full:
2628 case CCValAssign::SExt:
2629 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2631 case CCValAssign::ZExt:
2632 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2634 case CCValAssign::AExt:
2635 if (Outs[realArgIdx].ArgVT == MVT::i1) {
2636 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
2637 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
2638 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
2640 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2642 case CCValAssign::BCvt:
2643 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2645 case CCValAssign::FPExt:
2646 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2650 if (VA.isRegLoc()) {
2651 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
2652 assert(VA.getLocVT() == MVT::i64 &&
2653 "unexpected calling convention register assignment");
2654 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
2655 "unexpected use of 'returned'");
2656 IsThisReturn = true;
2658 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2660 assert(VA.isMemLoc());
2663 MachinePointerInfo DstInfo;
2665 // FIXME: This works on big-endian for composite byvals, which are the
2666 // common case. It should also work for fundamental types.
2667 uint32_t BEAlign = 0;
2668 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
2669 : VA.getValVT().getSizeInBits();
2670 OpSize = (OpSize + 7) / 8;
2671 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
2672 !Flags.isInConsecutiveRegs()) {
2674 BEAlign = 8 - OpSize;
2676 unsigned LocMemOffset = VA.getLocMemOffset();
2677 int32_t Offset = LocMemOffset + BEAlign;
2678 SDValue PtrOff = DAG.getIntPtrConstant(Offset);
2679 PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
2682 Offset = Offset + FPDiff;
2683 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2685 DstAddr = DAG.getFrameIndex(FI, getPointerTy());
2686 DstInfo = MachinePointerInfo::getFixedStack(FI);
2688 // Make sure any stack arguments overlapping with where we're storing
2689 // are loaded before this eventual operation. Otherwise they'll be
2690 // clobbered.
2691 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
2693 SDValue PtrOff = DAG.getIntPtrConstant(Offset);
2695 DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
2696 DstInfo = MachinePointerInfo::getStack(LocMemOffset);
2699 if (Outs[i].Flags.isByVal()) {
2701 DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
2702 SDValue Cpy = DAG.getMemcpy(
2703 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2705 /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
2707 MemOpChains.push_back(Cpy);
2709 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
2710 // promoted to a legal register type i32, we should truncate Arg back to
2711 // i1/i8/i16.
2712 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
2713 VA.getValVT() == MVT::i16)
2714 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
2717 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
2718 MemOpChains.push_back(Store);
2723 if (!MemOpChains.empty())
2724 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2726 // Build a sequence of copy-to-reg nodes chained together with token chain
2727 // and flag operands which copy the outgoing args into the appropriate regs.
2729 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2730 Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
2731 RegsToPass[i].second, InFlag);
2732 InFlag = Chain.getValue(1);
2735 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2736 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2737 // node so that legalize doesn't hack it.
2738 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
2739 Subtarget->isTargetMachO()) {
2740 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2741 const GlobalValue *GV = G->getGlobal();
2742 bool InternalLinkage = GV->hasInternalLinkage();
2743 if (InternalLinkage)
2744 Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
2746 Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
2748 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
2750 } else if (ExternalSymbolSDNode *S =
2751 dyn_cast<ExternalSymbolSDNode>(Callee)) {
2752 const char *Sym = S->getSymbol();
2754 DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
2755 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
2757 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2758 const GlobalValue *GV = G->getGlobal();
2759 Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
2760 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2761 const char *Sym = S->getSymbol();
2762 Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
2765 // We don't usually want to end the call-sequence here because we would tidy
2766 // the frame up *after* the call, however in the ABI-changing tail-call case
2767 // we've carefully laid out the parameters so that when sp is reset they'll be
2768 // in the correct location.
2769 if (IsTailCall && !IsSibCall) {
2770 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2771 DAG.getIntPtrConstant(0, true), InFlag, DL);
2772 InFlag = Chain.getValue(1);
2775 std::vector<SDValue> Ops;
2776 Ops.push_back(Chain);
2777 Ops.push_back(Callee);
2780 // Each tail call may have to adjust the stack by a different amount, so
2781 // this information must travel along with the operation for eventual
2782 // consumption by emitEpilogue.
2783 Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
2786 // Add argument registers to the end of the list so that they are known live
2787 // into the call.
2788 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2789 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2790 RegsToPass[i].second.getValueType()));
2792 // Add a register mask operand representing the call-preserved registers.
2793 const uint32_t *Mask;
2794 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
2796 // For 'this' returns, use the X0-preserving mask if applicable
2797 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
2799 IsThisReturn = false;
2800 Mask = TRI->getCallPreservedMask(MF, CallConv);
2803 Mask = TRI->getCallPreservedMask(MF, CallConv);
2805 assert(Mask && "Missing call preserved mask for calling convention");
2806 Ops.push_back(DAG.getRegisterMask(Mask));
2808 if (InFlag.getNode())
2809 Ops.push_back(InFlag);
2811 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2813 // If we're doing a tail call, use a TC_RETURN here rather than an
2814 // actual call instruction.
2816 return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
2818 // Returns a chain and a flag for retval copy to use.
2819 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
2820 InFlag = Chain.getValue(1);
2822 uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
2823 ? RoundUpToAlignment(NumBytes, 16)
2826 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2827 DAG.getIntPtrConstant(CalleePopBytes, true),
2830 InFlag = Chain.getValue(1);
2832 // Handle result values, copying them out of physregs into vregs that we
2833 // return.
2834 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2835 InVals, IsThisReturn,
2836 IsThisReturn ? OutVals[0] : SDValue());
2839 bool AArch64TargetLowering::CanLowerReturn(
2840 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2841 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2842 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
2843 ? RetCC_AArch64_WebKit_JS
2844 : RetCC_AArch64_AAPCS;
2845 SmallVector<CCValAssign, 16> RVLocs;
2846 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2847 return CCInfo.CheckReturn(Outs, RetCC);
2851 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2853 const SmallVectorImpl<ISD::OutputArg> &Outs,
2854 const SmallVectorImpl<SDValue> &OutVals,
2855 SDLoc DL, SelectionDAG &DAG) const {
2856 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
2857 ? RetCC_AArch64_WebKit_JS
2858 : RetCC_AArch64_AAPCS;
2859 SmallVector<CCValAssign, 16> RVLocs;
2860 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2862 CCInfo.AnalyzeReturn(Outs, RetCC);
2864 // Copy the result values into the output registers.
2866 SmallVector<SDValue, 4> RetOps(1, Chain);
2867 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
2868 ++i, ++realRVLocIdx) {
2869 CCValAssign &VA = RVLocs[i];
2870 assert(VA.isRegLoc() && "Can only return in registers!");
2871 SDValue Arg = OutVals[realRVLocIdx];
2873 switch (VA.getLocInfo()) {
2875 llvm_unreachable("Unknown loc info!");
2876 case CCValAssign::Full:
2877 if (Outs[i].ArgVT == MVT::i1) {
2878 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
2879 // value. This is strictly redundant on Darwin (which uses "zeroext
2880 // i1"), but will be optimised out before ISel.
2881 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
2882 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2885 case CCValAssign::BCvt:
2886 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2890 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2891 Flag = Chain.getValue(1);
2892 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2895 RetOps[0] = Chain; // Update chain.
2897 // Add the flag if we have it.
2899 RetOps.push_back(Flag);
2901 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
2904 //===----------------------------------------------------------------------===//
2905 // Other Lowering Code
2906 //===----------------------------------------------------------------------===//
2908 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
2909 SelectionDAG &DAG) const {
2910 EVT PtrVT = getPointerTy();
2912 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2913 const GlobalValue *GV = GN->getGlobal();
2914 unsigned char OpFlags =
2915 Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
2917 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
2918 "unexpected offset in global node");
2920 // This also catches the large code model case for Darwin.
2921 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2922 SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
2923 // FIXME: Once remat is capable of dealing with instructions with register
2924 // operands, expand this into two nodes instead of using a wrapper node.
2925 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
2928 if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
2929 assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
2930 "use of MO_CONSTPOOL only supported on small model");
2931 SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
2932 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
2933 unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
2934 SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
2935 SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
2936 SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
2937 MachinePointerInfo::getConstantPool(),
2938 /*isVolatile=*/ false,
2939 /*isNonTemporal=*/ true,
2940 /*isInvariant=*/ true, 8);
2941 if (GN->getOffset() != 0)
2942 return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
2943 DAG.getConstant(GN->getOffset(), PtrVT));
2947 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2948 const unsigned char MO_NC = AArch64II::MO_NC;
2950 AArch64ISD::WrapperLarge, DL, PtrVT,
2951 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
2952 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
2953 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
2954 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
2956 // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
2957 // the only correct model on Darwin.
2958 SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2959 OpFlags | AArch64II::MO_PAGE);
2960 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
2961 SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
2963 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
2964 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
2968 /// \brief Convert a TLS address reference into the correct sequence of loads
2969 /// and calls to compute the variable's address (for Darwin, currently) and
2970 /// return an SDValue containing the final node.
2972 /// Darwin only has one TLS scheme which must be capable of dealing with the
2973 /// fully general situation, in the worst case. This means:
2974 /// + "extern __thread" declaration.
2975 /// + Defined in a possibly unknown dynamic library.
2977 /// The general system is that each __thread variable has a [3 x i64] descriptor
2978 /// which contains information used by the runtime to calculate the address. The
2979 /// only part of this the compiler needs to know about is the first xword, which
2980 /// contains a function pointer that must be called with the address of the
2981 /// entire descriptor in "x0".
2983 /// Since this descriptor may be in a different unit, in general even the
2984 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
2985 /// is:
2986 /// adrp x0, _var@TLVPPAGE
2987 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
2988 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
2989 /// ; the function pointer
2990 /// blr x1 ; Uses descriptor address in x0
2991 /// ; Address of _var is now in x0.
2993 /// If the address of _var's descriptor *is* known to the linker, then it can
2994 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
2995 /// a slight efficiency gain.
2997 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
2998 SelectionDAG &DAG) const {
2999 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
3002 MVT PtrVT = getPointerTy();
3003 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3006 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3007 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
3009 // The first entry in the descriptor is a function pointer that we must call
3010 // to obtain the address of the variable.
3011 SDValue Chain = DAG.getEntryNode();
3012 SDValue FuncTLVGet =
3013 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
3014 false, true, true, 8);
3015 Chain = FuncTLVGet.getValue(1);
3017 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3018 MFI->setAdjustsStack(true);
3020 // TLS calls preserve all registers except those that absolutely must be
3021 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3022 // silly).
3023 const uint32_t *Mask =
3024 Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
3026 // Finally, we can make the call. This is just a degenerate version of a
3027 // normal AArch64 call node: x0 takes the address of the descriptor, and
3028 // returns the address of the variable in this thread.
3029 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
3031 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3032 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
3033 DAG.getRegisterMask(Mask), Chain.getValue(1));
3034 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
3037 /// When accessing thread-local variables under either the general-dynamic or
3038 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
3039 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
3040 /// is a function pointer to carry out the resolution.
3042 /// The sequence is:
3043 /// adrp x0, :tlsdesc:var
3044 /// ldr x1, [x0, #:tlsdesc_lo12:var]
3045 /// add x0, x0, #:tlsdesc_lo12:var
3046 /// .tlsdesccall var
3047 /// blr x1
3048 /// (TPIDR_EL0 offset now in x0)
3050 /// The above sequence must be produced unscheduled, to enable the linker to
3051 /// optimize/relax this sequence.
3052 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
3053 /// above sequence, and expanded really late in the compilation flow, to ensure
3054 /// the sequence is produced as per above.
3055 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
3056 SelectionDAG &DAG) const {
3057 EVT PtrVT = getPointerTy();
3059 SDValue Chain = DAG.getEntryNode();
3060 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3062 SmallVector<SDValue, 2> Ops;
3063 Ops.push_back(Chain);
3064 Ops.push_back(SymAddr);
3066 Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
3067 SDValue Glue = Chain.getValue(1);
3069 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
3073 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
3074 SelectionDAG &DAG) const {
3075 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
3076 assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
3077 "ELF TLS only supported in small memory model");
3078 // Different choices can be made for the maximum size of the TLS area for a
3079 // module. For the small address model, the default TLS size is 16MiB and the
3080 // maximum TLS size is 4GiB.
3081 // FIXME: add -mtls-size command line option and make it control the 16MiB
3082 // vs. 4GiB code sequence generation.
3083 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3085 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
3086 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
3087 if (Model == TLSModel::LocalDynamic)
3088 Model = TLSModel::GeneralDynamic;
3092 EVT PtrVT = getPointerTy();
3094 const GlobalValue *GV = GA->getGlobal();
3096 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
3098 if (Model == TLSModel::LocalExec) {
3099 SDValue HiVar = DAG.getTargetGlobalAddress(
3100 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3101 SDValue LoVar = DAG.getTargetGlobalAddress(
3103 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
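// Sketch of the local-exec sequence being built here (exact operand
// printing may differ slightly):
//   mrs  x0, TPIDR_EL0              ; thread pointer
//   add  x0, x0, :tprel_hi12:var    ; high 12 bits of the TP offset
//   add  x0, x0, :tprel_lo12_nc:var ; low 12 bits of the TP offset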
3105 SDValue TPWithOff_lo =
3106 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
3107 HiVar, DAG.getTargetConstant(0, MVT::i32)),
3110 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
3111 LoVar, DAG.getTargetConstant(0, MVT::i32)),
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
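    // For reference only: the LOADgot above is typically emitted as an
    // adrp/ldr pair against the variable's GOT entry, roughly
    //   adrp xN, :gottprel:var
    //   ldr  xN, [xN, #:gottprel_lo12:var]
    // and the add of TPIDR_EL0 happens at the bottom of this function.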
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
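    // For reference only: the two ADDXri nodes built below typically print as
    //   add xN, xN, #:dtprel_hi12:var
    //   add xN, xN, #:dtprel_lo12_nc:var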
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}

SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  else if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}

SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
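  // For example, an ordered less-than of two f128 values typically becomes a
  // call to a soft-float comparison routine (such as __lttf2) whose i32 result
  // is then compared against zero on the integer path below.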
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
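  // For example, a branch on the overflow result of llvm.sadd.with.overflow
  // can be emitted as an ADDS followed by a single B.VS, rather than
  // materializing the overflow bit into a register first.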
  unsigned Opc = LHS.getOpcode();
  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->isOne() &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
           "Unexpected condition code.");
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
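        // For example, a branch on "(x & 8) == 0" can be emitted as
        //   tbz x, #3, dest
        // instead of an AND followed by a CBZ.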
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
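        // For example, a branch on "x s< 0" with an i64 x can be emitted as
        //   tbnz x, #63, dest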
        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(Mask, MVT::i64), Dest);
      }
    }
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(Mask, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
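  // For example, an ordered-and-not-equal (SETONE) compare has no single
  // AArch64 condition code, so it is typically lowered to one FCMP followed by
  // two conditional branches (e.g. B.MI and B.GT).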
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}

SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {