1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a selection DAG.
13 //===----------------------------------------------------------------------===//
16 #include "X86InstrBuilder.h"
17 #include "X86ISelLowering.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/CallingConv.h"
20 #include "llvm/Constants.h"
21 #include "llvm/DerivedTypes.h"
22 #include "llvm/GlobalAlias.h"
23 #include "llvm/GlobalVariable.h"
24 #include "llvm/Function.h"
25 #include "llvm/Instructions.h"
26 #include "llvm/Intrinsics.h"
27 #include "llvm/LLVMContext.h"
28 #include "llvm/ADT/BitVector.h"
29 #include "llvm/ADT/VectorExtras.h"
30 #include "llvm/CodeGen/MachineFrameInfo.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineModuleInfo.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/PseudoSourceValue.h"
36 #include "llvm/Support/MathExtras.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/ErrorHandling.h"
39 #include "llvm/Target/TargetLoweringObjectFile.h"
40 #include "llvm/Target/TargetOptions.h"
41 #include "llvm/ADT/SmallSet.h"
42 #include "llvm/ADT/StringExtras.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/raw_ostream.h"
48 static cl::opt<bool> DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
50 // Forward declarations.
51 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2);
54 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
55 switch (TM.getSubtarget<X86Subtarget>().TargetType) {
56 default: llvm_unreachable("unknown subtarget type");
57 case X86Subtarget::isDarwin:
58 return new TargetLoweringObjectFileMachO();
59 case X86Subtarget::isELF:
60 return new TargetLoweringObjectFileELF();
61 case X86Subtarget::isMingw:
62 case X86Subtarget::isCygwin:
63 case X86Subtarget::isWindows:
64 return new TargetLoweringObjectFileCOFF();
69 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
70 : TargetLowering(TM, createTLOF(TM)) {
71 Subtarget = &TM.getSubtarget<X86Subtarget>();
72 X86ScalarSSEf64 = Subtarget->hasSSE2();
73 X86ScalarSSEf32 = Subtarget->hasSSE1();
74 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
76 RegInfo = TM.getRegisterInfo();
79 // Set up the TargetLowering object.
81 // X86 is weird, it always uses i8 for shift amounts and setcc results.
82 setShiftAmountType(MVT::i8);
83 setBooleanContents(ZeroOrOneBooleanContent);
84 setSchedulingPreference(SchedulingForRegPressure);
85 setStackPointerRegisterToSaveRestore(X86StackPtr);
87 if (Subtarget->isTargetDarwin()) {
88 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
89 setUseUnderscoreSetJmp(false);
90 setUseUnderscoreLongJmp(false);
91 } else if (Subtarget->isTargetMingw()) {
92 // The MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore).
93 setUseUnderscoreSetJmp(true);
94 setUseUnderscoreLongJmp(false);
96 setUseUnderscoreSetJmp(true);
97 setUseUnderscoreLongJmp(true);
100 // Set up the register classes.
101 addRegisterClass(MVT::i8, X86::GR8RegisterClass);
102 addRegisterClass(MVT::i16, X86::GR16RegisterClass);
103 addRegisterClass(MVT::i32, X86::GR32RegisterClass);
104 if (Subtarget->is64Bit())
105 addRegisterClass(MVT::i64, X86::GR64RegisterClass);
107 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
109 // We don't accept any truncstore of integer registers.
110 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
111 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
112 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
113 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
114 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
115 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
117 // SETOEQ and SETUNE require checking two conditions.
118 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
119 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
120 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
121 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
122 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
123 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
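// (An unordered compare sets PF on x86, so OEQ needs ZF set and PF clear, while
// UNE needs ZF clear or PF set; neither maps to a single flag condition.)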
125 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this operation.
127 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
128 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
129 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
131 if (Subtarget->is64Bit()) {
132 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
133 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
134 } else if (!UseSoftFloat) {
135 if (X86ScalarSSEf64) {
136 // We have an impenetrably clever algorithm for ui64->double only.
137 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
139 // We have an algorithm for SSE2, and we turn this into a 64-bit
140 // FILD for other targets.
141 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
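// (A u32 zero-extended to i64 is always non-negative, so the signed 64-bit FILD
// of that value produces the exact result.)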
144 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this operation.
146 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
147 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
150 // SSE has no i16 to fp conversion, only i32
151 if (X86ScalarSSEf32) {
152 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
153 // f32 and f64 cases are Legal, f80 case is not
154 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
156 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
157 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
160 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
161 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
164 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
165 // are Legal, f80 is custom lowered.
166 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
167 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
169 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have this operation.
171 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
172 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
174 if (X86ScalarSSEf32) {
175 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
176 // f32 and f64 cases are Legal, f80 case is not
177 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
179 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
180 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
183 // Handle FP_TO_UINT by promoting the destination to a larger signed conversion.
185 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
186 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
187 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
189 if (Subtarget->is64Bit()) {
190 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
191 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
192 } else if (!UseSoftFloat) {
193 if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
194 // Expand FP_TO_UINT into a select.
195 // FIXME: We would like to use a Custom expander here eventually to do
196 // the optimal thing for SSE vs. the default expansion in the legalizer.
197 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
199 // With SSE3 we can use fisttpll to convert to a signed i64; without
200 // SSE, we're stuck with a fistpll.
201 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
204 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
205 if (!X86ScalarSSEf64) {
206 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
207 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
210 // Scalar integer divide and remainder are lowered to use operations that
211 // produce two results, to match the available instructions. This exposes
212 // the two-result form to trivial CSE, which is able to combine x/y and x%y
213 // into a single instruction.
215 // Scalar integer multiply-high is also lowered to use two-result
216 // operations, to match the available instructions. However, plain multiply
217 // (low) operations are left as Legal, as there are single-result
218 // instructions for this in x86. Using the two-result multiply instructions
219 // when both high and low results are needed must be arranged by dagcombine.
220 setOperationAction(ISD::MULHS , MVT::i8 , Expand);
221 setOperationAction(ISD::MULHU , MVT::i8 , Expand);
222 setOperationAction(ISD::SDIV , MVT::i8 , Expand);
223 setOperationAction(ISD::UDIV , MVT::i8 , Expand);
224 setOperationAction(ISD::SREM , MVT::i8 , Expand);
225 setOperationAction(ISD::UREM , MVT::i8 , Expand);
226 setOperationAction(ISD::MULHS , MVT::i16 , Expand);
227 setOperationAction(ISD::MULHU , MVT::i16 , Expand);
228 setOperationAction(ISD::SDIV , MVT::i16 , Expand);
229 setOperationAction(ISD::UDIV , MVT::i16 , Expand);
230 setOperationAction(ISD::SREM , MVT::i16 , Expand);
231 setOperationAction(ISD::UREM , MVT::i16 , Expand);
232 setOperationAction(ISD::MULHS , MVT::i32 , Expand);
233 setOperationAction(ISD::MULHU , MVT::i32 , Expand);
234 setOperationAction(ISD::SDIV , MVT::i32 , Expand);
235 setOperationAction(ISD::UDIV , MVT::i32 , Expand);
236 setOperationAction(ISD::SREM , MVT::i32 , Expand);
237 setOperationAction(ISD::UREM , MVT::i32 , Expand);
238 setOperationAction(ISD::MULHS , MVT::i64 , Expand);
239 setOperationAction(ISD::MULHU , MVT::i64 , Expand);
240 setOperationAction(ISD::SDIV , MVT::i64 , Expand);
241 setOperationAction(ISD::UDIV , MVT::i64 , Expand);
242 setOperationAction(ISD::SREM , MVT::i64 , Expand);
243 setOperationAction(ISD::UREM , MVT::i64 , Expand);
245 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
246 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
247 setOperationAction(ISD::BR_CC , MVT::Other, Expand);
248 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
249 if (Subtarget->is64Bit())
250 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
251 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
252 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
253 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
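// There is no instruction that sign-extends from a single bit, so the i1 case is
// expanded into a shift-left/arithmetic-shift-right pair.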
254 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
255 setOperationAction(ISD::FREM , MVT::f32 , Expand);
256 setOperationAction(ISD::FREM , MVT::f64 , Expand);
257 setOperationAction(ISD::FREM , MVT::f80 , Expand);
258 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
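// The custom lowering stores the x87 control word (FNSTCW) and maps its
// rounding-control bits onto the FLT_ROUNDS encoding.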
260 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
261 setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
262 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
263 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
264 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
265 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
266 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
267 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
268 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
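// BSF/BSR leave the destination undefined for a zero input, so the custom
// CTTZ/CTLZ lowering adds a CMOV to supply the result in that case.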
269 if (Subtarget->is64Bit()) {
270 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
271 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
272 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
275 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
276 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
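// There is no 16-bit BSWAP instruction; the i16 case expands to shifts, which
// amounts to a rotate by 8.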
278 // These should be promoted to a larger select which is supported.
279 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
280 setOperationAction(ISD::SELECT , MVT::i8 , Promote);
281 // X86 wants to expand cmov itself.
282 setOperationAction(ISD::SELECT , MVT::i16 , Custom);
283 setOperationAction(ISD::SELECT , MVT::i32 , Custom);
284 setOperationAction(ISD::SELECT , MVT::f32 , Custom);
285 setOperationAction(ISD::SELECT , MVT::f64 , Custom);
286 setOperationAction(ISD::SELECT , MVT::f80 , Custom);
287 setOperationAction(ISD::SETCC , MVT::i8 , Custom);
288 setOperationAction(ISD::SETCC , MVT::i16 , Custom);
289 setOperationAction(ISD::SETCC , MVT::i32 , Custom);
290 setOperationAction(ISD::SETCC , MVT::f32 , Custom);
291 setOperationAction(ISD::SETCC , MVT::f64 , Custom);
292 setOperationAction(ISD::SETCC , MVT::f80 , Custom);
293 if (Subtarget->is64Bit()) {
294 setOperationAction(ISD::SELECT , MVT::i64 , Custom);
295 setOperationAction(ISD::SETCC , MVT::i64 , Custom);
297 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
300 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
301 setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
302 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
303 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
304 if (Subtarget->is64Bit())
305 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
306 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
307 if (Subtarget->is64Bit()) {
308 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
309 setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
310 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
311 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
313 // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
314 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
315 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
316 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
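// The *_PARTS nodes implement double-register shifts; they are lowered to
// SHLD/SHRD plus a conditional move for shift amounts of 32 or more.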
317 if (Subtarget->is64Bit()) {
318 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
319 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
320 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
323 if (Subtarget->hasSSE1())
324 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
326 if (!Subtarget->hasSSE2())
327 setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);
329 // Expand certain atomics
330 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
331 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
332 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
333 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
335 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
336 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
337 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
338 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
340 if (!Subtarget->is64Bit()) {
341 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
342 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
343 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
344 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
345 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
346 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
347 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
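// On 32-bit targets these 64-bit atomic read-modify-write operations are
// lowered to LOCK CMPXCHG8B loops.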
350 // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
351 setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
352 // FIXME - use subtarget debug flags
353 if (!Subtarget->isTargetDarwin() &&
354 !Subtarget->isTargetELF() &&
355 !Subtarget->isTargetCygMing()) {
356 setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
357 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
360 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
361 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
362 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
363 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
364 if (Subtarget->is64Bit()) {
365 setExceptionPointerRegister(X86::RAX);
366 setExceptionSelectorRegister(X86::RDX);
368 setExceptionPointerRegister(X86::EAX);
369 setExceptionSelectorRegister(X86::EDX);
371 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
372 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
374 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
376 setOperationAction(ISD::TRAP, MVT::Other, Legal);
378 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
379 setOperationAction(ISD::VASTART , MVT::Other, Custom);
380 setOperationAction(ISD::VAEND , MVT::Other, Expand);
381 if (Subtarget->is64Bit()) {
382 setOperationAction(ISD::VAARG , MVT::Other, Custom);
383 setOperationAction(ISD::VACOPY , MVT::Other, Custom);
385 setOperationAction(ISD::VAARG , MVT::Other, Expand);
386 setOperationAction(ISD::VACOPY , MVT::Other, Expand);
389 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
390 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
391 if (Subtarget->is64Bit())
392 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
393 if (Subtarget->isTargetCygMing())
394 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
396 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
398 if (!UseSoftFloat && X86ScalarSSEf64) {
399 // f32 and f64 use SSE.
400 // Set up the FP register classes.
401 addRegisterClass(MVT::f32, X86::FR32RegisterClass);
402 addRegisterClass(MVT::f64, X86::FR64RegisterClass);
404 // Use ANDPD to simulate FABS.
405 setOperationAction(ISD::FABS , MVT::f64, Custom);
406 setOperationAction(ISD::FABS , MVT::f32, Custom);
408 // Use XORP to simulate FNEG.
409 setOperationAction(ISD::FNEG , MVT::f64, Custom);
410 setOperationAction(ISD::FNEG , MVT::f32, Custom);
412 // Use ANDPD and ORPD to simulate FCOPYSIGN.
413 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
414 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
416 // We don't support sin/cos/fmod
417 setOperationAction(ISD::FSIN , MVT::f64, Expand);
418 setOperationAction(ISD::FCOS , MVT::f64, Expand);
419 setOperationAction(ISD::FSIN , MVT::f32, Expand);
420 setOperationAction(ISD::FCOS , MVT::f32, Expand);
422 // Expand FP immediates into loads from the stack, except for the special cases we handle.
424 addLegalFPImmediate(APFloat(+0.0)); // xorpd
425 addLegalFPImmediate(APFloat(+0.0f)); // xorps
426 } else if (!UseSoftFloat && X86ScalarSSEf32) {
427 // Use SSE for f32, x87 for f64.
428 // Set up the FP register classes.
429 addRegisterClass(MVT::f32, X86::FR32RegisterClass);
430 addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
432 // Use ANDPS to simulate FABS.
433 setOperationAction(ISD::FABS , MVT::f32, Custom);
435 // Use XORP to simulate FNEG.
436 setOperationAction(ISD::FNEG , MVT::f32, Custom);
438 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
440 // Use ANDPS and ORPS to simulate FCOPYSIGN.
441 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
442 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
444 // We don't support sin/cos/fmod
445 setOperationAction(ISD::FSIN , MVT::f32, Expand);
446 setOperationAction(ISD::FCOS , MVT::f32, Expand);
448 // Special cases we handle for FP constants.
449 addLegalFPImmediate(APFloat(+0.0f)); // xorps
450 addLegalFPImmediate(APFloat(+0.0)); // FLD0
451 addLegalFPImmediate(APFloat(+1.0)); // FLD1
452 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
453 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
456 setOperationAction(ISD::FSIN , MVT::f64 , Expand);
457 setOperationAction(ISD::FCOS , MVT::f64 , Expand);
459 } else if (!UseSoftFloat) {
460 // f32 and f64 in x87.
461 // Set up the FP register classes.
462 addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
463 addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
465 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
466 setOperationAction(ISD::UNDEF, MVT::f32, Expand);
467 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
468 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
471 setOperationAction(ISD::FSIN , MVT::f64 , Expand);
472 setOperationAction(ISD::FCOS , MVT::f64 , Expand);
474 addLegalFPImmediate(APFloat(+0.0)); // FLD0
475 addLegalFPImmediate(APFloat(+1.0)); // FLD1
476 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
477 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
478 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
479 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
480 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
481 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
484 // Long double always uses X87.
486 addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
487 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
488 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
491 APFloat TmpFlt(+0.0);
492 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
494 addLegalFPImmediate(TmpFlt); // FLD0
496 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
497 APFloat TmpFlt2(+1.0);
498 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
500 addLegalFPImmediate(TmpFlt2); // FLD1
501 TmpFlt2.changeSign();
502 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
506 setOperationAction(ISD::FSIN , MVT::f80 , Expand);
507 setOperationAction(ISD::FCOS , MVT::f80 , Expand);
511 // Always use a library call for pow.
512 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
513 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
514 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
516 setOperationAction(ISD::FLOG, MVT::f80, Expand);
517 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
518 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
519 setOperationAction(ISD::FEXP, MVT::f80, Expand);
520 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
522 // First set operation action for all vector types to either promote
523 // (for widening) or expand (for scalarization). Then we will selectively
524 // turn on ones that can be effectively codegen'd.
525 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
526 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
527 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
528 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
529 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
530 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
531 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
532 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
533 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
534 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
535 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
536 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
537 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
538 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
539 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
540 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
541 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
542 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
543 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
544 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
545 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
546 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
547 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
548 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
549 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
550 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
551 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
552 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
553 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
554 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
555 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
556 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
557 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
558 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
559 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
560 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
561 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
562 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
563 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
564 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
565 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
566 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
567 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
568 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
569 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
570 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
571 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
572 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
573 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
574 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
577 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
578 // with -msoft-float, disable use of MMX as well.
579 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
580 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
581 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
582 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
583 addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
584 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
586 setOperationAction(ISD::ADD, MVT::v8i8, Legal);
587 setOperationAction(ISD::ADD, MVT::v4i16, Legal);
588 setOperationAction(ISD::ADD, MVT::v2i32, Legal);
589 setOperationAction(ISD::ADD, MVT::v1i64, Legal);
591 setOperationAction(ISD::SUB, MVT::v8i8, Legal);
592 setOperationAction(ISD::SUB, MVT::v4i16, Legal);
593 setOperationAction(ISD::SUB, MVT::v2i32, Legal);
594 setOperationAction(ISD::SUB, MVT::v1i64, Legal);
596 setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
597 setOperationAction(ISD::MUL, MVT::v4i16, Legal);
599 setOperationAction(ISD::AND, MVT::v8i8, Promote);
600 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
601 setOperationAction(ISD::AND, MVT::v4i16, Promote);
602 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
603 setOperationAction(ISD::AND, MVT::v2i32, Promote);
604 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
605 setOperationAction(ISD::AND, MVT::v1i64, Legal);
607 setOperationAction(ISD::OR, MVT::v8i8, Promote);
608 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
609 setOperationAction(ISD::OR, MVT::v4i16, Promote);
610 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
611 setOperationAction(ISD::OR, MVT::v2i32, Promote);
612 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
613 setOperationAction(ISD::OR, MVT::v1i64, Legal);
615 setOperationAction(ISD::XOR, MVT::v8i8, Promote);
616 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
617 setOperationAction(ISD::XOR, MVT::v4i16, Promote);
618 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
619 setOperationAction(ISD::XOR, MVT::v2i32, Promote);
620 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
621 setOperationAction(ISD::XOR, MVT::v1i64, Legal);
623 setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
624 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
625 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
626 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
627 setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
628 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
629 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
630 AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
631 setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
633 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
634 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
635 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
636 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
637 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
639 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
640 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
641 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
642 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
644 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
645 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
646 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
647 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
649 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
651 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
652 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
653 setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
654 setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
655 setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
656 setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
657 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
658 setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
659 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
662 if (!UseSoftFloat && Subtarget->hasSSE1()) {
663 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
665 setOperationAction(ISD::FADD, MVT::v4f32, Legal);
666 setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
667 setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
668 setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
669 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
670 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
671 setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
672 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
673 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
674 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
675 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
676 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
679 if (!UseSoftFloat && Subtarget->hasSSE2()) {
680 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
682 // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
683 // registers cannot be used even for integer operations.
684 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
685 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
686 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
687 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
689 setOperationAction(ISD::ADD, MVT::v16i8, Legal);
690 setOperationAction(ISD::ADD, MVT::v8i16, Legal);
691 setOperationAction(ISD::ADD, MVT::v4i32, Legal);
692 setOperationAction(ISD::ADD, MVT::v2i64, Legal);
693 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
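// SSE2 has no 64-bit element multiply; the custom lowering builds v2i64 MUL
// from PMULUDQ plus shifts and adds.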
694 setOperationAction(ISD::SUB, MVT::v16i8, Legal);
695 setOperationAction(ISD::SUB, MVT::v8i16, Legal);
696 setOperationAction(ISD::SUB, MVT::v4i32, Legal);
697 setOperationAction(ISD::SUB, MVT::v2i64, Legal);
698 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
699 setOperationAction(ISD::FADD, MVT::v2f64, Legal);
700 setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
701 setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
702 setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
703 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
704 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
706 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
707 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
708 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
709 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
711 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
712 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
713 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
714 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
715 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
717 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
718 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
719 EVT VT = (MVT::SimpleValueType)i;
720 // Do not attempt to custom lower non-power-of-2 vectors
721 if (!isPowerOf2_32(VT.getVectorNumElements()))
723 // Do not attempt to custom lower non-128-bit vectors
724 if (!VT.is128BitVector())
726 setOperationAction(ISD::BUILD_VECTOR,
727 VT.getSimpleVT().SimpleTy, Custom);
728 setOperationAction(ISD::VECTOR_SHUFFLE,
729 VT.getSimpleVT().SimpleTy, Custom);
730 setOperationAction(ISD::EXTRACT_VECTOR_ELT,
731 VT.getSimpleVT().SimpleTy, Custom);
734 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
735 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
736 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
737 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
738 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
739 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
741 if (Subtarget->is64Bit()) {
742 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
743 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
746 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
747 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
748 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
751 // Do not attempt to promote non-128-bit vectors
752 if (!VT.is128BitVector()) {
755 setOperationAction(ISD::AND, SVT, Promote);
756 AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
757 setOperationAction(ISD::OR, SVT, Promote);
758 AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
759 setOperationAction(ISD::XOR, SVT, Promote);
760 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
761 setOperationAction(ISD::LOAD, SVT, Promote);
762 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
763 setOperationAction(ISD::SELECT, SVT, Promote);
764 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
767 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
769 // Custom lower v2i64 and v2f64 selects.
770 setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
771 setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
772 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
773 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
775 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
776 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
777 if (!DisableMMX && Subtarget->hasMMX()) {
778 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
779 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
783 if (Subtarget->hasSSE41()) {
784 // FIXME: Do we need to handle scalar-to-vector here?
785 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
787 // i8 and i16 vectors are custom, because the source register and source
788 // memory operand types are not the same width. f32 vectors are
789 // custom since the immediate controlling the insert encodes additional information.
791 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
792 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
793 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
794 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
796 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
797 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
798 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
799 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
801 if (Subtarget->is64Bit()) {
802 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
803 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
807 if (Subtarget->hasSSE42()) {
808 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
811 if (!UseSoftFloat && Subtarget->hasAVX()) {
812 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
813 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
814 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
815 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
817 setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
818 setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
819 setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
820 setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
821 setOperationAction(ISD::FADD, MVT::v8f32, Legal);
822 setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
823 setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
824 setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
825 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
826 setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
827 //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
828 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
829 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
830 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
831 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);
833 // Operations to consider commented out: v16i16, v32i8
834 //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
835 setOperationAction(ISD::ADD, MVT::v8i32, Custom);
836 setOperationAction(ISD::ADD, MVT::v4i64, Custom);
837 //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
838 //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
839 setOperationAction(ISD::SUB, MVT::v8i32, Custom);
840 setOperationAction(ISD::SUB, MVT::v4i64, Custom);
841 //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
842 setOperationAction(ISD::FADD, MVT::v4f64, Legal);
843 setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
844 setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
845 setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
846 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
847 setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
849 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
850 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
851 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
852 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);
854 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
855 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
856 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
857 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
858 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);
860 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
861 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
862 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
863 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
864 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
865 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
868 // Not sure we want to do this since there are no 256-bit integer operations in AVX.
871 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
872 // This includes 256-bit vectors
873 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
874 EVT VT = (MVT::SimpleValueType)i;
876 // Do not attempt to custom lower non-power-of-2 vectors
877 if (!isPowerOf2_32(VT.getVectorNumElements()))
880 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
881 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
882 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
885 if (Subtarget->is64Bit()) {
886 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
887 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
892 // Not sure we want to do this since there are no 256-bit integer operations in AVX.
895 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
896 // Including 256-bit vectors
897 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
898 EVT VT = (MVT::SimpleValueType)i;
900 if (!VT.is256BitVector()) {
903 setOperationAction(ISD::AND, VT, Promote);
904 AddPromotedToType (ISD::AND, VT, MVT::v4i64);
905 setOperationAction(ISD::OR, VT, Promote);
906 AddPromotedToType (ISD::OR, VT, MVT::v4i64);
907 setOperationAction(ISD::XOR, VT, Promote);
908 AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
909 setOperationAction(ISD::LOAD, VT, Promote);
910 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
911 setOperationAction(ISD::SELECT, VT, Promote);
912 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
915 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
919 // We want to custom lower some of our intrinsics.
920 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
922 // Add/Sub/Mul with overflow operations are custom lowered.
923 setOperationAction(ISD::SADDO, MVT::i32, Custom);
924 setOperationAction(ISD::SADDO, MVT::i64, Custom);
925 setOperationAction(ISD::UADDO, MVT::i32, Custom);
926 setOperationAction(ISD::UADDO, MVT::i64, Custom);
927 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
928 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
929 setOperationAction(ISD::USUBO, MVT::i32, Custom);
930 setOperationAction(ISD::USUBO, MVT::i64, Custom);
931 setOperationAction(ISD::SMULO, MVT::i32, Custom);
932 setOperationAction(ISD::SMULO, MVT::i64, Custom);
934 if (!Subtarget->is64Bit()) {
935 // These libcalls are not available in 32-bit.
936 setLibcallName(RTLIB::SHL_I128, 0);
937 setLibcallName(RTLIB::SRL_I128, 0);
938 setLibcallName(RTLIB::SRA_I128, 0);
941 // We have target-specific dag combine patterns for the following nodes:
942 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
943 setTargetDAGCombine(ISD::BUILD_VECTOR);
944 setTargetDAGCombine(ISD::SELECT);
945 setTargetDAGCombine(ISD::SHL);
946 setTargetDAGCombine(ISD::SRA);
947 setTargetDAGCombine(ISD::SRL);
948 setTargetDAGCombine(ISD::STORE);
949 setTargetDAGCombine(ISD::MEMBARRIER);
950 if (Subtarget->is64Bit())
951 setTargetDAGCombine(ISD::MUL);
953 computeRegisterProperties();
955 // FIXME: These should be based on subtarget info. Plus, the values should
956 // be smaller when we are optimizing for size.
957 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
958 maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
959 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
960 allowUnalignedMemoryAccesses = true; // x86 supports it!
961 setPrefLoopAlignment(16);
962 benefitFromCodePlacementOpt = true;
966 MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
971 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
972 /// the desired ByVal argument alignment.
973 static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
976 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
977 if (VTy->getBitWidth() == 128)
979 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
980 unsigned EltAlign = 0;
981 getMaxByValAlign(ATy->getElementType(), EltAlign);
982 if (EltAlign > MaxAlign)
984 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
985 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
986 unsigned EltAlign = 0;
987 getMaxByValAlign(STy->getElementType(i), EltAlign);
988 if (EltAlign > MaxAlign)
997 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
998 /// function arguments in the caller parameter area. For X86, aggregates
999 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1000 /// are at 4-byte boundaries.
1001 unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1002 if (Subtarget->is64Bit()) {
1003 // Max of 8 and alignment of type.
1004 unsigned TyAlign = TD->getABITypeAlignment(Ty);
1011 if (Subtarget->hasSSE1())
1012 getMaxByValAlign(Ty, Align);
1016 /// getOptimalMemOpType - Returns the target specific optimal type for load
1017 /// and store operations as a result of memset, memcpy, and memmove
1018 /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for determining it.
1021 X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
1022 bool isSrcConst, bool isSrcStr,
1023 SelectionDAG &DAG) const {
1024 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1025 // linux. This is because the stack realignment code can't handle certain
1026 // cases like PR2962. This should be removed when PR2962 is fixed.
1027 const Function *F = DAG.getMachineFunction().getFunction();
1028 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
1029 if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
1030 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
1032 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
1035 if (Subtarget->is64Bit() && Size >= 8)
1040 /// getPICJumpTableRelocBase - Returns relocation base for the given PIC jumptable.
1042 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1043 SelectionDAG &DAG) const {
1044 if (usesGlobalOffsetTable())
1045 return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
1046 if (!Subtarget->is64Bit())
1047 // This doesn't have DebugLoc associated with it, but is not really the
1048 // same as a Register.
1049 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
1054 /// getFunctionAlignment - Return the Log2 alignment of this function.
1055 unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
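// Log2 values: 1 (2-byte alignment) when optimizing for size, 4 (16 bytes) otherwise.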
1056 return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 4;
1059 //===----------------------------------------------------------------------===//
1060 // Return Value Calling Convention Implementation
1061 //===----------------------------------------------------------------------===//
1063 #include "X86GenCallingConv.inc"
1066 X86TargetLowering::LowerReturn(SDValue Chain,
1067 unsigned CallConv, bool isVarArg,
1068 const SmallVectorImpl<ISD::OutputArg> &Outs,
1069 DebugLoc dl, SelectionDAG &DAG) {
1071 SmallVector<CCValAssign, 16> RVLocs;
1072 CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1073 RVLocs, *DAG.getContext());
1074 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1076 // If this is the first return lowered for this function, add the regs to the
1077 // liveout set for the function.
1078 if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1079 for (unsigned i = 0; i != RVLocs.size(); ++i)
1080 if (RVLocs[i].isRegLoc())
1081 DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1086 SmallVector<SDValue, 6> RetOps;
1087 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1088 // Operand #1 = Bytes To Pop
1089 RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
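// This byte count becomes the immediate operand of the RET instruction; it is
// non-zero for callee-pop conventions such as stdcall.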
1091 // Copy the result values into the output registers.
1092 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1093 CCValAssign &VA = RVLocs[i];
1094 assert(VA.isRegLoc() && "Can only return in registers!");
1095 SDValue ValToCopy = Outs[i].Val;
1097 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1098 // the RET instruction and handled by the FP Stackifier.
1099 if (VA.getLocReg() == X86::ST0 ||
1100 VA.getLocReg() == X86::ST1) {
1101 // If this is a copy from an xmm register to ST(0), use an FPExtend to
1102 // change the value to the FP stack register class.
1103 if (isScalarFPTypeInSSEReg(VA.getValVT()))
1104 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1105 RetOps.push_back(ValToCopy);
1106 // Don't emit a copytoreg.
1110 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1111 // which is returned in RAX / RDX.
1112 if (Subtarget->is64Bit()) {
1113 EVT ValVT = ValToCopy.getValueType();
1114 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1115 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1116 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1117 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1121 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1122 Flag = Chain.getValue(1);
1125 // The x86-64 ABI for returning structs by value requires that we copy
1126 // the sret argument into %rax for the return. We saved the argument into
1127 // a virtual register in the entry block, so now we copy the value out and into %rax.
1129 if (Subtarget->is64Bit() &&
1130 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1131 MachineFunction &MF = DAG.getMachineFunction();
1132 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1133 unsigned Reg = FuncInfo->getSRetReturnReg();
1135 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1136 FuncInfo->setSRetReturnReg(Reg);
1138 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1140 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1141 Flag = Chain.getValue(1);
1144 RetOps[0] = Chain; // Update chain.
1146 // Add the flag if we have it.
1148 RetOps.push_back(Flag);
1150 return DAG.getNode(X86ISD::RET_FLAG, dl,
1151 MVT::Other, &RetOps[0], RetOps.size());
1154 /// LowerCallResult - Lower the result values of a call into the
1155 /// appropriate copies out of appropriate physical registers.
1158 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1159 unsigned CallConv, bool isVarArg,
1160 const SmallVectorImpl<ISD::InputArg> &Ins,
1161 DebugLoc dl, SelectionDAG &DAG,
1162 SmallVectorImpl<SDValue> &InVals) {
1164 // Assign locations to each value returned by this call.
1165 SmallVector<CCValAssign, 16> RVLocs;
1166 bool Is64Bit = Subtarget->is64Bit();
1167 CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1168 RVLocs, *DAG.getContext());
1169 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1171 // Copy all of the result registers out of their specified physreg.
1172 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1173 CCValAssign &VA = RVLocs[i];
1174 EVT CopyVT = VA.getValVT();
1176 // If this is x86-64, and we disabled SSE, we can't return FP values
1177 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1178 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1179 llvm_report_error("SSE register return with SSE disabled");
1182 // If this is a call to a function that returns an fp value on the floating
1183 // point stack, but where we prefer to use the value in xmm registers, copy
1184 // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1185 if ((VA.getLocReg() == X86::ST0 ||
1186 VA.getLocReg() == X86::ST1) &&
1187 isScalarFPTypeInSSEReg(VA.getValVT())) {
1192 if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1193 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1194 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1195 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1196 MVT::v2i64, InFlag).getValue(1);
1197 Val = Chain.getValue(0);
1198 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1199 Val, DAG.getConstant(0, MVT::i64));
1201 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1202 MVT::i64, InFlag).getValue(1);
1203 Val = Chain.getValue(0);
1205 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1207 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1208 CopyVT, InFlag).getValue(1);
1209 Val = Chain.getValue(0);
1211 InFlag = Chain.getValue(2);
1213 if (CopyVT != VA.getValVT()) {
1214 // Round the F80 to the right size, which also moves it to the appropriate xmm register.
1216 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1217 // This truncation won't change the value.
1218 DAG.getIntPtrConstant(1));
1221 InVals.push_back(Val);
1228 //===----------------------------------------------------------------------===//
1229 // C & StdCall & Fast Calling Convention implementation
1230 //===----------------------------------------------------------------------===//
1231 // The StdCall calling convention is the standard for many Windows API
1232 // routines. It differs from the C calling convention only slightly: the
1233 // callee cleans up the stack rather than the caller, and symbols are
1234 // decorated in a particular way. It doesn't support any vector arguments.
1235 // For info on fast calling convention see Fast Calling Convention (tail call)
1236 // implementation LowerX86_32FastCCCallTo.
1238 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
1240 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1244 return Outs[0].Flags.isSRet();
1247 /// ArgsAreStructReturn - Determines whether a function uses struct
1248 /// return semantics.
1250 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1254 return Ins[0].Flags.isSRet();
1257 /// IsCalleePop - Determines whether the callee is required to pop its
1258 /// own arguments. Callee pop is necessary to support tail calls.
1259 bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
1263 switch (CallingConv) {
1266 case CallingConv::X86_StdCall:
1267 return !Subtarget->is64Bit();
1268 case CallingConv::X86_FastCall:
1269 return !Subtarget->is64Bit();
1270 case CallingConv::Fast:
1271 return PerformTailCallOpt;
1275 /// CCAssignFnForNode - Selects the correct CCAssignFn for the
1276 /// given CallingConvention value.
1277 CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
1278 if (Subtarget->is64Bit()) {
1279 if (Subtarget->isTargetWin64())
1280 return CC_X86_Win64_C;
1285 if (CC == CallingConv::X86_FastCall)
1286 return CC_X86_32_FastCall;
1287 else if (CC == CallingConv::Fast)
1288 return CC_X86_32_FastCC;
1293 /// NameDecorationForCallConv - Selects the appropriate decoration to
1294 /// apply to a MachineFunction containing a given calling convention.
1296 X86TargetLowering::NameDecorationForCallConv(unsigned CallConv) {
1297 if (CallConv == CallingConv::X86_FastCall)
1299 else if (CallConv == CallingConv::X86_StdCall)
1305 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1306 /// by "Src" to address "Dst" with size and alignment information specified by
1307 /// the specific parameter attribute. The copy will be passed as a byval
1308 /// function parameter.
1310 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1311 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1313 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1314 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1315 /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1319 X86TargetLowering::LowerMemArgument(SDValue Chain,
1321 const SmallVectorImpl<ISD::InputArg> &Ins,
1322 DebugLoc dl, SelectionDAG &DAG,
1323 const CCValAssign &VA,
1324 MachineFrameInfo *MFI,
1327 // Create the nodes corresponding to a load from this parameter slot.
1328 ISD::ArgFlagsTy Flags = Ins[i].Flags;
1329 bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
1330 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1333 // If the value is passed by pointer, we have the address passed instead of the value itself.
1335 if (VA.getLocInfo() == CCValAssign::Indirect)
1336 ValVT = VA.getLocVT();
1338 ValVT = VA.getValVT();
1340 // FIXME: For now, all byval parameter objects are marked mutable. This can be
1341 // changed with more analysis.
1342 // In the case of tail call optimization, mark all arguments mutable, since
1343 // they could be overwritten by the lowering of arguments in case of a tail call.
1344 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1345 VA.getLocMemOffset(), isImmutable);
1346 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1347 if (Flags.isByVal())
1349 return DAG.getLoad(ValVT, dl, Chain, FIN,
1350 PseudoSourceValue::getFixedStack(FI), 0);
1354 X86TargetLowering::LowerFormalArguments(SDValue Chain,
1357 const SmallVectorImpl<ISD::InputArg> &Ins,
1360 SmallVectorImpl<SDValue> &InVals) {
1362 MachineFunction &MF = DAG.getMachineFunction();
1363 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1365 const Function* Fn = MF.getFunction();
1366 if (Fn->hasExternalLinkage() &&
1367 Subtarget->isTargetCygMing() &&
1368 Fn->getName() == "main")
1369 FuncInfo->setForceFramePointer(true);
1371 // Decorate the function name.
1372 FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
1374 MachineFrameInfo *MFI = MF.getFrameInfo();
1375 bool Is64Bit = Subtarget->is64Bit();
1376 bool IsWin64 = Subtarget->isTargetWin64();
1378 assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1379 "Var args not supported with calling convention fastcc");
1381 // Assign locations to all of the incoming arguments.
1382 SmallVector<CCValAssign, 16> ArgLocs;
1383 CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1384 ArgLocs, *DAG.getContext());
1385 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1387 unsigned LastVal = ~0U;
1389 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1390 CCValAssign &VA = ArgLocs[i];
1391 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later places.
1393 assert(VA.getValNo() != LastVal &&
1394 "Don't support value assigned to multiple locs yet");
1395 LastVal = VA.getValNo();
1397 if (VA.isRegLoc()) {
1398 EVT RegVT = VA.getLocVT();
1399 TargetRegisterClass *RC = NULL;
1400 if (RegVT == MVT::i32)
1401 RC = X86::GR32RegisterClass;
1402 else if (Is64Bit && RegVT == MVT::i64)
1403 RC = X86::GR64RegisterClass;
1404 else if (RegVT == MVT::f32)
1405 RC = X86::FR32RegisterClass;
1406 else if (RegVT == MVT::f64)
1407 RC = X86::FR64RegisterClass;
1408 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1409 RC = X86::VR128RegisterClass;
1410 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1411 RC = X86::VR64RegisterClass;
1413 llvm_unreachable("Unknown argument type!");
1415 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1416 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1418 // If this is an 8 or 16-bit value, it is really passed promoted to 32
1419 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
1421 if (VA.getLocInfo() == CCValAssign::SExt)
1422 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1423 DAG.getValueType(VA.getValVT()));
1424 else if (VA.getLocInfo() == CCValAssign::ZExt)
1425 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1426 DAG.getValueType(VA.getValVT()));
1427 else if (VA.getLocInfo() == CCValAssign::BCvt)
1428 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1430 if (VA.isExtInLoc()) {
1431 // Handle MMX values passed in XMM regs.
1432 if (RegVT.isVector()) {
1433 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1434 ArgValue, DAG.getConstant(0, MVT::i64));
1435 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1437 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1440 assert(VA.isMemLoc());
1441 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1444 // If value is passed via pointer - do a load.
1445 if (VA.getLocInfo() == CCValAssign::Indirect)
1446 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
1448 InVals.push_back(ArgValue);
1451 // The x86-64 ABI for returning structs by value requires that we copy
1452 // the sret argument into %rax for the return. Save the argument into
1453 // a virtual register so that we can access it from the return points.
1454 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1455 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1456 unsigned Reg = FuncInfo->getSRetReturnReg();
1458 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1459 FuncInfo->setSRetReturnReg(Reg);
1461 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1462 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1465 unsigned StackSize = CCInfo.getNextStackOffset();
1466 // align stack specially for tail calls
1467 if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1468 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1470 // If the function takes a variable number of arguments, make a frame index for
1471 // the start of the first vararg value... for expansion of llvm.va_start.
1473 if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1474 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
1477 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1479 // FIXME: We should really autogenerate these arrays
1480 static const unsigned GPR64ArgRegsWin64[] = {
1481 X86::RCX, X86::RDX, X86::R8, X86::R9
1483 static const unsigned XMMArgRegsWin64[] = {
1484 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1486 static const unsigned GPR64ArgRegs64Bit[] = {
1487 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1489 static const unsigned XMMArgRegs64Bit[] = {
1490 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1491 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1493 const unsigned *GPR64ArgRegs, *XMMArgRegs;
1496 TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1497 GPR64ArgRegs = GPR64ArgRegsWin64;
1498 XMMArgRegs = XMMArgRegsWin64;
1500 TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1501 GPR64ArgRegs = GPR64ArgRegs64Bit;
1502 XMMArgRegs = XMMArgRegs64Bit;
1504 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1506 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1509 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1510 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1511 "SSE register cannot be used when SSE is disabled!");
1512 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1513 "SSE register cannot be used when SSE is disabled!");
1514 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1515 // Kernel mode asks for SSE to be disabled, so don't push them on the stack.
1517 TotalNumXMMRegs = 0;
1519 // For X86-64, if there are vararg parameters that are passed via
1520 // registers, then we must store them to their spots on the stack so they
1521 // may be loaded by dereferencing the result of va_next.
1522 VarArgsGPOffset = NumIntRegs * 8;
1523 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1524 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1525 TotalNumXMMRegs * 16, 16);
1527 // Store the integer parameter registers.
1528 SmallVector<SDValue, 8> MemOps;
1529 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1530 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1531 DAG.getIntPtrConstant(VarArgsGPOffset));
1532 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1533 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1534 X86::GR64RegisterClass);
1535 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1537 DAG.getStore(Val.getValue(1), dl, Val, FIN,
1538 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1539 MemOps.push_back(Store);
1540 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1541 DAG.getIntPtrConstant(8));
1544 // Now store the XMM (fp + vector) parameter registers.
1545 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1546 DAG.getIntPtrConstant(VarArgsFPOffset));
1547 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1548 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1549 X86::VR128RegisterClass);
1550 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1552 DAG.getStore(Val.getValue(1), dl, Val, FIN,
1553 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
1554 MemOps.push_back(Store);
1555 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
1556 DAG.getIntPtrConstant(16));
1558 if (!MemOps.empty())
1559 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1560 &MemOps[0], MemOps.size());
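// Illustrative note (not part of the original source): for a non-Win64 x86-64
// vararg function, the 176-byte register save area built above is laid out as
//   [RegSaveFrameIndex +  0 ..  47] : RDI, RSI, RDX, RCX, R8, R9  (6 x 8 bytes)
//   [RegSaveFrameIndex + 48 .. 175] : XMM0 .. XMM7                (8 x 16 bytes)
// VarArgsGPOffset (NumIntRegs * 8) points at the first unused GP slot and
// VarArgsFPOffset (TotalNumIntRegs * 8 + NumXMMRegs * 16) at the first unused
// XMM slot, matching the gp_offset/fp_offset fields that va_start initializes.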
1564 // Some CCs need callee pop.
1565 if (IsCalleePop(isVarArg, CallConv)) {
1566 BytesToPopOnReturn = StackSize; // Callee pops everything.
1567 BytesCallerReserves = 0;
1569 BytesToPopOnReturn = 0; // Callee pops nothing.
1570 // If this is an sret function, the return should pop the hidden pointer.
1571 if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
1572 BytesToPopOnReturn = 4;
1573 BytesCallerReserves = StackSize;
1577 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
1578 if (CallConv == CallingConv::X86_FastCall)
1579 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
1582 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1588 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1589 SDValue StackPtr, SDValue Arg,
1590 DebugLoc dl, SelectionDAG &DAG,
1591 const CCValAssign &VA,
1592 ISD::ArgFlagsTy Flags) {
1593 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1594 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1595 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1596 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1597 if (Flags.isByVal()) {
1598 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1600 return DAG.getStore(Chain, dl, Arg, PtrOff,
1601 PseudoSourceValue::getStack(), LocMemOffset);
1604 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1605 /// optimization is performed and it is required.
1607 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1608 SDValue &OutRetAddr,
1614 if (!IsTailCall || FPDiff==0) return Chain;
1616 // Adjust the Return address stack slot.
1617 EVT VT = getPointerTy();
1618 OutRetAddr = getReturnAddressFrameIndex(DAG);
1620 // Load the "old" Return address.
1621 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1622 return SDValue(OutRetAddr.getNode(), 1);
1625 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1626 /// optimization is performed and it is required (FPDiff!=0).
1628 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1629 SDValue Chain, SDValue RetAddrFrIdx,
1630 bool Is64Bit, int FPDiff, DebugLoc dl) {
1631 // Store the return address to the appropriate stack slot.
1632 if (!FPDiff) return Chain;
1633 // Calculate the new stack slot for the return address.
1634 int SlotSize = Is64Bit ? 8 : 4;
1635 int NewReturnAddrFI =
1636 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1637 EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1638 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1639 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1640 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1645 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1646 unsigned CallConv, bool isVarArg, bool isTailCall,
1647 const SmallVectorImpl<ISD::OutputArg> &Outs,
1648 const SmallVectorImpl<ISD::InputArg> &Ins,
1649 DebugLoc dl, SelectionDAG &DAG,
1650 SmallVectorImpl<SDValue> &InVals) {
1652 MachineFunction &MF = DAG.getMachineFunction();
1653 bool Is64Bit = Subtarget->is64Bit();
1654 bool IsStructRet = CallIsStructReturn(Outs);
1656 assert((!isTailCall ||
1657 (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
1658 "IsEligibleForTailCallOptimization missed a case!");
1659 assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1660 "Var args not supported with calling convention fastcc");
1662 // Analyze operands of the call, assigning locations to each operand.
1663 SmallVector<CCValAssign, 16> ArgLocs;
1664 CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1665 ArgLocs, *DAG.getContext());
1666 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1668 // Get a count of how many bytes are to be pushed on the stack.
1669 unsigned NumBytes = CCInfo.getNextStackOffset();
1670 if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1671 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1675 // Lower arguments at fp - stackoffset + fpdiff.
1676 unsigned NumBytesCallerPushed =
1677 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1678 FPDiff = NumBytesCallerPushed - NumBytes;
1680 // Set the delta of movement of the returnaddr stackslot.
1681 // But only update it if this call moves the slot further than before
1681 // (i.e. FPDiff is smaller than the previously recorded delta).
1682 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1683 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1686 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1688 SDValue RetAddrFrIdx;
1689 // Load the return address for tail calls.
1690 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
1693 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1694 SmallVector<SDValue, 8> MemOpChains;
1697 // Walk the register/memloc assignments, inserting copies/loads. In the case
1698 // of tail call optimization, arguments are handled later.
1699 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1700 CCValAssign &VA = ArgLocs[i];
1701 EVT RegVT = VA.getLocVT();
1702 SDValue Arg = Outs[i].Val;
1703 ISD::ArgFlagsTy Flags = Outs[i].Flags;
1704 bool isByVal = Flags.isByVal();
1706 // Promote the value if needed.
1707 switch (VA.getLocInfo()) {
1708 default: llvm_unreachable("Unknown loc info!");
1709 case CCValAssign::Full: break;
1710 case CCValAssign::SExt:
1711 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1713 case CCValAssign::ZExt:
1714 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1716 case CCValAssign::AExt:
1717 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1718 // Special case: passing MMX values in XMM registers.
1719 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1720 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1721 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1723 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1725 case CCValAssign::BCvt:
1726 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1728 case CCValAssign::Indirect: {
1729 // Store the argument.
1730 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1731 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1732 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1733 PseudoSourceValue::getFixedStack(FI), 0);
1739 if (VA.isRegLoc()) {
1740 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1742 if (!isTailCall || (isTailCall && isByVal)) {
1743 assert(VA.isMemLoc());
1744 if (StackPtr.getNode() == 0)
1745 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1747 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1748 dl, DAG, VA, Flags));
1753 if (!MemOpChains.empty())
1754 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1755 &MemOpChains[0], MemOpChains.size());
1757 // Build a sequence of copy-to-reg nodes chained together with token chain
1758 // and flag operands which copy the outgoing args into registers.
1760 // Tail call byval lowering might overwrite argument registers so in case of
1761 // tail call optimization the copies to registers are lowered later.
1763 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1764 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1765 RegsToPass[i].second, InFlag);
1766 InFlag = Chain.getValue(1);
1770 if (Subtarget->isPICStyleGOT()) {
1771 // ELF / PIC requires GOT in the EBX register before function calls via PLT
1774 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1775 DAG.getNode(X86ISD::GlobalBaseReg,
1776 DebugLoc::getUnknownLoc(),
1779 InFlag = Chain.getValue(1);
1781 // If we are tail calling and generating PIC/GOT style code, load the
1782 // address of the callee into a register (EAX below). That value is used as the target of
1783 // the tail jump. This is done to circumvent the ebx/callee-saved problem
1784 // for tail calls on PIC/GOT architectures. Normally we would just put the
1785 // address of GOT into ebx and then call target@PLT. But for tail calls
1786 // ebx would be restored (since ebx is callee saved) before jumping to the target@PLT.
1789 // Note: The actual move into that register is done further down.
1790 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1791 if (G && !G->getGlobal()->hasHiddenVisibility() &&
1792 !G->getGlobal()->hasProtectedVisibility())
1793 Callee = LowerGlobalAddress(Callee, DAG);
1794 else if (isa<ExternalSymbolSDNode>(Callee))
1795 Callee = LowerExternalSymbol(Callee, DAG);
1799 if (Is64Bit && isVarArg) {
1800 // From AMD64 ABI document:
1801 // For calls that may call functions that use varargs or stdargs
1802 // (prototype-less calls or calls to functions containing ellipsis (...) in
1803 // the declaration) %al is used as hidden argument to specify the number
1804 // of SSE registers used. The contents of %al do not need to match exactly
1805 // the number of registers, but must be an upper bound on the number of SSE
1806 // registers used and is in the range 0 - 8 inclusive.
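// Illustration (not in the original comment): for a call such as
//   printf("%f %f\n", x, y);   // two double arguments passed in XMM0/XMM1
// the copy-to-reg below would place the constant 2 in AL right before the
// call; a vararg callee can test AL and skip its XMM register save code
// entirely when AL == 0.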
1808 // FIXME: Verify this on Win64
1809 // Count the number of XMM registers allocated.
1810 static const unsigned XMMArgRegs[] = {
1811 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1812 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1814 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1815 assert((Subtarget->hasSSE1() || !NumXMMRegs)
1816 && "SSE registers cannot be used when SSE is disabled");
1818 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1819 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1820 InFlag = Chain.getValue(1);
1824 // For tail calls lower the arguments to the 'real' stack slot.
1826 // Force all the incoming stack arguments to be loaded from the stack
1827 // before any new outgoing arguments are stored to the stack, because the
1828 // outgoing stack slots may alias the incoming argument stack slots, and
1829 // the alias isn't otherwise explicit. This is slightly more conservative
1830 // than necessary, because it means that each store effectively depends
1831 // on every argument instead of just those arguments it would clobber.
1832 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1834 SmallVector<SDValue, 8> MemOpChains2;
1837 // Do not flag preceding copytoreg stuff together with the following stuff.
1839 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1840 CCValAssign &VA = ArgLocs[i];
1841 if (!VA.isRegLoc()) {
1842 assert(VA.isMemLoc());
1843 SDValue Arg = Outs[i].Val;
1844 ISD::ArgFlagsTy Flags = Outs[i].Flags;
1845 // Create frame index.
1846 int32_t Offset = VA.getLocMemOffset()+FPDiff;
1847 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1848 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1849 FIN = DAG.getFrameIndex(FI, getPointerTy());
1851 if (Flags.isByVal()) {
1852 // Copy relative to framepointer.
1853 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1854 if (StackPtr.getNode() == 0)
1855 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1857 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1859 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1863 // Store relative to framepointer.
1864 MemOpChains2.push_back(
1865 DAG.getStore(ArgChain, dl, Arg, FIN,
1866 PseudoSourceValue::getFixedStack(FI), 0));
1871 if (!MemOpChains2.empty())
1872 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1873 &MemOpChains2[0], MemOpChains2.size());
1875 // Copy arguments to their registers.
1876 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1877 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1878 RegsToPass[i].second, InFlag);
1879 InFlag = Chain.getValue(1);
1883 // Store the return address to the appropriate stack slot.
1884 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1888 // If the callee is a GlobalAddress node (quite common, every direct call is)
1889 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1890 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1891 // We should use extra load for direct calls to dllimported functions in
1893 GlobalValue *GV = G->getGlobal();
1894 if (!GV->hasDLLImportLinkage()) {
1895 unsigned char OpFlags = 0;
1897 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
1898 // external symbols must go through the PLT in PIC mode. If the symbol
1899 // has hidden or protected visibility, or if it is static or local, then
1900 // we don't need to use the PLT - we can directly call it.
1901 if (Subtarget->isTargetELF() &&
1902 getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1903 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
1904 OpFlags = X86II::MO_PLT;
1905 } else if (Subtarget->isPICStyleStubAny() &&
1906 (GV->isDeclaration() || GV->isWeakForLinker()) &&
1907 Subtarget->getDarwinVers() < 9) {
1908 // PC-relative references to external symbols should go through $stub,
1909 // unless we're building with the leopard linker or later, which
1910 // automatically synthesizes these stubs.
1911 OpFlags = X86II::MO_DARWIN_STUB;
1914 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
1915 G->getOffset(), OpFlags);
1917 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1918 unsigned char OpFlags = 0;
1920 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
1921 // symbols should go through the PLT.
1922 if (Subtarget->isTargetELF() &&
1923 getTargetMachine().getRelocationModel() == Reloc::PIC_) {
1924 OpFlags = X86II::MO_PLT;
1925 } else if (Subtarget->isPICStyleStubAny() &&
1926 Subtarget->getDarwinVers() < 9) {
1927 // PC-relative references to external symbols should go through $stub,
1928 // unless we're building with the leopard linker or later, which
1929 // automatically synthesizes these stubs.
1930 OpFlags = X86II::MO_DARWIN_STUB;
1933 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
1935 } else if (isTailCall) {
1936 unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
1938 Chain = DAG.getCopyToReg(Chain, dl,
1939 DAG.getRegister(Opc, getPointerTy()),
1941 Callee = DAG.getRegister(Opc, getPointerTy());
1942 // Add register as live out.
1943 MF.getRegInfo().addLiveOut(Opc);
1946 // Returns a chain & a flag for retval copy to use.
1947 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1948 SmallVector<SDValue, 8> Ops;
1951 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1952 DAG.getIntPtrConstant(0, true), InFlag);
1953 InFlag = Chain.getValue(1);
1956 Ops.push_back(Chain);
1957 Ops.push_back(Callee);
1960 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1962 // Add argument registers to the end of the list so that they are known live into the call.
1964 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1965 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1966 RegsToPass[i].second.getValueType()));
1968 // Add an implicit use GOT pointer in EBX.
1969 if (!isTailCall && Subtarget->isPICStyleGOT())
1970 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
1972 // Add an implicit use of AL for x86 vararg functions.
1973 if (Is64Bit && isVarArg)
1974 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
1976 if (InFlag.getNode())
1977 Ops.push_back(InFlag);
1980 // If this is the first return lowered for this function, add the regs
1981 // to the liveout set for the function.
1982 if (MF.getRegInfo().liveout_empty()) {
1983 SmallVector<CCValAssign, 16> RVLocs;
1984 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
1986 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1987 for (unsigned i = 0; i != RVLocs.size(); ++i)
1988 if (RVLocs[i].isRegLoc())
1989 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1992 assert(((Callee.getOpcode() == ISD::Register &&
1993 (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
1994 cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
1995 Callee.getOpcode() == ISD::TargetExternalSymbol ||
1996 Callee.getOpcode() == ISD::TargetGlobalAddress) &&
1997 "Expecting an global address, external symbol, or register");
1999 return DAG.getNode(X86ISD::TC_RETURN, dl,
2000 NodeTys, &Ops[0], Ops.size());
2003 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2004 InFlag = Chain.getValue(1);
2006 // Create the CALLSEQ_END node.
2007 unsigned NumBytesForCalleeToPush;
2008 if (IsCalleePop(isVarArg, CallConv))
2009 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2010 else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2011 // If this is a call to a struct-return function, the callee
2012 // pops the hidden struct pointer, so we have to push it back.
2013 // This is common for Darwin/X86, Linux & Mingw32 targets.
2014 NumBytesForCalleeToPush = 4;
2016 NumBytesForCalleeToPush = 0; // Callee pops nothing.
2018 // Returns a flag for retval copy to use.
2019 Chain = DAG.getCALLSEQ_END(Chain,
2020 DAG.getIntPtrConstant(NumBytes, true),
2021 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2024 InFlag = Chain.getValue(1);
2026 // Handle result values, copying them out of physregs into vregs that we return.
2028 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2029 Ins, dl, DAG, InVals);
2033 //===----------------------------------------------------------------------===//
2034 // Fast Calling Convention (tail call) implementation
2035 //===----------------------------------------------------------------------===//
2037 // Like std call, the callee cleans up the arguments, except that ECX is
2038 // reserved for storing the tail-called function's address. Only 2 registers are
2039 // free for argument passing (inreg). Tail call optimization is performed
2041 // * tailcallopt is enabled
2042 // * caller/callee are fastcc
2043 // On X86_64 architecture with GOT-style position independent code only local
2044 // (within module) calls are supported at the moment.
2045 // To keep the stack aligned according to the platform ABI, the function
2046 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2047 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.)
2048 // If a tail-called function (callee) has more arguments than the caller, the
2049 // caller needs to make sure that there is room to move the RETADDR to. This is
2050 // achieved by reserving an area the size of the argument delta right after the
2051 // original RETADDR, but before the saved frame pointer or the spilled registers,
2052 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
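// Illustrative example (not the original diagram): assume 8-byte slots and a
// fastcc caller whose own incoming argument area is 16 bytes. If it tail calls
// callee(arg1, arg2, arg3, arg4) needing 32 bytes, FPDiff = 16 - 32 = -16; the
// return address is moved from its fixed slot at offset -SlotSize (-8) down to
// FPDiff - SlotSize (-24), and the callee's stack arguments are placed at
// LocMemOffset + FPDiff, i.e. 16 bytes lower than a normal call would place
// them, reusing the vacated space.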
2064 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
2065 /// 16-byte alignment requirement (with a 4-byte return-address slot).
2066 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2067 SelectionDAG& DAG) {
2068 MachineFunction &MF = DAG.getMachineFunction();
2069 const TargetMachine &TM = MF.getTarget();
2070 const TargetFrameInfo &TFI = *TM.getFrameInfo();
2071 unsigned StackAlignment = TFI.getStackAlignment();
2072 uint64_t AlignMask = StackAlignment - 1;
2073 int64_t Offset = StackSize;
2074 uint64_t SlotSize = TD->getPointerSize();
2075 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2076 // Number smaller than 12 so just add the difference.
2077 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2079 // Mask out lower bits, add stackalignment once plus the 12 bytes.
2080 Offset = ((~AlignMask) & Offset) + StackAlignment +
2081 (StackAlignment-SlotSize);
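// Worked example (illustrative only): on x86-64, StackAlignment = 16 and
// SlotSize = 8, so AlignMask = 15. For StackSize = 20: 20 & 15 = 4 <= 8, so
// Offset becomes 20 + (8 - 4) = 24 (16n + 8). For StackSize = 28: 28 & 15 = 12
// > 8, so Offset becomes (28 & ~15) + 16 + 8 = 40 (again 16n + 8). Either way
// the stack is 16-byte aligned once the 8-byte return address is pushed.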
2086 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2087 /// for tail call optimization. Targets which want to do tail call
2088 /// optimization should implement this function.
2090 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2093 const SmallVectorImpl<ISD::InputArg> &Ins,
2094 SelectionDAG& DAG) const {
2095 MachineFunction &MF = DAG.getMachineFunction();
2096 unsigned CallerCC = MF.getFunction()->getCallingConv();
2097 return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
2101 X86TargetLowering::createFastISel(MachineFunction &mf,
2102 MachineModuleInfo *mmo,
2104 DenseMap<const Value *, unsigned> &vm,
2105 DenseMap<const BasicBlock *,
2106 MachineBasicBlock *> &bm,
2107 DenseMap<const AllocaInst *, int> &am
2109 , SmallSet<Instruction*, 8> &cil
2112 return X86::createFastISel(mf, mmo, dw, vm, bm, am
2120 //===----------------------------------------------------------------------===//
2121 // Other Lowering Hooks
2122 //===----------------------------------------------------------------------===//
2125 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2126 MachineFunction &MF = DAG.getMachineFunction();
2127 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2128 int ReturnAddrIndex = FuncInfo->getRAIndex();
2130 if (ReturnAddrIndex == 0) {
2131 // Set up a frame object for the return address.
2132 uint64_t SlotSize = TD->getPointerSize();
2133 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
2134 FuncInfo->setRAIndex(ReturnAddrIndex);
2137 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2141 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2142 bool hasSymbolicDisplacement) {
2143 // Offset should fit into 32 bit immediate field.
2144 if (!isInt32(Offset))
2147 // If we don't have a symbolic displacement - we don't have any extra restrictions.
2149 if (!hasSymbolicDisplacement)
2152 // FIXME: Some tweaks might be needed for medium code model.
2153 if (M != CodeModel::Small && M != CodeModel::Kernel)
2156 // For the small code model we assume that the latest object is 16MB before the
2157 // end of the 31-bit boundary. We may also accept pretty large negative constants,
2158 // knowing that all objects are in the positive half of the address space.
2159 if (M == CodeModel::Small && Offset < 16*1024*1024)
2162 // For the kernel code model we know that all objects reside in the negative half
2163 // of the 32-bit address space. We may not accept negative offsets, since they may
2164 // take an address just outside that range, but we may accept pretty large positive ones.
2165 if (M == CodeModel::Kernel && Offset > 0)
2171 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2172 /// specific condition code, returning the condition code and the LHS/RHS of the
2173 /// comparison to make.
2174 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2175 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2177 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2178 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2179 // X > -1 -> X == 0, jump !sign.
2180 RHS = DAG.getConstant(0, RHS.getValueType());
2181 return X86::COND_NS;
2182 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2183 // X < 0 -> X == 0, jump on sign.
2185 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2187 RHS = DAG.getConstant(0, RHS.getValueType());
2188 return X86::COND_LE;
2192 switch (SetCCOpcode) {
2193 default: llvm_unreachable("Invalid integer condition!");
2194 case ISD::SETEQ: return X86::COND_E;
2195 case ISD::SETGT: return X86::COND_G;
2196 case ISD::SETGE: return X86::COND_GE;
2197 case ISD::SETLT: return X86::COND_L;
2198 case ISD::SETLE: return X86::COND_LE;
2199 case ISD::SETNE: return X86::COND_NE;
2200 case ISD::SETULT: return X86::COND_B;
2201 case ISD::SETUGT: return X86::COND_A;
2202 case ISD::SETULE: return X86::COND_BE;
2203 case ISD::SETUGE: return X86::COND_AE;
2207 // First determine if it is required or is profitable to flip the operands.
2209 // If LHS is a foldable load, but RHS is not, flip the condition.
2210 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2211 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2212 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2213 std::swap(LHS, RHS);
2216 switch (SetCCOpcode) {
2222 std::swap(LHS, RHS);
2226 // On a floating point condition, the flags are set as follows:
2227 //  ZF | PF | CF |  op
2228 // 0 | 0 | 0 | X > Y
2229 // 0 | 0 | 1 | X < Y
2230 // 1 | 0 | 0 | X == Y
2231 // 1 | 1 | 1 | unordered
2232 switch (SetCCOpcode) {
2233 default: llvm_unreachable("Condcode should be pre-legalized away");
2235 case ISD::SETEQ: return X86::COND_E;
2236 case ISD::SETOLT: // flipped
2238 case ISD::SETGT: return X86::COND_A;
2239 case ISD::SETOLE: // flipped
2241 case ISD::SETGE: return X86::COND_AE;
2242 case ISD::SETUGT: // flipped
2244 case ISD::SETLT: return X86::COND_B;
2245 case ISD::SETUGE: // flipped
2247 case ISD::SETLE: return X86::COND_BE;
2249 case ISD::SETNE: return X86::COND_NE;
2250 case ISD::SETUO: return X86::COND_P;
2251 case ISD::SETO: return X86::COND_NP;
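// Illustrative example (not in the original comment): for
//   fcmp olt double %a, %b
// SETOLT is in the "flipped" group above, so LHS/RHS are swapped and COND_A is
// returned; comparing (%b, %a) and testing COND_A (CF == 0 && ZF == 0) is true
// exactly when %b > %a with ordered operands, which is precisely "%a olt %b".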
2255 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
2256 /// code. The current x86 ISA includes the following FP cmov instructions:
2257 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2258 static bool hasFPCMov(unsigned X86CC) {
2274 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2275 /// the specified range [Low, Hi).
2276 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2277 return (Val < 0) || (Val >= Low && Val < Hi);
2280 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2281 /// specified value.
2282 static bool isUndefOrEqual(int Val, int CmpVal) {
2283 if (Val < 0 || Val == CmpVal)
2288 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2289 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
2290 /// the second operand.
2291 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2292 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2293 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2294 if (VT == MVT::v2f64 || VT == MVT::v2i64)
2295 return (Mask[0] < 2 && Mask[1] < 2);
2299 bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2300 SmallVector<int, 8> M;
2302 return ::isPSHUFDMask(M, N->getValueType(0));
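// Example (illustrative, not from the original source): on v4i32, the mask
// <3,1,2,0> only references operand 0, so isPSHUFDMask returns true; the
// corresponding PSHUFD immediate (see getShuffleSHUFImmediate) packs the mask
// two bits per element, lowest element first: 3 | 1<<2 | 2<<4 | 0<<6 = 0x27.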
2305 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2306 /// is suitable for input to PSHUFHW.
2307 static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2308 if (VT != MVT::v8i16)
2311 // Lower quadword copied in order or undef.
2312 for (int i = 0; i != 4; ++i)
2313 if (Mask[i] >= 0 && Mask[i] != i)
2316 // Upper quadword shuffled.
2317 for (int i = 4; i != 8; ++i)
2318 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2324 bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2325 SmallVector<int, 8> M;
2327 return ::isPSHUFHWMask(M, N->getValueType(0));
2330 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2331 /// is suitable for input to PSHUFLW.
2332 static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2333 if (VT != MVT::v8i16)
2336 // Upper quadword copied in order.
2337 for (int i = 4; i != 8; ++i)
2338 if (Mask[i] >= 0 && Mask[i] != i)
2341 // Lower quadword shuffled.
2342 for (int i = 0; i != 4; ++i)
2349 bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2350 SmallVector<int, 8> M;
2352 return ::isPSHUFLWMask(M, N->getValueType(0));
2355 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2356 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
2357 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2358 int NumElems = VT.getVectorNumElements();
2359 if (NumElems != 2 && NumElems != 4)
2362 int Half = NumElems / 2;
2363 for (int i = 0; i < Half; ++i)
2364 if (!isUndefOrInRange(Mask[i], 0, NumElems))
2366 for (int i = Half; i < NumElems; ++i)
2367 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2373 bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2374 SmallVector<int, 8> M;
2376 return ::isSHUFPMask(M, N->getValueType(0));
2379 /// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2380 /// the reverse of what x86 shuffles want. x86 shuffles require the lower
2381 /// half elements to come from vector 1 (which would equal the dest.) and
2382 /// the upper half to come from vector 2.
2383 static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2384 int NumElems = VT.getVectorNumElements();
2386 if (NumElems != 2 && NumElems != 4)
2389 int Half = NumElems / 2;
2390 for (int i = 0; i < Half; ++i)
2391 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2393 for (int i = Half; i < NumElems; ++i)
2394 if (!isUndefOrInRange(Mask[i], 0, NumElems))
2399 static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2400 SmallVector<int, 8> M;
2402 return isCommutedSHUFPMask(M, N->getValueType(0));
2405 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2406 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2407 bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2408 if (N->getValueType(0).getVectorNumElements() != 4)
2411 // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
2412 return isUndefOrEqual(N->getMaskElt(0), 6) &&
2413 isUndefOrEqual(N->getMaskElt(1), 7) &&
2414 isUndefOrEqual(N->getMaskElt(2), 2) &&
2415 isUndefOrEqual(N->getMaskElt(3), 3);
2418 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2419 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2420 bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2421 unsigned NumElems = N->getValueType(0).getVectorNumElements();
2423 if (NumElems != 2 && NumElems != 4)
2426 for (unsigned i = 0; i < NumElems/2; ++i)
2427 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2430 for (unsigned i = NumElems/2; i < NumElems; ++i)
2431 if (!isUndefOrEqual(N->getMaskElt(i), i))
2437 /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
2438 /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
2440 bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
2441 unsigned NumElems = N->getValueType(0).getVectorNumElements();
2443 if (NumElems != 2 && NumElems != 4)
2446 for (unsigned i = 0; i < NumElems/2; ++i)
2447 if (!isUndefOrEqual(N->getMaskElt(i), i))
2450 for (unsigned i = 0; i < NumElems/2; ++i)
2451 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2457 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2458 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2460 bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2461 unsigned NumElems = N->getValueType(0).getVectorNumElements();
2466 return isUndefOrEqual(N->getMaskElt(0), 2) &&
2467 isUndefOrEqual(N->getMaskElt(1), 3) &&
2468 isUndefOrEqual(N->getMaskElt(2), 2) &&
2469 isUndefOrEqual(N->getMaskElt(3), 3);
2472 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2473 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
2474 static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2475 bool V2IsSplat = false) {
2476 int NumElts = VT.getVectorNumElements();
2477 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2480 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2482 int BitI1 = Mask[i+1];
2483 if (!isUndefOrEqual(BitI, j))
2486 if (!isUndefOrEqual(BitI1, NumElts))
2489 if (!isUndefOrEqual(BitI1, j + NumElts))
2496 bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2497 SmallVector<int, 8> M;
2499 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2502 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2503 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
2504 static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2505 bool V2IsSplat = false) {
2506 int NumElts = VT.getVectorNumElements();
2507 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2510 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2512 int BitI1 = Mask[i+1];
2513 if (!isUndefOrEqual(BitI, j + NumElts/2))
2516 if (isUndefOrEqual(BitI1, NumElts))
2519 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2526 bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2527 SmallVector<int, 8> M;
2529 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2532 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2533 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2535 static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2536 int NumElems = VT.getVectorNumElements();
2537 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2540 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2542 int BitI1 = Mask[i+1];
2543 if (!isUndefOrEqual(BitI, j))
2545 if (!isUndefOrEqual(BitI1, j))
2551 bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2552 SmallVector<int, 8> M;
2554 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2557 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2558 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2560 static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2561 int NumElems = VT.getVectorNumElements();
2562 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2565 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2567 int BitI1 = Mask[i+1];
2568 if (!isUndefOrEqual(BitI, j))
2570 if (!isUndefOrEqual(BitI1, j))
2576 bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2577 SmallVector<int, 8> M;
2579 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2582 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2583 /// specifies a shuffle of elements that is suitable for input to MOVSS,
2584 /// MOVSD, and MOVD, i.e. setting the lowest element.
2585 static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2586 if (VT.getVectorElementType().getSizeInBits() < 32)
2589 int NumElts = VT.getVectorNumElements();
2591 if (!isUndefOrEqual(Mask[0], NumElts))
2594 for (int i = 1; i < NumElts; ++i)
2595 if (!isUndefOrEqual(Mask[i], i))
2601 bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2602 SmallVector<int, 8> M;
2604 return ::isMOVLMask(M, N->getValueType(0));
2607 /// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
2608 /// x86 movss wants. X86 movss requires the lowest element to be the lowest
2609 /// element of vector 2 and the other elements to come from vector 1 in order.
2610 static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2611 bool V2IsSplat = false, bool V2IsUndef = false) {
2612 int NumOps = VT.getVectorNumElements();
2613 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2616 if (!isUndefOrEqual(Mask[0], 0))
2619 for (int i = 1; i < NumOps; ++i)
2620 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
2621 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
2622 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
2628 static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
2629 bool V2IsUndef = false) {
2630 SmallVector<int, 8> M;
2632 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
2635 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2636 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2637 bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
2638 if (N->getValueType(0).getVectorNumElements() != 4)
2641 // Expect 1, 1, 3, 3
2642 for (unsigned i = 0; i < 2; ++i) {
2643 int Elt = N->getMaskElt(i);
2644 if (Elt >= 0 && Elt != 1)
2649 for (unsigned i = 2; i < 4; ++i) {
2650 int Elt = N->getMaskElt(i);
2651 if (Elt >= 0 && Elt != 3)
2656 // Don't use movshdup if it can be done with a shufps.
2657 // FIXME: verify that matching u, u, 3, 3 is what we want.
2661 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2662 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2663 bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
2664 if (N->getValueType(0).getVectorNumElements() != 4)
2667 // Expect 0, 0, 2, 2
2668 for (unsigned i = 0; i < 2; ++i)
2669 if (N->getMaskElt(i) > 0)
2673 for (unsigned i = 2; i < 4; ++i) {
2674 int Elt = N->getMaskElt(i);
2675 if (Elt >= 0 && Elt != 2)
2680 // Don't use movsldup if it can be done with a shufps.
2684 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2685 /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
2686 bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
2687 int e = N->getValueType(0).getVectorNumElements() / 2;
2689 for (int i = 0; i < e; ++i)
2690 if (!isUndefOrEqual(N->getMaskElt(i), i))
2692 for (int i = 0; i < e; ++i)
2693 if (!isUndefOrEqual(N->getMaskElt(e+i), i))
2698 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
2699 /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
2701 unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
2702 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2703 int NumOperands = SVOp->getValueType(0).getVectorNumElements();
2705 unsigned Shift = (NumOperands == 4) ? 2 : 1;
2707 for (int i = 0; i < NumOperands; ++i) {
2708 int Val = SVOp->getMaskElt(NumOperands-i-1);
2709 if (Val < 0) Val = 0;
2710 if (Val >= NumOperands) Val -= NumOperands;
2712 if (i != NumOperands - 1)
2718 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
2719 /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
2721 unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
2722 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2724 // 8 nodes, but we only care about the last 4.
2725 for (unsigned i = 7; i >= 4; --i) {
2726 int Val = SVOp->getMaskElt(i);
2735 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
2736 /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
2738 unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
2739 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2741 // 8 nodes, but we only care about the first 4.
2742 for (int i = 3; i >= 0; --i) {
2743 int Val = SVOp->getMaskElt(i);
2752 /// isZeroNode - Returns true if Elt is an integer constant zero or a floating point constant +0.0.
2754 bool X86::isZeroNode(SDValue Elt) {
2755 return ((isa<ConstantSDNode>(Elt) &&
2756 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2757 (isa<ConstantFPSDNode>(Elt) &&
2758 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2761 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
2762 /// their permute mask.
2763 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
2764 SelectionDAG &DAG) {
2765 EVT VT = SVOp->getValueType(0);
2766 unsigned NumElems = VT.getVectorNumElements();
2767 SmallVector<int, 8> MaskVec;
2769 for (unsigned i = 0; i != NumElems; ++i) {
2770 int idx = SVOp->getMaskElt(i);
2772 MaskVec.push_back(idx);
2773 else if (idx < (int)NumElems)
2774 MaskVec.push_back(idx + NumElems);
2776 MaskVec.push_back(idx - NumElems);
2778 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
2779 SVOp->getOperand(0), &MaskVec[0]);
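// Example (illustrative): commuting shuffle(V1, V2, <0,5,2,7>) on 4-element
// vectors yields shuffle(V2, V1, <4,1,6,3>): indices below NumElems move up by
// NumElems and indices at or above it move down, so every lane still reads the
// same source value after the operands are swapped.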
2782 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
2783 /// the two vector operands have swapped position.
2784 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
2785 unsigned NumElems = VT.getVectorNumElements();
2786 for (unsigned i = 0; i != NumElems; ++i) {
2790 else if (idx < (int)NumElems)
2791 Mask[i] = idx + NumElems;
2793 Mask[i] = idx - NumElems;
2797 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2798 /// match movhlps. The lower half elements should come from upper half of
2799 /// V1 (and in order), and the upper half elements should come from the upper
2800 /// half of V2 (and in order).
2801 static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
2802 if (Op->getValueType(0).getVectorNumElements() != 4)
2804 for (unsigned i = 0, e = 2; i != e; ++i)
2805 if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
2807 for (unsigned i = 2; i != 4; ++i)
2808 if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
2813 /// isScalarLoadToVector - Returns true if the node is a scalar load that
2814 /// is promoted to a vector. It also returns the LoadSDNode by reference if required.
2816 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2817 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2819 N = N->getOperand(0).getNode();
2820 if (!ISD::isNON_EXTLoad(N))
2823 *LD = cast<LoadSDNode>(N);
2827 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2828 /// match movlp{s|d}. The lower half elements should come from lower half of
2829 /// V1 (and in order), and the upper half elements should come from the upper
2830 /// half of V2 (and in order). And since V1 will become the source of the
2831 /// MOVLP, it must be either a vector load or a scalar load to vector.
2832 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
2833 ShuffleVectorSDNode *Op) {
2834 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2836 // If V2 is a vector load, don't do this transformation. We will try to use
2837 // a load-folding shufps op instead.
2838 if (ISD::isNON_EXTLoad(V2))
2841 unsigned NumElems = Op->getValueType(0).getVectorNumElements();
2843 if (NumElems != 2 && NumElems != 4)
2845 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2846 if (!isUndefOrEqual(Op->getMaskElt(i), i))
2848 for (unsigned i = NumElems/2; i != NumElems; ++i)
2849 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
2854 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are all the same.
2856 static bool isSplatVector(SDNode *N) {
2857 if (N->getOpcode() != ISD::BUILD_VECTOR)
2860 SDValue SplatValue = N->getOperand(0);
2861 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2862 if (N->getOperand(i) != SplatValue)
2867 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2868 /// to a zero vector.
2869 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
2870 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
2871 SDValue V1 = N->getOperand(0);
2872 SDValue V2 = N->getOperand(1);
2873 unsigned NumElems = N->getValueType(0).getVectorNumElements();
2874 for (unsigned i = 0; i != NumElems; ++i) {
2875 int Idx = N->getMaskElt(i);
2876 if (Idx >= (int)NumElems) {
2877 unsigned Opc = V2.getOpcode();
2878 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2880 if (Opc != ISD::BUILD_VECTOR ||
2881 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
2883 } else if (Idx >= 0) {
2884 unsigned Opc = V1.getOpcode();
2885 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
2887 if (Opc != ISD::BUILD_VECTOR ||
2888 !X86::isZeroNode(V1.getOperand(Idx)))
2895 /// getZeroVector - Returns a vector of specified type with all zero elements.
2897 static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
2899 assert(VT.isVector() && "Expected a vector type");
2901 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2902 // type. This ensures they get CSE'd.
2904 if (VT.getSizeInBits() == 64) { // MMX
2905 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2906 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2907 } else if (HasSSE2) { // SSE2
2908 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2909 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2911 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
2912 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
2914 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2917 /// getOnesVector - Returns a vector of specified type with all bits set.
2919 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
2920 assert(VT.isVector() && "Expected a vector type");
2922 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2923 // type. This ensures they get CSE'd.
2924 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
2926 if (VT.getSizeInBits() == 64) // MMX
2927 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2929 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2930 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2934 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
2935 /// that point to V2 point to its first element.
2936 static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
2937 EVT VT = SVOp->getValueType(0);
2938 unsigned NumElems = VT.getVectorNumElements();
2940 bool Changed = false;
2941 SmallVector<int, 8> MaskVec;
2942 SVOp->getMask(MaskVec);
2944 for (unsigned i = 0; i != NumElems; ++i) {
2945 if (MaskVec[i] > (int)NumElems) {
2946 MaskVec[i] = NumElems;
2951 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
2952 SVOp->getOperand(1), &MaskVec[0]);
2953 return SDValue(SVOp, 0);
2956 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
2957 /// operation of the specified width.
2958 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
2960 unsigned NumElems = VT.getVectorNumElements();
2961 SmallVector<int, 8> Mask;
2962 Mask.push_back(NumElems);
2963 for (unsigned i = 1; i != NumElems; ++i)
2965 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
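// Example (illustrative): getMOVL(DAG, dl, MVT::v4f32, V1, V2) builds the
// shuffle mask <4,1,2,3>, i.e. element 0 comes from V2 and the remaining
// elements come from V1 -- the MOVSS pattern of replacing only the low lane.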
2968 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
2969 static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
2971 unsigned NumElems = VT.getVectorNumElements();
2972 SmallVector<int, 8> Mask;
2973 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
2975 Mask.push_back(i + NumElems);
2977 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
2980 /// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
2981 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
2983 unsigned NumElems = VT.getVectorNumElements();
2984 unsigned Half = NumElems/2;
2985 SmallVector<int, 8> Mask;
2986 for (unsigned i = 0; i != Half; ++i) {
2987 Mask.push_back(i + Half);
2988 Mask.push_back(i + NumElems + Half);
2990 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
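// Example (illustrative): for v4i32, getUnpackl produces the mask <0,4,1,5>
// (interleave the low halves of V1 and V2) and getUnpackh produces <2,6,3,7>
// (interleave the high halves), matching PUNPCKLDQ / PUNPCKHDQ.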
2993 /// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
2994 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
2996 if (SV->getValueType(0).getVectorNumElements() <= 4)
2997 return SDValue(SV, 0);
2999 EVT PVT = MVT::v4f32;
3000 EVT VT = SV->getValueType(0);
3001 DebugLoc dl = SV->getDebugLoc();
3002 SDValue V1 = SV->getOperand(0);
3003 int NumElems = VT.getVectorNumElements();
3004 int EltNo = SV->getSplatIndex();
3006 // unpack elements to the correct location
3007 while (NumElems > 4) {
3008 if (EltNo < NumElems/2) {
3009 V1 = getUnpackl(DAG, dl, VT, V1, V1);
3011 V1 = getUnpackh(DAG, dl, VT, V1, V1);
3012 EltNo -= NumElems/2;
3017 // Perform the splat.
3018 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3019 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3020 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3021 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
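// Example trace (illustrative): splatting element 5 of a v8i16 vector: the
// first loop iteration sees EltNo = 5 >= NumElems/2 = 4, so V1 = unpackh(V1,V1)
// and EltNo becomes 1 with four 32-bit lanes remaining; the splat is then done
// as the v4f32 shuffle <1,1,1,1> and the result is bitcast back to v8i16.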
3024 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3025 /// vector and a zero or undef vector. This produces a shuffle where the low
3026 /// element of V2 is swizzled into the zero/undef vector, landing at element
3027 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3028 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3029 bool isZero, bool HasSSE2,
3030 SelectionDAG &DAG) {
3031 EVT VT = V2.getValueType();
3033 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3034 unsigned NumElems = VT.getVectorNumElements();
3035 SmallVector<int, 16> MaskVec;
3036 for (unsigned i = 0; i != NumElems; ++i)
3037 // If this is the insertion idx, put the low elt of V2 here.
3038 MaskVec.push_back(i == Idx ? NumElems : i);
3039 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3042 /// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
3043 /// shuffle result that are zero, counting from the low end (Low) or the high end.
3045 unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3046 bool Low, SelectionDAG &DAG) {
3047 unsigned NumZeros = 0;
3048 for (int i = 0; i < NumElems; ++i) {
3049 unsigned Index = Low ? i : NumElems-i-1;
3050 int Idx = SVOp->getMaskElt(Index);
3055 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3056 if (Elt.getNode() && X86::isZeroNode(Elt))
3064 /// isVectorShift - Returns true if the shuffle can be implemented as a
3065 /// logical left or right shift of a vector.
3066 /// FIXME: split into pslldqi, psrldqi, palignr variants.
3067 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3068 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3069 int NumElems = SVOp->getValueType(0).getVectorNumElements();
3072 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3075 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3079 bool SeenV1 = false;
3080 bool SeenV2 = false;
3081 for (int i = NumZeros; i < NumElems; ++i) {
3082 int Val = isLeft ? (i - NumZeros) : i;
3083 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3095 if (SeenV1 && SeenV2)
3098 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3104 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3106 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3107 unsigned NumNonZero, unsigned NumZero,
3108 SelectionDAG &DAG, TargetLowering &TLI) {
3112 DebugLoc dl = Op.getDebugLoc();
3115 for (unsigned i = 0; i < 16; ++i) {
3116 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3117 if (ThisIsNonZero && First) {
3119 V = getZeroVector(MVT::v8i16, true, DAG, dl);
3121 V = DAG.getUNDEF(MVT::v8i16);
3126 SDValue ThisElt(0, 0), LastElt(0, 0);
3127 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3128 if (LastIsNonZero) {
3129 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3130 MVT::i16, Op.getOperand(i-1));
3132 if (ThisIsNonZero) {
3133 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3134 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3135 ThisElt, DAG.getConstant(8, MVT::i8));
3137 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3141 if (ThisElt.getNode())
3142 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3143 DAG.getIntPtrConstant(i/2));
3147 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
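// Example (illustrative): if byte operands 0 and 1 of the build_vector are
// 0x11 and 0x22 (both non-zero), the loop above zero-extends both to i16,
// shifts the odd byte left by 8 and ORs them, inserting the i16 value 0x2211
// into lane 0 of the v8i16 vector; pairing adjacent bytes this way halves the
// number of element insertions needed before the final bitcast back to v16i8.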
3150 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3152 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3153 unsigned NumNonZero, unsigned NumZero,
3154 SelectionDAG &DAG, TargetLowering &TLI) {
3158 DebugLoc dl = Op.getDebugLoc();
3161 for (unsigned i = 0; i < 8; ++i) {
3162 bool isNonZero = (NonZeros & (1 << i)) != 0;
3166 V = getZeroVector(MVT::v8i16, true, DAG, dl);
3168 V = DAG.getUNDEF(MVT::v8i16);
3171 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3172 MVT::v8i16, V, Op.getOperand(i),
3173 DAG.getIntPtrConstant(i));
3180 /// getVShift - Return a vector logical shift node.
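//
// Usage sketch (an illustration, not a call site from this file): for a
// 128-bit VT the source is bitcast to v2i64 and shifted as one wide value,
// e.g.
//   getVShift(/*isLeft=*/true, MVT::v4i32, V, /*NumBits=*/64, DAG, TLI, dl)
// shifts the whole register left by 64 bits; per the FIXME in isVectorShift
// above, these X86ISD::VSHL/VSRL nodes are the ones meant to map onto the
// pslldq/psrldq byte shifts.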
3182 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3183 unsigned NumBits, SelectionDAG &DAG,
3184 const TargetLowering &TLI, DebugLoc dl) {
3185 bool isMMX = VT.getSizeInBits() == 64;
3186 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3187 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3188 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3189 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3190 DAG.getNode(Opc, dl, ShVT, SrcOp,
3191 DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3195 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3196 DebugLoc dl = Op.getDebugLoc();
3197 // All zero's are handled with pxor, all one's are handled with pcmpeqd.
3198 if (ISD::isBuildVectorAllZeros(Op.getNode())
3199 || ISD::isBuildVectorAllOnes(Op.getNode())) {
3200 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3201 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3202 // eliminated on x86-32 hosts.
3203 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3204 return Op;
3206 if (ISD::isBuildVectorAllOnes(Op.getNode()))
3207 return getOnesVector(Op.getValueType(), DAG, dl);
3208 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3211 EVT VT = Op.getValueType();
3212 EVT ExtVT = VT.getVectorElementType();
3213 unsigned EVTBits = ExtVT.getSizeInBits();
3215 unsigned NumElems = Op.getNumOperands();
3216 unsigned NumZero = 0;
3217 unsigned NumNonZero = 0;
3218 unsigned NonZeros = 0;
3219 bool IsAllConstants = true;
3220 SmallSet<SDValue, 8> Values;
3221 for (unsigned i = 0; i < NumElems; ++i) {
3222 SDValue Elt = Op.getOperand(i);
3223 if (Elt.getOpcode() == ISD::UNDEF)
3224 continue;
3225 Values.insert(Elt);
3226 if (Elt.getOpcode() != ISD::Constant &&
3227 Elt.getOpcode() != ISD::ConstantFP)
3228 IsAllConstants = false;
3229 if (X86::isZeroNode(Elt))
3230 NumZero++;
3231 else {
3232 NonZeros |= (1 << i);
3233 NumNonZero++;
3234 }
3235 }
3237 if (NumNonZero == 0) {
3238 // All undef vector. Return an UNDEF. All zero vectors were handled above.
3239 return DAG.getUNDEF(VT);
3242 // Special case for single non-zero, non-undef, element.
3243 if (NumNonZero == 1) {
3244 unsigned Idx = CountTrailingZeros_32(NonZeros);
3245 SDValue Item = Op.getOperand(Idx);
3247 // If this is an insertion of an i64 value on x86-32, and if the top bits of
3248 // the value are obviously zero, truncate the value to i32 and do the
3249 // insertion that way. Only do this if the value is non-constant or if the
3250 // value is a constant being inserted into element 0. It is cheaper to do
3251 // a constant pool load than it is to do a movd + shuffle.
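// For example (a sketch assuming SSE2 on x86-32): building
//   <2 x i64> <i64 1234, i64 0>
// truncates 1234 to i32, forms a v4i32 with SCALAR_TO_VECTOR, applies the
// zero-extending shuffle <4,1,2,3>, and bitcasts back to v2i64. For element
// 0 this typically folds to a single movd; for other indices the extra
// swizzle below moves the value into place.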
3252 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3253 (!IsAllConstants || Idx == 0)) {
3254 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3255 // Handle MMX and SSE both.
3256 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3257 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3259 // Truncate the value (which may itself be a constant) to i32, and
3260 // convert it to a vector with movd (S2V+shuffle to zero extend).
3261 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3262 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3263 Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3264 Subtarget->hasSSE2(), DAG);
3266 // Now we have our 32-bit value zero extended in the low element of
3267 // a vector. If Idx != 0, swizzle it into place.
3269 SmallVector<int, 4> Mask;
3270 Mask.push_back(Idx);
3271 for (unsigned i = 1; i != VecElts; ++i)
3272 Mask.push_back(i);
3273 Item = DAG.getVectorShuffle(VecVT, dl, Item,
3274 DAG.getUNDEF(Item.getValueType()),
3277 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3281 // If we have a constant or non-constant insertion into the low element of
3282 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3283 // the rest of the elements. This will be matched as movd/movq/movss/movsd
3284 // depending on what the source datatype is.
3287 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3288 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3289 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3290 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3291 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3292 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3294 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3295 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3296 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3297 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3298 Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3299 Subtarget->hasSSE2(), DAG);
3300 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3304 // Is it a vector logical left shift?
3305 if (NumElems == 2 && Idx == 1 &&
3306 X86::isZeroNode(Op.getOperand(0)) &&
3307 !X86::isZeroNode(Op.getOperand(1))) {
3308 unsigned NumBits = VT.getSizeInBits();
3309 return getVShift(true, VT,
3310 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3311 VT, Op.getOperand(1)),
3312 NumBits/2, DAG, *this, dl);
3315 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3318 // Otherwise, if this is a vector with i32 or f32 elements, and the element
3319 // is a non-constant being inserted into an element other than the low one,
3320 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
3321 // movd/movss) to move this into the low element, then shuffle it into place.
3323 if (EVTBits == 32) {
3324 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3326 // Turn it into a shuffle of zero and zero-extended scalar to vector.
3327 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3328 Subtarget->hasSSE2(), DAG);
3329 SmallVector<int, 8> MaskVec;
3330 for (unsigned i = 0; i < NumElems; i++)
3331 MaskVec.push_back(i == Idx ? 0 : 1);
3332 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3336 // Splat is obviously ok. Let legalizer expand it to a shuffle.
3337 if (Values.size() == 1)
3340 // A vector full of immediates; various special cases are already
3341 // handled, so this is best done with a single constant-pool load.
3345 // Let legalizer expand 2-wide build_vectors.
3346 if (EVTBits == 64) {
3347 if (NumNonZero == 1) {
3348 // One half is zero or undef.
3349 unsigned Idx = CountTrailingZeros_32(NonZeros);
3350 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3351 Op.getOperand(Idx));
3352 return getShuffleVectorZeroOrUndef(V2, Idx, true,
3353 Subtarget->hasSSE2(), DAG);
3358 // If element VT is < 32 bits, convert it to inserts into a zero vector.
3359 if (EVTBits == 8 && NumElems == 16) {
3360 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3362 if (V.getNode()) return V;
3365 if (EVTBits == 16 && NumElems == 8) {
3366 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3368 if (V.getNode()) return V;
3371 // If element VT is == 32 bits, turn it into a number of shuffles.
3372 SmallVector<SDValue, 8> V;
3374 if (NumElems == 4 && NumZero > 0) {
3375 for (unsigned i = 0; i < 4; ++i) {
3376 bool isZero = !(NonZeros & (1 << i));
3377 if (isZero)
3378 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3379 else
3380 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3383 for (unsigned i = 0; i < 2; ++i) {
3384 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3387 V[i] = V[i*2]; // Must be a zero vector.
3390 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3393 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3396 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3401 SmallVector<int, 8> MaskVec;
3402 bool Reverse = (NonZeros & 0x3) == 2;
3403 for (unsigned i = 0; i < 2; ++i)
3404 MaskVec.push_back(Reverse ? 1-i : i);
3405 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3406 for (unsigned i = 0; i < 2; ++i)
3407 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3408 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3411 if (Values.size() > 2) {
3412 // If we have SSE 4.1, expand into a number of inserts unless the number of
3413 // values to be inserted is equal to the number of elements, in which case
3414 // use the unpack code below in the hopes of matching the consecutive elts
3415 // load merge pattern for shuffles.
3416 // FIXME: We could probably just check that here directly.
3417 if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
3418 getSubtarget()->hasSSE41()) {
3419 V[0] = DAG.getUNDEF(VT);
3420 for (unsigned i = 0; i < NumElems; ++i)
3421 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
3422 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
3423 Op.getOperand(i), DAG.getIntPtrConstant(i));
3426 // Expand into a number of unpckl*.
3428 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3429 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3430 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
3431 for (unsigned i = 0; i < NumElems; ++i)
3432 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3434 while (NumElems != 0) {
3435 for (unsigned i = 0; i < NumElems; ++i)
3436 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3445 // v8i16 shuffles - Prefer shuffles in the following order:
3446 // 1. [all] pshuflw, pshufhw, optional move
3447 // 2. [ssse3] 1 x pshufb
3448 // 3. [ssse3] 2 x pshufb + 1 x por
3449 // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
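//
// A small example of case 1 (illustrative only): the mask
//   <0,1,2,3, 7,6,5,4>
// leaves the low half in place and only permutes the high half within its
// own quadword, so it is returned early below and is expected to match as a
// single pshufhw.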
3451 SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3452 SelectionDAG &DAG, X86TargetLowering &TLI) {
3453 SDValue V1 = SVOp->getOperand(0);
3454 SDValue V2 = SVOp->getOperand(1);
3455 DebugLoc dl = SVOp->getDebugLoc();
3456 SmallVector<int, 8> MaskVals;
3458 // Determine if more than 1 of the words in each of the low and high quadwords
3459 // of the result come from the same quadword of one of the two inputs. Undef
3460 // mask values count as coming from any quadword, for better codegen.
3461 SmallVector<unsigned, 4> LoQuad(4);
3462 SmallVector<unsigned, 4> HiQuad(4);
3463 BitVector InputQuads(4);
3464 for (unsigned i = 0; i < 8; ++i) {
3465 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
3466 int EltIdx = SVOp->getMaskElt(i);
3467 MaskVals.push_back(EltIdx);
3476 InputQuads.set(EltIdx / 4);
3479 int BestLoQuad = -1;
3480 unsigned MaxQuad = 1;
3481 for (unsigned i = 0; i < 4; ++i) {
3482 if (LoQuad[i] > MaxQuad) {
3483 BestLoQuad = i;
3484 MaxQuad = LoQuad[i];
3488 int BestHiQuad = -1;
3490 for (unsigned i = 0; i < 4; ++i) {
3491 if (HiQuad[i] > MaxQuad) {
3492 BestHiQuad = i;
3493 MaxQuad = HiQuad[i];
3497 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
3498 // of the two input vectors, shuffle them into one input vector so only a
3499 // single pshufb instruction is necessary. If there are more than 2 input
3500 // quads, disable the next transformation since it does not help SSSE3.
3501 bool V1Used = InputQuads[0] || InputQuads[1];
3502 bool V2Used = InputQuads[2] || InputQuads[3];
3503 if (TLI.getSubtarget()->hasSSSE3()) {
3504 if (InputQuads.count() == 2 && V1Used && V2Used) {
3505 BestLoQuad = InputQuads.find_first();
3506 BestHiQuad = InputQuads.find_next(BestLoQuad);
3508 if (InputQuads.count() > 2) {
3514 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
3515 // the shuffle mask. If a quad is scored as -1, that means that it contains
3516 // words from all 4 input quadwords.
3518 if (BestLoQuad >= 0 || BestHiQuad >= 0) {
3519 SmallVector<int, 8> MaskV;
3520 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
3521 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
3522 NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
3523 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
3524 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
3525 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
3527 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
3528 // source words for the shuffle, to aid later transformations.
3529 bool AllWordsInNewV = true;
3530 bool InOrder[2] = { true, true };
3531 for (unsigned i = 0; i != 8; ++i) {
3532 int idx = MaskVals[i];
3534 InOrder[i/4] = false;
3535 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
3537 AllWordsInNewV = false;
3541 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
3542 if (AllWordsInNewV) {
3543 for (int i = 0; i != 8; ++i) {
3544 int idx = MaskVals[i];
3547 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
3548 if ((idx != i) && idx < 4)
3549 pshufhw = false;
3550 if ((idx != i) && idx > 3)
3551 pshuflw = false;
3559 // If we've eliminated the use of V2, and the new mask is a pshuflw or
3560 // pshufhw, that's as cheap as it gets. Return the new shuffle.
3561 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
3562 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
3563 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
3567 // If we have SSSE3, and all words of the result are from 1 input vector,
3568 // case 2 is generated, otherwise case 3 is generated. If no SSSE3
3569 // is present, fall back to case 4.
3570 if (TLI.getSubtarget()->hasSSSE3()) {
3571 SmallVector<SDValue,16> pshufbMask;
3573 // If we have elements from both input vectors, set the high bit of the
3574 // shuffle mask element to zero out elements that come from V2 in the V1
3575 // mask, and elements that come from V1 in the V2 mask, so that the two
3576 // results can be OR'd together.
3577 bool TwoInputs = V1Used && V2Used;
3578 for (unsigned i = 0; i != 8; ++i) {
3579 int EltIdx = MaskVals[i] * 2;
3580 if (TwoInputs && (EltIdx >= 16)) {
3581 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3582 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3585 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
3586 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
3588 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
3589 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3590 DAG.getNode(ISD::BUILD_VECTOR, dl,
3591 MVT::v16i8, &pshufbMask[0], 16));
3593 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3595 // Calculate the shuffle mask for the second input, shuffle it, and
3596 // OR it with the first shuffled input.
3598 for (unsigned i = 0; i != 8; ++i) {
3599 int EltIdx = MaskVals[i] * 2;
3601 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3602 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3605 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3606 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
3608 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
3609 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3610 DAG.getNode(ISD::BUILD_VECTOR, dl,
3611 MVT::v16i8, &pshufbMask[0], 16));
3612 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3613 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3616 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
3617 // and update MaskVals with new element order.
3618 BitVector InOrder(8);
3619 if (BestLoQuad >= 0) {
3620 SmallVector<int, 8> MaskV;
3621 for (int i = 0; i != 4; ++i) {
3622 int idx = MaskVals[i];
3624 MaskV.push_back(-1);
3626 } else if ((idx / 4) == BestLoQuad) {
3627 MaskV.push_back(idx & 3);
3630 MaskV.push_back(-1);
3633 for (unsigned i = 4; i != 8; ++i)
3634 MaskV.push_back(i);
3635 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3639 // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
3640 // and update MaskVals with the new element order.
3641 if (BestHiQuad >= 0) {
3642 SmallVector<int, 8> MaskV;
3643 for (unsigned i = 0; i != 4; ++i)
3644 MaskV.push_back(i);
3645 for (unsigned i = 4; i != 8; ++i) {
3646 int idx = MaskVals[i];
3648 MaskV.push_back(-1);
3650 } else if ((idx / 4) == BestHiQuad) {
3651 MaskV.push_back((idx & 3) + 4);
3654 MaskV.push_back(-1);
3657 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
3661 // In case BestHiQuad and BestLoQuad were both -1, which means each quadword has a word
3662 // from each of the four input quadwords, calculate the InOrder bitvector now
3663 // before falling through to the insert/extract cleanup.
3664 if (BestLoQuad == -1 && BestHiQuad == -1) {
3666 for (int i = 0; i != 8; ++i)
3667 if (MaskVals[i] < 0 || MaskVals[i] == i)
3668 InOrder.set(i);
3671 // The other elements are put in the right place using pextrw and pinsrw.
3672 for (unsigned i = 0; i != 8; ++i) {
3675 int EltIdx = MaskVals[i];
3678 SDValue ExtOp = (EltIdx < 8)
3679 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
3680 DAG.getIntPtrConstant(EltIdx))
3681 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
3682 DAG.getIntPtrConstant(EltIdx - 8));
3683 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
3684 DAG.getIntPtrConstant(i));
3689 // v16i8 shuffles - Prefer shuffles in the following order:
3690 // 1. [ssse3] 1 x pshufb
3691 // 2. [ssse3] 2 x pshufb + 1 x por
3692 // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
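//
// A small example of case 1 (illustrative only): a byte reverse of one
// input, mask <15,14,...,1,0>, draws every result byte from V1, so with
// SSSE3 it becomes a single pshufb whose control vector is {15,14,...,0};
// undef mask entries are encoded as 0x80, which makes pshufb write a zero
// byte in that position.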
3694 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
3695 SelectionDAG &DAG, X86TargetLowering &TLI) {
3696 SDValue V1 = SVOp->getOperand(0);
3697 SDValue V2 = SVOp->getOperand(1);
3698 DebugLoc dl = SVOp->getDebugLoc();
3699 SmallVector<int, 16> MaskVals;
3700 SVOp->getMask(MaskVals);
3702 // If we have SSSE3, case 1 is generated when all result bytes come from
3703 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
3704 // present, fall back to case 3.
3705 // FIXME: kill V2Only once shuffles are canonicalized by getNode.
3708 for (unsigned i = 0; i < 16; ++i) {
3709 int EltIdx = MaskVals[i];
3718 // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
3719 if (TLI.getSubtarget()->hasSSSE3()) {
3720 SmallVector<SDValue,16> pshufbMask;
3722 // If all result elements are from one input vector, then only translate
3723 // undef mask values to 0x80 (zero out result) in the pshufb mask.
3725 // Otherwise, we have elements from both input vectors, and must zero out
3726 // elements that come from V2 in the first mask, and V1 in the second mask
3727 // so that we can OR them together.
3728 bool TwoInputs = !(V1Only || V2Only);
3729 for (unsigned i = 0; i != 16; ++i) {
3730 int EltIdx = MaskVals[i];
3731 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
3732 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3735 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
3737 // If all the elements are from V2, assign it to V1 and return after
3738 // building the first pshufb.
3741 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3742 DAG.getNode(ISD::BUILD_VECTOR, dl,
3743 MVT::v16i8, &pshufbMask[0], 16));
3747 // Calculate the shuffle mask for the second input, shuffle it, and
3748 // OR it with the first shuffled input.
3750 for (unsigned i = 0; i != 16; ++i) {
3751 int EltIdx = MaskVals[i];
3753 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3756 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3758 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3759 DAG.getNode(ISD::BUILD_VECTOR, dl,
3760 MVT::v16i8, &pshufbMask[0], 16));
3761 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3764 // No SSSE3 - Calculate the in-place words and then fix all out-of-place
3765 // words with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
3766 // the 16 different words that comprise the two doublequadword input vectors.
3767 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3768 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
3769 SDValue NewV = V2Only ? V2 : V1;
3770 for (int i = 0; i != 8; ++i) {
3771 int Elt0 = MaskVals[i*2];
3772 int Elt1 = MaskVals[i*2+1];
3774 // This word of the result is all undef, skip it.
3775 if (Elt0 < 0 && Elt1 < 0)
3778 // This word of the result is already in the correct place, skip it.
3779 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
3781 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
3784 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
3785 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
3788 // If Elt0 and Elt1 are defined, are consecutive, and together form an
3789 // aligned word, extract that word with a single pextrw and insert it.
3790 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
3791 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3792 DAG.getIntPtrConstant(Elt1 / 2));
3793 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3794 DAG.getIntPtrConstant(i));
3798 // If Elt1 is defined, extract it from the appropriate source. If the
3799 // source byte is not also odd, shift the extracted word left 8 bits;
3800 // otherwise clear the bottom 8 bits if we need to do an OR.
3802 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3803 DAG.getIntPtrConstant(Elt1 / 2));
3804 if ((Elt1 & 1) == 0)
3805 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
3806 DAG.getConstant(8, TLI.getShiftAmountTy()));
3808 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
3809 DAG.getConstant(0xFF00, MVT::i16));
3811 // If Elt0 is defined, extract it from the appropriate source. If the
3812 // source byte is not also even, shift the extracted word right 8 bits. If
3813 // Elt1 was also defined, OR the extracted values together before
3814 // inserting them in the result.
3816 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
3817 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
3818 if ((Elt0 & 1) != 0)
3819 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
3820 DAG.getConstant(8, TLI.getShiftAmountTy()));
3822 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
3823 DAG.getConstant(0x00FF, MVT::i16));
3824 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
3827 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3828 DAG.getIntPtrConstant(i));
3830 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
3833 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
3834 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
3835 /// done when every pair / quad of shuffle mask elements point to elements in
3836 /// the right sequence. e.g.
3837 /// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
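/// For instance (an illustrative mask, not from a test case): the v8i16 mask
///   <0,1, 8,9, 2,3, 10,11>
/// groups into pairs that each start at an even element, so it can be
/// re-expressed as the v4i32 mask <0,4,1,5> on bitcast operands.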
3839 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
3841 TargetLowering &TLI, DebugLoc dl) {
3842 EVT VT = SVOp->getValueType(0);
3843 SDValue V1 = SVOp->getOperand(0);
3844 SDValue V2 = SVOp->getOperand(1);
3845 unsigned NumElems = VT.getVectorNumElements();
3846 unsigned NewWidth = (NumElems == 4) ? 2 : 4;
3847 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
3848 EVT MaskEltVT = MaskVT.getVectorElementType();
3850 switch (VT.getSimpleVT().SimpleTy) {
3851 default: assert(false && "Unexpected!");
3852 case MVT::v4f32: NewVT = MVT::v2f64; break;
3853 case MVT::v4i32: NewVT = MVT::v2i64; break;
3854 case MVT::v8i16: NewVT = MVT::v4i32; break;
3855 case MVT::v16i8: NewVT = MVT::v4i32; break;
3858 if (NewWidth == 2) {
3864 int Scale = NumElems / NewWidth;
3865 SmallVector<int, 8> MaskVec;
3866 for (unsigned i = 0; i < NumElems; i += Scale) {
3868 for (int j = 0; j < Scale; ++j) {
3869 int EltIdx = SVOp->getMaskElt(i+j);
3873 StartIdx = EltIdx - (EltIdx % Scale);
3874 if (EltIdx != StartIdx + j)
3878 MaskVec.push_back(-1);
3880 MaskVec.push_back(StartIdx / Scale);
3883 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
3884 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
3885 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
3888 /// getVZextMovL - Return a zero-extending vector move low node.
3890 static SDValue getVZextMovL(EVT VT, EVT OpVT,
3891 SDValue SrcOp, SelectionDAG &DAG,
3892 const X86Subtarget *Subtarget, DebugLoc dl) {
3893 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
3894 LoadSDNode *LD = NULL;
3895 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
3896 LD = dyn_cast<LoadSDNode>(SrcOp);
3898 // movssrr and movsdrr do not clear top bits. Try to use movd, movq instead.
3900 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
3901 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
3902 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
3903 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
3904 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
3906 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
3907 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3908 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
3909 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3917 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3918 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
3919 DAG.getNode(ISD::BIT_CONVERT, dl,
3923 /// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of shuffles.
3926 LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3927 SDValue V1 = SVOp->getOperand(0);
3928 SDValue V2 = SVOp->getOperand(1);
3929 DebugLoc dl = SVOp->getDebugLoc();
3930 EVT VT = SVOp->getValueType(0);
3932 SmallVector<std::pair<int, int>, 8> Locs;
3934 SmallVector<int, 8> Mask1(4U, -1);
3935 SmallVector<int, 8> PermMask;
3936 SVOp->getMask(PermMask);
3940 for (unsigned i = 0; i != 4; ++i) {
3941 int Idx = PermMask[i];
3943 Locs[i] = std::make_pair(-1, -1);
3945 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
3947 Locs[i] = std::make_pair(0, NumLo);
3951 Locs[i] = std::make_pair(1, NumHi);
3953 Mask1[2+NumHi] = Idx;
3959 if (NumLo <= 2 && NumHi <= 2) {
3960 // If no more than two elements come from either vector, this can be
3961 // implemented with two shuffles. The first shuffle gathers the elements.
3962 // The second shuffle, which takes the first shuffle as both of its
3963 // vector operands, puts the elements into the right order.
3964 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
3966 SmallVector<int, 8> Mask2(4U, -1);
3968 for (unsigned i = 0; i != 4; ++i) {
3969 if (Locs[i].first == -1)
3972 unsigned Idx = (i < 2) ? 0 : 4;
3973 Idx += Locs[i].first * 2 + Locs[i].second;
3978 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
3979 } else if (NumLo == 3 || NumHi == 3) {
3980 // Otherwise, we must have three elements from one vector, call it X, and
3981 // one element from the other, call it Y. First, use a shufps to build an
3982 // intermediate vector with the one element from Y and the element from X
3983 // that will be in the same half in the final destination (the indexes don't
3984 // matter). Then, use a shufps to build the final vector, taking the half
3985 // containing the element from Y from the intermediate, and the other half from X.
3988 // Normalize it so the 3 elements come from V1.
3989 CommuteVectorShuffleMask(PermMask, VT);
3993 // Find the element from V2.
3995 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
3996 int Val = PermMask[HiIndex];
4003 Mask1[0] = PermMask[HiIndex];
4005 Mask1[2] = PermMask[HiIndex^1];
4007 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4010 Mask1[0] = PermMask[0];
4011 Mask1[1] = PermMask[1];
4012 Mask1[2] = HiIndex & 1 ? 6 : 4;
4013 Mask1[3] = HiIndex & 1 ? 4 : 6;
4014 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4016 Mask1[0] = HiIndex & 1 ? 2 : 0;
4017 Mask1[1] = HiIndex & 1 ? 0 : 2;
4018 Mask1[2] = PermMask[2];
4019 Mask1[3] = PermMask[3];
4024 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4028 // Break it into (shuffle shuffle_hi, shuffle_lo).
4030 SmallVector<int,8> LoMask(4U, -1);
4031 SmallVector<int,8> HiMask(4U, -1);
4033 SmallVector<int,8> *MaskPtr = &LoMask;
4034 unsigned MaskIdx = 0;
4037 for (unsigned i = 0; i != 4; ++i) {
4044 int Idx = PermMask[i];
4046 Locs[i] = std::make_pair(-1, -1);
4047 } else if (Idx < 4) {
4048 Locs[i] = std::make_pair(MaskIdx, LoIdx);
4049 (*MaskPtr)[LoIdx] = Idx;
4052 Locs[i] = std::make_pair(MaskIdx, HiIdx);
4053 (*MaskPtr)[HiIdx] = Idx;
4058 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4059 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4060 SmallVector<int, 8> MaskOps;
4061 for (unsigned i = 0; i != 4; ++i) {
4062 if (Locs[i].first == -1) {
4063 MaskOps.push_back(-1);
4065 unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4066 MaskOps.push_back(Idx);
4069 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4073 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4074 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4075 SDValue V1 = Op.getOperand(0);
4076 SDValue V2 = Op.getOperand(1);
4077 EVT VT = Op.getValueType();
4078 DebugLoc dl = Op.getDebugLoc();
4079 unsigned NumElems = VT.getVectorNumElements();
4080 bool isMMX = VT.getSizeInBits() == 64;
4081 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4082 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4083 bool V1IsSplat = false;
4084 bool V2IsSplat = false;
4086 if (isZeroShuffle(SVOp))
4087 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4089 // Promote splats to v4f32.
4090 if (SVOp->isSplat()) {
4091 if (isMMX || NumElems < 4)
4093 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4096 // If the shuffle can be profitably rewritten as a narrower shuffle, then
4098 if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4099 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4100 if (NewOp.getNode())
4101 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4102 LowerVECTOR_SHUFFLE(NewOp, DAG));
4103 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4104 // FIXME: Figure out a cleaner way to do this.
4105 // Try to make use of movq to zero out the top part.
4106 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4107 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4108 if (NewOp.getNode()) {
4109 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4110 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4111 DAG, Subtarget, dl);
4113 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4114 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4115 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4116 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4117 DAG, Subtarget, dl);
4121 if (X86::isPSHUFDMask(SVOp))
4124 // Check if this can be converted into a logical shift.
4125 bool isLeft = false;
4128 bool isShift = getSubtarget()->hasSSE2() &&
4129 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4130 if (isShift && ShVal.hasOneUse()) {
4131 // If the shifted value has multiple uses, it may be cheaper to use
4132 // v_set0 + movlhps or movhlps, etc.
4133 EVT EVT = VT.getVectorElementType();
4134 ShAmt *= EVT.getSizeInBits();
4135 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4138 if (X86::isMOVLMask(SVOp)) {
4141 if (ISD::isBuildVectorAllZeros(V1.getNode()))
4142 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4147 // FIXME: fold these into legal mask.
4148 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4149 X86::isMOVSLDUPMask(SVOp) ||
4150 X86::isMOVHLPSMask(SVOp) ||
4151 X86::isMOVHPMask(SVOp) ||
4152 X86::isMOVLPMask(SVOp)))
4155 if (ShouldXformToMOVHLPS(SVOp) ||
4156 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4157 return CommuteVectorShuffle(SVOp, DAG);
4160 // No better options. Use a vshl / vsrl.
4161 EVT EVT = VT.getVectorElementType();
4162 ShAmt *= EVT.getSizeInBits();
4163 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4166 bool Commuted = false;
4167 // FIXME: This should also accept a bitcast of a splat? Be careful, not
4168 // 1,1,1,1 -> v8i16 though.
4169 V1IsSplat = isSplatVector(V1.getNode());
4170 V2IsSplat = isSplatVector(V2.getNode());
4172 // Canonicalize the splat or undef, if present, to be on the RHS.
4173 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4174 Op = CommuteVectorShuffle(SVOp, DAG);
4175 SVOp = cast<ShuffleVectorSDNode>(Op);
4176 V1 = SVOp->getOperand(0);
4177 V2 = SVOp->getOperand(1);
4178 std::swap(V1IsSplat, V2IsSplat);
4179 std::swap(V1IsUndef, V2IsUndef);
4183 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4184 // Shuffling low element of v1 into undef, just return v1.
4187 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4188 // the instruction selector will not match, so get a canonical MOVL with
4189 // swapped operands to undo the commute.
4190 return getMOVL(DAG, dl, VT, V2, V1);
4193 if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4194 X86::isUNPCKH_v_undef_Mask(SVOp) ||
4195 X86::isUNPCKLMask(SVOp) ||
4196 X86::isUNPCKHMask(SVOp))
4200 // Normalize the mask so all entries that point to V2 point to its first
4201 // element, then try to match unpck{h|l} again. If it matches, return a
4202 // new vector_shuffle with the corrected mask.
4203 SDValue NewMask = NormalizeMask(SVOp, DAG);
4204 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4205 if (NSVOp != SVOp) {
4206 if (X86::isUNPCKLMask(NSVOp, true)) {
4208 } else if (X86::isUNPCKHMask(NSVOp, true)) {
4215 // Commute it back and try unpck* again.
4216 // FIXME: this seems wrong.
4217 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4218 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4219 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4220 X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4221 X86::isUNPCKLMask(NewSVOp) ||
4222 X86::isUNPCKHMask(NewSVOp))
4226 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4228 // Normalize the node to match x86 shuffle ops if needed
4229 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4230 return CommuteVectorShuffle(SVOp, DAG);
4232 // Check for legal shuffle and return?
4233 SmallVector<int, 16> PermMask;
4234 SVOp->getMask(PermMask);
4235 if (isShuffleMaskLegal(PermMask, VT))
4238 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4239 if (VT == MVT::v8i16) {
4240 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4241 if (NewOp.getNode())
4245 if (VT == MVT::v16i8) {
4246 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4247 if (NewOp.getNode())
4251 // Handle all 4 wide cases with a number of shuffles except for MMX.
4252 if (NumElems == 4 && !isMMX)
4253 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4259 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4260 SelectionDAG &DAG) {
4261 EVT VT = Op.getValueType();
4262 DebugLoc dl = Op.getDebugLoc();
4263 if (VT.getSizeInBits() == 8) {
4264 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4265 Op.getOperand(0), Op.getOperand(1));
4266 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4267 DAG.getValueType(VT));
4268 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4269 } else if (VT.getSizeInBits() == 16) {
4270 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4271 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4272 if (Idx == 0)
4273 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4274 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4275 DAG.getNode(ISD::BIT_CONVERT, dl,
4279 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4280 Op.getOperand(0), Op.getOperand(1));
4281 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4282 DAG.getValueType(VT));
4283 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4284 } else if (VT == MVT::f32) {
4285 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4286 // the result back to FR32 register. It's only worth matching if the
4287 // result has a single use which is a store or a bitcast to i32. And in
4288 // the case of a store, it's not worth it if the index is a constant 0,
4289 // because a MOVSSmr can be used instead, which is smaller and faster.
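// Illustrative cases (assumptions spelled out, not exhaustive): extracting
// a nonzero lane of a v4f32 whose only use is a store is worth matching;
// extracting lane 0 for a store is not, since a plain movss store already
// handles it; a single bitcast-to-i32 use also qualifies.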
4290 if (!Op.hasOneUse())
4292 SDNode *User = *Op.getNode()->use_begin();
4293 if ((User->getOpcode() != ISD::STORE ||
4294 (isa<ConstantSDNode>(Op.getOperand(1)) &&
4295 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4296 (User->getOpcode() != ISD::BIT_CONVERT ||
4297 User->getValueType(0) != MVT::i32))
4299 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4300 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4303 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4304 } else if (VT == MVT::i32) {
4305 // ExtractPS works with constant index.
4306 if (isa<ConstantSDNode>(Op.getOperand(1)))
4314 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4315 if (!isa<ConstantSDNode>(Op.getOperand(1)))
4318 if (Subtarget->hasSSE41()) {
4319 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4324 EVT VT = Op.getValueType();
4325 DebugLoc dl = Op.getDebugLoc();
4326 // TODO: handle v16i8.
4327 if (VT.getSizeInBits() == 16) {
4328 SDValue Vec = Op.getOperand(0);
4329 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4330 if (Idx == 0)
4331 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4332 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4333 DAG.getNode(ISD::BIT_CONVERT, dl,
4336 // Transform it so it matches pextrw, which produces a 32-bit result.
4337 EVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1);
4338 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
4339 Op.getOperand(0), Op.getOperand(1));
4340 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
4341 DAG.getValueType(VT));
4342 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4343 } else if (VT.getSizeInBits() == 32) {
4344 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4345 if (Idx == 0)
4346 return Op;
4348 // SHUFPS the element to the lowest double word, then movss.
4349 int Mask[4] = { Idx, -1, -1, -1 };
4350 EVT VVT = Op.getOperand(0).getValueType();
4351 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4352 DAG.getUNDEF(VVT), Mask);
4353 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4354 DAG.getIntPtrConstant(0));
4355 } else if (VT.getSizeInBits() == 64) {
4356 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4357 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4358 // to match extract_elt for f64.
4359 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4360 if (Idx == 0)
4361 return Op;
4363 // UNPCKHPD the element to the lowest double word, then movsd.
4364 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
4365 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4366 int Mask[2] = { 1, -1 };
4367 EVT VVT = Op.getOperand(0).getValueType();
4368 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4369 DAG.getUNDEF(VVT), Mask);
4370 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4371 DAG.getIntPtrConstant(0));
4378 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4379 EVT VT = Op.getValueType();
4380 EVT EVT = VT.getVectorElementType();
4381 DebugLoc dl = Op.getDebugLoc();
4383 SDValue N0 = Op.getOperand(0);
4384 SDValue N1 = Op.getOperand(1);
4385 SDValue N2 = Op.getOperand(2);
4387 if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
4388 isa<ConstantSDNode>(N2)) {
4389 unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4391 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second argument.
4393 if (N1.getValueType() != MVT::i32)
4394 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4395 if (N2.getValueType() != MVT::i32)
4396 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4397 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4398 } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4399 // Bits [7:6] of the constant are the source select. This will always be
4400 // zero here. The DAG Combiner may combine an extract_elt index into these
4401 // bits. For example (insert (extract, 3), 2) could be matched by putting
4402 // the '3' into bits [7:6] of X86ISD::INSERTPS.
4403 // Bits [5:4] of the constant are the destination select. This is the
4404 // value of the incoming immediate.
4405 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
4406 // combine either bitwise AND or insert of float 0.0 to set these bits.
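// Worked example (a sketch): inserting into destination element 2 with no
// zeroing yields the immediate (2 << 4) == 0x20 -- source select (bits
// [7:6]) = 0, destination select (bits [5:4]) = 2, zero mask (bits [3:0]) = 0.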
4407 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4408 // Create this as a scalar to vector.
4409 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4410 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4411 } else if (EVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4412 // PINSR* works with constant index.
4419 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4420 EVT VT = Op.getValueType();
4421 EVT EVT = VT.getVectorElementType();
4423 if (Subtarget->hasSSE41())
4424 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4429 DebugLoc dl = Op.getDebugLoc();
4430 SDValue N0 = Op.getOperand(0);
4431 SDValue N1 = Op.getOperand(1);
4432 SDValue N2 = Op.getOperand(2);
4434 if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4435 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
4436 // as its second argument.
4437 if (N1.getValueType() != MVT::i32)
4438 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4439 if (N2.getValueType() != MVT::i32)
4440 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4441 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4447 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4448 DebugLoc dl = Op.getDebugLoc();
4449 if (Op.getValueType() == MVT::v2f32)
4450 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4451 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4452 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4453 Op.getOperand(0))));
4455 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4456 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4458 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4459 EVT VT = MVT::v2i32;
4460 switch (Op.getValueType().getSimpleVT().SimpleTy) {
4467 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4468 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4471 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4472 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
4473 // one of the above mentioned nodes. It has to be wrapped because otherwise
4474 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4475 // be used to form an addressing mode. These wrapped nodes will be selected into MOV32ri.
4478 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4479 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4481 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the global base reg.
4483 unsigned char OpFlag = 0;
4484 unsigned WrapperKind = X86ISD::Wrapper;
4485 CodeModel::Model M = getTargetMachine().getCodeModel();
4487 if (Subtarget->isPICStyleRIPRel() &&
4488 (M == CodeModel::Small || M == CodeModel::Kernel))
4489 WrapperKind = X86ISD::WrapperRIP;
4490 else if (Subtarget->isPICStyleGOT())
4491 OpFlag = X86II::MO_GOTOFF;
4492 else if (Subtarget->isPICStyleStubPIC())
4493 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4495 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4497 CP->getOffset(), OpFlag);
4498 DebugLoc DL = CP->getDebugLoc();
4499 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4500 // With PIC, the address is actually $g + Offset.
4502 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4503 DAG.getNode(X86ISD::GlobalBaseReg,
4504 DebugLoc::getUnknownLoc(), getPointerTy()),
4511 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4512 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4514 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the global base reg.
4516 unsigned char OpFlag = 0;
4517 unsigned WrapperKind = X86ISD::Wrapper;
4518 CodeModel::Model M = getTargetMachine().getCodeModel();
4520 if (Subtarget->isPICStyleRIPRel() &&
4521 (M == CodeModel::Small || M == CodeModel::Kernel))
4522 WrapperKind = X86ISD::WrapperRIP;
4523 else if (Subtarget->isPICStyleGOT())
4524 OpFlag = X86II::MO_GOTOFF;
4525 else if (Subtarget->isPICStyleStubPIC())
4526 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4528 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
4530 DebugLoc DL = JT->getDebugLoc();
4531 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4533 // With PIC, the address is actually $g + Offset.
4535 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4536 DAG.getNode(X86ISD::GlobalBaseReg,
4537 DebugLoc::getUnknownLoc(), getPointerTy()),
4545 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4546 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4548 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the global base reg.
4550 unsigned char OpFlag = 0;
4551 unsigned WrapperKind = X86ISD::Wrapper;
4552 CodeModel::Model M = getTargetMachine().getCodeModel();
4554 if (Subtarget->isPICStyleRIPRel() &&
4555 (M == CodeModel::Small || M == CodeModel::Kernel))
4556 WrapperKind = X86ISD::WrapperRIP;
4557 else if (Subtarget->isPICStyleGOT())
4558 OpFlag = X86II::MO_GOTOFF;
4559 else if (Subtarget->isPICStyleStubPIC())
4560 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4562 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
4564 DebugLoc DL = Op.getDebugLoc();
4565 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4568 // With PIC, the address is actually $g + Offset.
4569 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
4570 !Subtarget->is64Bit()) {
4571 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4572 DAG.getNode(X86ISD::GlobalBaseReg,
4573 DebugLoc::getUnknownLoc(),
4582 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
4584 SelectionDAG &DAG) const {
4585 // Create the TargetGlobalAddress node, folding in the constant
4586 // offset if it is legal.
4587 unsigned char OpFlags =
4588 Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4589 CodeModel::Model M = getTargetMachine().getCodeModel();
4591 if (OpFlags == X86II::MO_NO_FLAG &&
4592 X86::isOffsetSuitableForCodeModel(Offset, M)) {
4593 // A direct static reference to a global.
4594 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
4597 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
4600 if (Subtarget->isPICStyleRIPRel() &&
4601 (M == CodeModel::Small || M == CodeModel::Kernel))
4602 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
4604 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
4606 // With PIC, the address is actually $g + Offset.
4607 if (isGlobalRelativeToPICBase(OpFlags)) {
4608 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
4609 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
4613 // For globals that require a load from a stub to get the address, emit the
4615 if (isGlobalStubReference(OpFlags))
4616 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
4617 PseudoSourceValue::getGOT(), 0);
4619 // If there was a non-zero offset that we didn't fold, create an explicit
4622 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
4623 DAG.getConstant(Offset, getPointerTy()));
4629 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
4630 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
4631 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
4632 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
4636 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
4637 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
4638 unsigned char OperandFlags) {
4639 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
4640 DebugLoc dl = GA->getDebugLoc();
4641 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
4642 GA->getValueType(0),
4646 SDValue Ops[] = { Chain, TGA, *InFlag };
4647 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
4649 SDValue Ops[] = { Chain, TGA };
4650 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
4652 SDValue Flag = Chain.getValue(1);
4653 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
4656 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
4658 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4661 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
4662 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
4663 DAG.getNode(X86ISD::GlobalBaseReg,
4664 DebugLoc::getUnknownLoc(),
4666 InFlag = Chain.getValue(1);
4668 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
4671 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
4673 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4675 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
4676 X86::RAX, X86II::MO_TLSGD);
4679 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
4680 // "local exec" model.
4681 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
4682 const EVT PtrVT, TLSModel::Model model,
4684 DebugLoc dl = GA->getDebugLoc();
4685 // Get the Thread Pointer
4686 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
4687 DebugLoc::getUnknownLoc(), PtrVT,
4688 DAG.getRegister(is64Bit? X86::FS : X86::GS,
4691 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
4694 unsigned char OperandFlags = 0;
4695 // Most TLS accesses are not RIP relative, even on x86-64. One exception is the initial-exec model on x86-64, where the @gottpoff reference is RIP-relative.
4697 unsigned WrapperKind = X86ISD::Wrapper;
4698 if (model == TLSModel::LocalExec) {
4699 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
4700 } else if (is64Bit) {
4701 assert(model == TLSModel::InitialExec);
4702 OperandFlags = X86II::MO_GOTTPOFF;
4703 WrapperKind = X86ISD::WrapperRIP;
4705 assert(model == TLSModel::InitialExec);
4706 OperandFlags = X86II::MO_INDNTPOFF;
4709 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
4711 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
4712 GA->getOffset(), OperandFlags);
4713 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
4715 if (model == TLSModel::InitialExec)
4716 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
4717 PseudoSourceValue::getGOT(), 0);
4719 // The address of the thread local variable is the add of the thread
4720 // pointer with the offset of the variable.
4721 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
4725 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
4726 // TODO: implement the "local dynamic" model
4727 // TODO: implement the "initial exec"model for pic executables
4728 assert(Subtarget->isTargetELF() &&
4729 "TLS not implemented for non-ELF targets");
4730 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
4731 const GlobalValue *GV = GA->getGlobal();
4733 // If GV is an alias then use the aliasee for determining
4734 // thread-localness.
4735 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
4736 GV = GA->resolveAliasedGlobal(false);
4738 TLSModel::Model model = getTLSModel(GV,
4739 getTargetMachine().getRelocationModel());
4742 case TLSModel::GeneralDynamic:
4743 case TLSModel::LocalDynamic: // not implemented
4744 if (Subtarget->is64Bit())
4745 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
4746 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
4748 case TLSModel::InitialExec:
4749 case TLSModel::LocalExec:
4750 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
4751 Subtarget->is64Bit());
4754 llvm_unreachable("Unreachable");
4759 /// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
4760 /// take a 2 x i32 value to shift plus a shift amount.
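///
/// As a concrete illustration for SHL_PARTS with i32 parts: shifting the pair
/// (Lo, Hi) left by n < 32 gives Hi' = shld(Hi, Lo, n) and Lo' = Lo << n;
/// when bit 5 of the amount is set (n >= 32) the CMOVs below select
/// Hi' = Lo << (n & 31) and Lo' = 0 instead.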
4761 SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
4762 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4763 EVT VT = Op.getValueType();
4764 unsigned VTBits = VT.getSizeInBits();
4765 DebugLoc dl = Op.getDebugLoc();
4766 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
4767 SDValue ShOpLo = Op.getOperand(0);
4768 SDValue ShOpHi = Op.getOperand(1);
4769 SDValue ShAmt = Op.getOperand(2);
4770 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
4771 DAG.getConstant(VTBits - 1, MVT::i8))
4772 : DAG.getConstant(0, VT);
4775 if (Op.getOpcode() == ISD::SHL_PARTS) {
4776 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
4777 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4779 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
4780 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
4783 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
4784 DAG.getConstant(VTBits, MVT::i8));
4785 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
4786 AndNode, DAG.getConstant(0, MVT::i8));
4789 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
4790 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
4791 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
4793 if (Op.getOpcode() == ISD::SHL_PARTS) {
4794 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
4795 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
4797 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
4798 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
4801 SDValue Ops[2] = { Lo, Hi };
4802 return DAG.getMergeValues(Ops, 2, dl);
4805 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4806 EVT SrcVT = Op.getOperand(0).getValueType();
4808 if (SrcVT.isVector()) {
4809 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
4815 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
4816 "Unknown SINT_TO_FP to lower!");
4818 // These are really Legal; return the operand so the caller accepts it as
4819 // Legal.
4820 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
4821 return Op;
4822 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
4823 Subtarget->is64Bit()) {
4824 return Op;
4825 }
4827 DebugLoc dl = Op.getDebugLoc();
4828 unsigned Size = SrcVT.getSizeInBits()/8;
4829 MachineFunction &MF = DAG.getMachineFunction();
4830 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
4831 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4832 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
4834 PseudoSourceValue::getFixedStack(SSFI), 0);
4835 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
4838 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
4840 SelectionDAG &DAG) {
4842 DebugLoc dl = Op.getDebugLoc();
4844 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
4845 if (useSSE)
4846 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
4847 else
4848 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
4849 SmallVector<SDValue, 8> Ops;
4850 Ops.push_back(Chain);
4851 Ops.push_back(StackSlot);
4852 Ops.push_back(DAG.getValueType(SrcVT));
4853 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
4854 Tys, &Ops[0], Ops.size());
4857 Chain = Result.getValue(1);
4858 SDValue InFlag = Result.getValue(2);
4860 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
4861 // shouldn't be necessary except that RFP cannot be live across
4862 // multiple blocks. When stackifier is fixed, they can be uncoupled.
4863 MachineFunction &MF = DAG.getMachineFunction();
4864 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
4865 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
4866 Tys = DAG.getVTList(MVT::Other);
4867 SmallVector<SDValue, 8> Ops;
4868 Ops.push_back(Chain);
4869 Ops.push_back(Result);
4870 Ops.push_back(StackSlot);
4871 Ops.push_back(DAG.getValueType(Op.getValueType()));
4872 Ops.push_back(InFlag);
4873 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
4874 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
4875 PseudoSourceValue::getFixedStack(SSFI), 0);
4881 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
4882 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
4883 // This algorithm is not obvious. Here it is in C code, more or less:
4885 double uint64_to_double( uint32_t hi, uint32_t lo ) {
4886 static const __m128i exp = { 0x4330000045300000ULL, 0 };
4887 static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
4889 // Copy ints to xmm registers.
4890 __m128i xh = _mm_cvtsi32_si128( hi );
4891 __m128i xl = _mm_cvtsi32_si128( lo );
4893 // Combine into low half of a single xmm register.
4894 __m128i x = _mm_unpacklo_epi32( xh, xl );
4898 // Merge in appropriate exponents to give the integer bits the right magnitude.
4900 x = _mm_unpacklo_epi32( x, exp );
4902 // Subtract away the biases to deal with the IEEE-754 double precision format.
4904 d = _mm_sub_pd( (__m128d) x, bias );
4906 // All conversions up to here are exact. The correctly rounded result is
4907 // calculated using the current rounding mode with the following horizontal add.
4909 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
4910 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
4911 // store doesn't really need to be here (except
4912 // maybe to zero the other double)
4917 DebugLoc dl = Op.getDebugLoc();
4918 LLVMContext *Context = DAG.getContext();
4920 // Build some magic constants.
4921 std::vector<Constant*> CV0;
4922 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
4923 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
4924 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
4925 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
4926 Constant *C0 = ConstantVector::get(CV0);
4927 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
4929 std::vector<Constant*> CV1;
4931 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
4933 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
4934 Constant *C1 = ConstantVector::get(CV1);
4935 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
4937 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4938 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4940 DAG.getIntPtrConstant(1)));
4941 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4942 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4944 DAG.getIntPtrConstant(0)));
4945 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
4946 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
4947 PseudoSourceValue::getConstantPool(), 0,
4949 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
4950 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
4951 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
4952 PseudoSourceValue::getConstantPool(), 0,
4954 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
4956 // Add the halves; easiest way is to swap them into another reg first.
4957 int ShufMask[2] = { 1, -1 };
4958 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
4959 DAG.getUNDEF(MVT::v2f64), ShufMask);
4960 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
4961 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
4962 DAG.getIntPtrConstant(0));
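// Why the bias trick above is exact (a sketch of the arithmetic, not code from
// this file): a double whose high word is 0x45300000 and whose low word is
// 'hi' has exponent 84, so its value is 2^84 + hi*2^32; likewise a double
// built from 0x43300000:'lo' has the value 2^52 + lo. Subtracting the biases
// 0x1.0p84 and 0x1.0p52 recovers hi*2^32 and lo exactly, so only the final
// horizontal add can round, and it does so in the current rounding mode.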
4965 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
4966 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
4967 DebugLoc dl = Op.getDebugLoc();
4968 // FP constant to bias correct the final result.
4969 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
4972 // Load the 32-bit value into an XMM register.
4973 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
4974 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
4976 DAG.getIntPtrConstant(0)));
4978 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
4979 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
4980 DAG.getIntPtrConstant(0));
4982 // Or the load with the bias.
4983 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
4984 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
4985 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4987 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
4988 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4989 MVT::v2f64, Bias)));
4990 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
4991 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
4992 DAG.getIntPtrConstant(0));
4994 // Subtract the bias.
4995 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
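// A scalar sketch of the identity this lowering relies on (illustrative only;
// u32_to_f64 is not a function in this file):
//
//   #include <stdint.h>
//   #include <string.h>
//   double u32_to_f64(uint32_t x) {
//     uint64_t bits = 0x4330000000000000ULL | (uint64_t)x; // 2^52 + x, exact
//     double d;
//     memcpy(&d, &bits, sizeof d);
//     return d - 0x1.0p52;                                 // == (double)x
//   }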
4997 // Handle final rounding.
4998 EVT DestVT = Op.getValueType();
5000 if (DestVT.bitsLT(MVT::f64)) {
5001 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5002 DAG.getIntPtrConstant(0));
5003 } else if (DestVT.bitsGT(MVT::f64)) {
5004 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5007 // The result is already f64; no further rounding is needed.
5011 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5012 SDValue N0 = Op.getOperand(0);
5013 DebugLoc dl = Op.getDebugLoc();
5015 // Since UINT_TO_FP is considered legal here (it's marked Custom), the DAG
5016 // combiner won't optimize it to a SINT_TO_FP when the sign bit is known
5017 // zero. Perform the optimization here.
5018 if (DAG.SignBitIsZero(N0))
5019 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5021 EVT SrcVT = N0.getValueType();
5022 if (SrcVT == MVT::i64) {
5023 // We only handle SSE2 f64 target here; caller can expand the rest.
5024 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5027 return LowerUINT_TO_FP_i64(Op, DAG);
5028 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5029 return LowerUINT_TO_FP_i32(Op, DAG);
5032 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5034 // Make a 64-bit buffer, and use it to build an FILD.
5035 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5036 SDValue WordOff = DAG.getConstant(4, getPointerTy());
5037 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5038 getPointerTy(), StackSlot, WordOff);
5039 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5040 StackSlot, NULL, 0);
5041 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5042 OffsetSlot, NULL, 0);
5043 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5046 std::pair<SDValue,SDValue> X86TargetLowering::
5047 FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5048 DebugLoc dl = Op.getDebugLoc();
5050 EVT DstTy = Op.getValueType();
5053 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5057 assert(DstTy.getSimpleVT() <= MVT::i64 &&
5058 DstTy.getSimpleVT() >= MVT::i16 &&
5059 "Unknown FP_TO_SINT to lower!");
5061 // These are really Legal.
5062 if (DstTy == MVT::i32 &&
5063 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5064 return std::make_pair(SDValue(), SDValue());
5065 if (Subtarget->is64Bit() &&
5066 DstTy == MVT::i64 &&
5067 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5068 return std::make_pair(SDValue(), SDValue());
5070 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary stack slot.
5072 MachineFunction &MF = DAG.getMachineFunction();
5073 unsigned MemSize = DstTy.getSizeInBits()/8;
5074 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
5075 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5078 switch (DstTy.getSimpleVT().SimpleTy) {
5079 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5080 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5081 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5082 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5085 SDValue Chain = DAG.getEntryNode();
5086 SDValue Value = Op.getOperand(0);
5087 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5088 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5089 Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5090 PseudoSourceValue::getFixedStack(SSFI), 0);
5091 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5093 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5095 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5096 Chain = Value.getValue(1);
5097 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
5098 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5101 // Build the FP_TO_INT*_IN_MEM
5102 SDValue Ops[] = { Chain, Value, StackSlot };
5103 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5105 return std::make_pair(FIST, StackSlot);
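// Note (a sketch, not a statement about the exact expansion): C-style FP to
// integer conversion truncates toward zero, e.g. (int32_t)-2.7 == -2, while
// FIST honours the current x87 rounding mode, so the FP_TO_INT*_IN_MEM
// pseudos are expected to account for the rounding mode when they are
// expanded later.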
5108 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
5109 if (Op.getValueType().isVector()) {
5110 if (Op.getValueType() == MVT::v2i32 &&
5111 Op.getOperand(0).getValueType() == MVT::v2f64) {
5117 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5118 SDValue FIST = Vals.first, StackSlot = Vals.second;
5119 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5120 if (FIST.getNode() == 0) return Op;
5123 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5124 FIST, StackSlot, NULL, 0);
5127 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
5128 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5129 SDValue FIST = Vals.first, StackSlot = Vals.second;
5130 assert(FIST.getNode() && "Unexpected failure");
5133 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5134 FIST, StackSlot, NULL, 0);
5137 SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
5138 LLVMContext *Context = DAG.getContext();
5139 DebugLoc dl = Op.getDebugLoc();
5140 EVT VT = Op.getValueType();
5143 EltVT = VT.getVectorElementType();
5144 std::vector<Constant*> CV;
5145 if (EltVT == MVT::f64) {
5146 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5150 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5156 Constant *C = ConstantVector::get(CV);
5157 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5158 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5159 PseudoSourceValue::getConstantPool(), 0,
5161 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5164 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
5165 LLVMContext *Context = DAG.getContext();
5166 DebugLoc dl = Op.getDebugLoc();
5167 EVT VT = Op.getValueType();
5169 unsigned EltNum = 1;
5170 if (VT.isVector()) {
5171 EltVT = VT.getVectorElementType();
5172 EltNum = VT.getVectorNumElements();
5174 std::vector<Constant*> CV;
5175 if (EltVT == MVT::f64) {
5176 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5180 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5186 Constant *C = ConstantVector::get(CV);
5187 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5188 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5189 PseudoSourceValue::getConstantPool(), 0,
5191 if (VT.isVector()) {
5192 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5193 DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5194 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5196 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5198 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
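// Scalar picture of the constant-pool masks used by LowerFABS and LowerFNEG
// above (a sketch; fabs_bits/fneg_bits are illustrative names only):
//
//   #include <stdint.h>
//   #include <string.h>
//   double fabs_bits(double x) {          // AND with ~(1<<63): clear sign bit
//     uint64_t b; memcpy(&b, &x, 8);
//     b &= ~(1ULL << 63);
//     memcpy(&x, &b, 8); return x;
//   }
//   double fneg_bits(double x) {          // XOR with (1<<63): flip sign bit
//     uint64_t b; memcpy(&b, &x, 8);
//     b ^= 1ULL << 63;
//     memcpy(&x, &b, 8); return x;
//   }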
5202 SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
5203 LLVMContext *Context = DAG.getContext();
5204 SDValue Op0 = Op.getOperand(0);
5205 SDValue Op1 = Op.getOperand(1);
5206 DebugLoc dl = Op.getDebugLoc();
5207 EVT VT = Op.getValueType();
5208 EVT SrcVT = Op1.getValueType();
5210 // If second operand is smaller, extend it first.
5211 if (SrcVT.bitsLT(VT)) {
5212 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5215 // And if it is bigger, shrink it first.
5216 if (SrcVT.bitsGT(VT)) {
5217 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5221 // At this point the operands and the result should have the same
5222 // type, and that won't be f80 since that is not custom lowered.
5224 // First get the sign bit of second operand.
5225 std::vector<Constant*> CV;
5226 if (SrcVT == MVT::f64) {
5227 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5228 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5230 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5231 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5232 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5233 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5235 Constant *C = ConstantVector::get(CV);
5236 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5237 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5238 PseudoSourceValue::getConstantPool(), 0,
5240 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5242 // Shift sign bit right or left if the two operands have different types.
5243 if (SrcVT.bitsGT(VT)) {
5244 // Op0 is MVT::f32, Op1 is MVT::f64.
5245 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5246 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5247 DAG.getConstant(32, MVT::i32));
5248 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5249 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5250 DAG.getIntPtrConstant(0));
5253 // Clear first operand sign bit.
5255 if (VT == MVT::f64) {
5256 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
5257 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5259 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
5260 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5261 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5262 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5264 C = ConstantVector::get(CV);
5265 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5266 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5267 PseudoSourceValue::getConstantPool(), 0,
5269 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
5271 // Or the value with the sign bit.
5272 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
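// Scalar equivalent of the mask-and-or sequence above for the f64 case
// (a sketch; copysign_bits is an illustrative name only):
//
//   #include <stdint.h>
//   #include <string.h>
//   double copysign_bits(double mag, double sgn) {
//     uint64_t m, s;
//     memcpy(&m, &mag, 8); memcpy(&s, &sgn, 8);
//     uint64_t r = (m & ~(1ULL << 63))   // Mask2: keep the magnitude bits
//                | (s &  (1ULL << 63));  // Mask1: keep only the sign bit
//     memcpy(&mag, &r, 8); return mag;
//   }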
5275 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
5277 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
5278 SelectionDAG &DAG) {
5279 DebugLoc dl = Op.getDebugLoc();
5281 // CF and OF aren't always set the way we want. Determine which
5282 // of these we need.
5283 bool NeedCF = false;
5284 bool NeedOF = false;
5286 case X86::COND_A: case X86::COND_AE:
5287 case X86::COND_B: case X86::COND_BE:
5290 case X86::COND_G: case X86::COND_GE:
5291 case X86::COND_L: case X86::COND_LE:
5292 case X86::COND_O: case X86::COND_NO:
5298 // See if we can use the EFLAGS value from the operand instead of
5299 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
5300 // we prove that the arithmetic won't overflow, we can't use OF or CF.
5301 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
5302 unsigned Opcode = 0;
5303 unsigned NumOperands = 0;
5304 switch (Op.getNode()->getOpcode()) {
5306 // Due to an isel shortcoming, be conservative if this add is likely to
5307 // be selected as part of a load-modify-store instruction. When the root
5308 // node in a match is a store, isel doesn't know how to remap non-chain
5309 // non-flag uses of other nodes in the match, such as the ADD in this
5310 // case. This leads to the ADD being left around and reselected, with
5311 // the result being two adds in the output.
5312 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5313 UE = Op.getNode()->use_end(); UI != UE; ++UI)
5314 if (UI->getOpcode() == ISD::STORE)
5316 if (ConstantSDNode *C =
5317 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5318 // An add of one will be selected as an INC.
5319 if (C->getAPIntValue() == 1) {
5320 Opcode = X86ISD::INC;
5324 // An add of negative one (subtract of one) will be selected as a DEC.
5325 if (C->getAPIntValue().isAllOnesValue()) {
5326 Opcode = X86ISD::DEC;
5331 // Otherwise use a regular EFLAGS-setting add.
5332 Opcode = X86ISD::ADD;
5336 // Due to the ISEL shortcoming noted above, be conservative if this sub is
5337 // likely to be selected as part of a load-modify-store instruction.
5338 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5339 UE = Op.getNode()->use_end(); UI != UE; ++UI)
5340 if (UI->getOpcode() == ISD::STORE)
5342 // Otherwise use a regular EFLAGS-setting sub.
5343 Opcode = X86ISD::SUB;
5350 return SDValue(Op.getNode(), 1);
5356 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5357 SmallVector<SDValue, 4> Ops;
5358 for (unsigned i = 0; i != NumOperands; ++i)
5359 Ops.push_back(Op.getOperand(i));
5360 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5361 DAG.ReplaceAllUsesWith(Op, New);
5362 return SDValue(New.getNode(), 1);
5366 // Otherwise just emit a CMP with 0, which is the TEST pattern.
5367 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5368 DAG.getConstant(0, Op.getValueType()));
5371 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
5373 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
5374 SelectionDAG &DAG) {
5375 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
5376 if (C->getAPIntValue() == 0)
5377 return EmitTest(Op0, X86CC, DAG);
5379 DebugLoc dl = Op0.getDebugLoc();
5380 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
5383 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
5384 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
5385 SDValue Op0 = Op.getOperand(0);
5386 SDValue Op1 = Op.getOperand(1);
5387 DebugLoc dl = Op.getDebugLoc();
5388 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5390 // Lower (X & (1 << N)) == 0 to BT(X, N).
5391 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
5392 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
5393 if (Op0.getOpcode() == ISD::AND &&
5395 Op1.getOpcode() == ISD::Constant &&
5396 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
5397 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5399 if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
5400 if (ConstantSDNode *Op010C =
5401 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
5402 if (Op010C->getZExtValue() == 1) {
5403 LHS = Op0.getOperand(0);
5404 RHS = Op0.getOperand(1).getOperand(1);
5406 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
5407 if (ConstantSDNode *Op000C =
5408 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
5409 if (Op000C->getZExtValue() == 1) {
5410 LHS = Op0.getOperand(1);
5411 RHS = Op0.getOperand(0).getOperand(1);
5413 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
5414 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
5415 SDValue AndLHS = Op0.getOperand(0);
5416 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
5417 LHS = AndLHS.getOperand(0);
5418 RHS = AndLHS.getOperand(1);
5422 if (LHS.getNode()) {
5423 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT
5424 // instruction. Since the shift amount is in-range-or-undefined, we know
5425 // that doing a bittest on the i16 value is ok. We extend to i32 because
5426 // the encoding for the i16 version is larger than the i32 version.
5427 if (LHS.getValueType() == MVT::i8)
5428 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
5430 // If the operand types disagree, extend the shift amount to match. Since
5431 // BT ignores high bits (like shifts) we can use anyextend.
5432 if (LHS.getValueType() != RHS.getValueType())
5433 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
5435 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
5436 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
5437 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5438 DAG.getConstant(Cond, MVT::i8), BT);
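// Illustrative C patterns that reach this BT lowering (a sketch assuming
// <stdint.h>/<stdbool.h>; the helper names are ours): both test bit N of X
// and can be selected as "bt X, N" followed by setb (SETNE) or setae (SETEQ).
//
//   bool test_bit_shl(uint32_t x, unsigned n) { return (x & (1u << n)) != 0; }
//   bool test_bit_shr(uint32_t x, unsigned n) { return ((x >> n) & 1) != 0; }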
5442 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5443 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5445 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
5446 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5447 DAG.getConstant(X86CC, MVT::i8), Cond);
5450 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5452 SDValue Op0 = Op.getOperand(0);
5453 SDValue Op1 = Op.getOperand(1);
5454 SDValue CC = Op.getOperand(2);
5455 EVT VT = Op.getValueType();
5456 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5457 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5458 DebugLoc dl = Op.getDebugLoc();
5462 EVT VT0 = Op0.getValueType();
5463 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
5464 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
5467 switch (SetCCOpcode) {
5470 case ISD::SETEQ: SSECC = 0; break;
5472 case ISD::SETGT: Swap = true; // Fallthrough
5474 case ISD::SETOLT: SSECC = 1; break;
5476 case ISD::SETGE: Swap = true; // Fallthrough
5478 case ISD::SETOLE: SSECC = 2; break;
5479 case ISD::SETUO: SSECC = 3; break;
5481 case ISD::SETNE: SSECC = 4; break;
5482 case ISD::SETULE: Swap = true;
5483 case ISD::SETUGE: SSECC = 5; break;
5484 case ISD::SETULT: Swap = true;
5485 case ISD::SETUGT: SSECC = 6; break;
5486 case ISD::SETO: SSECC = 7; break;
5489 std::swap(Op0, Op1);
5491 // In the two special cases we can't handle, emit two comparisons.
5493 if (SetCCOpcode == ISD::SETUEQ) {
5495 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
5496 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
5497 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
5499 else if (SetCCOpcode == ISD::SETONE) {
5501 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
5502 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
5503 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
5505 llvm_unreachable("Illegal FP comparison");
5507 // Handle all other FP comparisons here.
5508 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
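// For reference, the cmpps/cmppd immediate encodings behind the SSECC values
// above are: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD. That is
// why SETUEQ is emitted as UNORD|EQ and SETONE as ORD&NEQ in the special
// cases handled earlier.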
5511 // We are handling one of the integer comparisons here. Since SSE only has
5512 // GT and EQ comparisons for integers, swapping operands and multiple
5513 // operations may be required for some comparisons.
5514 unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
5515 bool Swap = false, Invert = false, FlipSigns = false;
5517 switch (VT.getSimpleVT().SimpleTy) {
5520 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
5522 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
5524 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
5525 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
5528 switch (SetCCOpcode) {
5530 case ISD::SETNE: Invert = true;
5531 case ISD::SETEQ: Opc = EQOpc; break;
5532 case ISD::SETLT: Swap = true;
5533 case ISD::SETGT: Opc = GTOpc; break;
5534 case ISD::SETGE: Swap = true;
5535 case ISD::SETLE: Opc = GTOpc; Invert = true; break;
5536 case ISD::SETULT: Swap = true;
5537 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
5538 case ISD::SETUGE: Swap = true;
5539 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
5542 std::swap(Op0, Op1);
5544 // Since SSE has no unsigned integer comparisons, we need to flip the sign
5545 // bits of the inputs before performing those operations.
5547 EVT EltVT = VT.getVectorElementType();
5548 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
5550 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
5551 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
5553 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
5554 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
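// The sign-bit flip above uses the standard trick for doing an unsigned
// compare with a signed compare instruction (a sketch; ult_via_sgt is an
// illustrative name):
//
//   bool ult_via_sgt(uint32_t a, uint32_t b) {
//     // a <u b  <=>  (a ^ 0x80000000) <s (b ^ 0x80000000)
//     return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);
//   }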
5557 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
5559 // If the logical-not of the result is required, perform that now.
5561 Result = DAG.getNOT(dl, Result, VT);
5566 // isX86LogicalCmp - Return true if the opcode is an X86 logical comparison.
5567 static bool isX86LogicalCmp(SDValue Op) {
5568 unsigned Opc = Op.getNode()->getOpcode();
5569 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
5571 if (Op.getResNo() == 1 &&
5572 (Opc == X86ISD::ADD ||
5573 Opc == X86ISD::SUB ||
5574 Opc == X86ISD::SMUL ||
5575 Opc == X86ISD::UMUL ||
5576 Opc == X86ISD::INC ||
5577 Opc == X86ISD::DEC))
5583 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
5584 bool addTest = true;
5585 SDValue Cond = Op.getOperand(0);
5586 DebugLoc dl = Op.getDebugLoc();
5589 if (Cond.getOpcode() == ISD::SETCC)
5590 Cond = LowerSETCC(Cond, DAG);
5592 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
5593 // setting operand in place of the X86ISD::SETCC.
5594 if (Cond.getOpcode() == X86ISD::SETCC) {
5595 CC = Cond.getOperand(0);
5597 SDValue Cmp = Cond.getOperand(1);
5598 unsigned Opc = Cmp.getOpcode();
5599 EVT VT = Op.getValueType();
5601 bool IllegalFPCMov = false;
5602 if (VT.isFloatingPoint() && !VT.isVector() &&
5603 !isScalarFPTypeInSSEReg(VT)) // FPStack?
5604 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
5606 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
5607 Opc == X86ISD::BT) { // FIXME
5614 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5615 Cond = EmitTest(Cond, X86::COND_NE, DAG);
5618 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
5619 SmallVector<SDValue, 4> Ops;
5620 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
5621 // condition is true.
5622 Ops.push_back(Op.getOperand(2));
5623 Ops.push_back(Op.getOperand(1));
5625 Ops.push_back(Cond);
5626 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size());
5629 // isAndOrOfSetCCs - Return true if the node is an ISD::AND or
5630 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
5631 // from the AND / OR.
5632 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
5633 Opc = Op.getOpcode();
5634 if (Opc != ISD::OR && Opc != ISD::AND)
5636 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5637 Op.getOperand(0).hasOneUse() &&
5638 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
5639 Op.getOperand(1).hasOneUse());
5642 // isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
5643 // and 1, and that SETCC node has a single use.
5644 static bool isXor1OfSetCC(SDValue Op) {
5645 if (Op.getOpcode() != ISD::XOR)
5647 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5648 if (N1C && N1C->getAPIntValue() == 1) {
5649 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
5650 Op.getOperand(0).hasOneUse();
5655 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5656 bool addTest = true;
5657 SDValue Chain = Op.getOperand(0);
5658 SDValue Cond = Op.getOperand(1);
5659 SDValue Dest = Op.getOperand(2);
5660 DebugLoc dl = Op.getDebugLoc();
5663 if (Cond.getOpcode() == ISD::SETCC)
5664 Cond = LowerSETCC(Cond, DAG);
5666 // FIXME: LowerXALUO doesn't handle these!!
5667 else if (Cond.getOpcode() == X86ISD::ADD ||
5668 Cond.getOpcode() == X86ISD::SUB ||
5669 Cond.getOpcode() == X86ISD::SMUL ||
5670 Cond.getOpcode() == X86ISD::UMUL)
5671 Cond = LowerXALUO(Cond, DAG);
5674 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
5675 // setting operand in place of the X86ISD::SETCC.
5676 if (Cond.getOpcode() == X86ISD::SETCC) {
5677 CC = Cond.getOperand(0);
5679 SDValue Cmp = Cond.getOperand(1);
5680 unsigned Opc = Cmp.getOpcode();
5681 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
5682 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
5686 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
5690 // These can only come from an arithmetic instruction with overflow,
5691 // e.g. SADDO, UADDO.
5692 Cond = Cond.getNode()->getOperand(1);
5699 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
5700 SDValue Cmp = Cond.getOperand(0).getOperand(1);
5701 if (CondOpc == ISD::OR) {
5702 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
5703 // two branches instead of an explicit OR instruction with a
5705 if (Cmp == Cond.getOperand(1).getOperand(1) &&
5706 isX86LogicalCmp(Cmp)) {
5707 CC = Cond.getOperand(0).getOperand(0);
5708 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5709 Chain, Dest, CC, Cmp);
5710 CC = Cond.getOperand(1).getOperand(0);
5714 } else { // ISD::AND
5715 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
5716 // two branches instead of an explicit AND instruction with a
5717 // separate test. However, we only do this if this block doesn't
5718 // have a fall-through edge, because this requires an explicit
5719 // jmp when the condition is false.
5720 if (Cmp == Cond.getOperand(1).getOperand(1) &&
5721 isX86LogicalCmp(Cmp) &&
5722 Op.getNode()->hasOneUse()) {
5723 X86::CondCode CCode =
5724 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5725 CCode = X86::GetOppositeBranchCondition(CCode);
5726 CC = DAG.getConstant(CCode, MVT::i8);
5727 SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
5728 // Look for an unconditional branch following this conditional branch.
5729 // We need this because we need to reverse the successors in order
5730 // to implement FCMP_OEQ.
5731 if (User.getOpcode() == ISD::BR) {
5732 SDValue FalseBB = User.getOperand(1);
5734 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
5735 assert(NewBR == User);
5738 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5739 Chain, Dest, CC, Cmp);
5740 X86::CondCode CCode =
5741 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
5742 CCode = X86::GetOppositeBranchCondition(CCode);
5743 CC = DAG.getConstant(CCode, MVT::i8);
5749 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
5750 // Recognize the "xorb (setcc), 1" pattern; the xor inverts the condition.
5751 // It should be transformed by the DAG combiner except when the condition
5752 // is set by an arithmetic-with-overflow node.
5753 X86::CondCode CCode =
5754 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5755 CCode = X86::GetOppositeBranchCondition(CCode);
5756 CC = DAG.getConstant(CCode, MVT::i8);
5757 Cond = Cond.getOperand(0).getOperand(1);
5763 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5764 Cond = EmitTest(Cond, X86::COND_NE, DAG);
5766 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5767 Chain, Dest, CC, Cond);
5771 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
5772 // Calls to _alloca are needed to probe the stack when allocating more than 4k
5773 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
5774 // that the guard pages used by the OS virtual memory manager are allocated in
5775 // correct sequence.
5777 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
5778 SelectionDAG &DAG) {
5779 assert(Subtarget->isTargetCygMing() &&
5780 "This should be used only on Cygwin/Mingw targets");
5781 DebugLoc dl = Op.getDebugLoc();
5784 SDValue Chain = Op.getOperand(0);
5785 SDValue Size = Op.getOperand(1);
5786 // FIXME: Ensure alignment here
5790 EVT IntPtr = getPointerTy();
5791 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
5793 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
5795 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
5796 Flag = Chain.getValue(1);
5798 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5799 SDValue Ops[] = { Chain,
5800 DAG.getTargetExternalSymbol("_alloca", IntPtr),
5801 DAG.getRegister(X86::EAX, IntPtr),
5802 DAG.getRegister(X86StackPtr, SPTy),
5804 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
5805 Flag = Chain.getValue(1);
5807 Chain = DAG.getCALLSEQ_END(Chain,
5808 DAG.getIntPtrConstant(0, true),
5809 DAG.getIntPtrConstant(0, true),
5812 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
5814 SDValue Ops1[2] = { Chain.getValue(0), Chain };
5815 return DAG.getMergeValues(Ops1, 2, dl);
5819 X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
5821 SDValue Dst, SDValue Src,
5822 SDValue Size, unsigned Align,
5824 uint64_t DstSVOff) {
5825 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5827 // If not DWORD aligned or size is more than the threshold, call the library.
5828 // The libc version is likely to be faster for these cases. It can use the
5829 // address value and run time information about the CPU.
5830 if ((Align & 3) != 0 ||
5832 ConstantSize->getZExtValue() >
5833 getSubtarget()->getMaxInlineSizeThreshold()) {
5834 SDValue InFlag(0, 0);
5836 // Check to see if there is a specialized entry-point for memory zeroing.
5837 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
5839 if (const char *bzeroEntry = V &&
5840 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
5841 EVT IntPtr = getPointerTy();
5842 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
5843 TargetLowering::ArgListTy Args;
5844 TargetLowering::ArgListEntry Entry;
5846 Entry.Ty = IntPtrTy;
5847 Args.push_back(Entry);
5849 Args.push_back(Entry);
5850 std::pair<SDValue,SDValue> CallResult =
5851 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
5852 false, false, false, false,
5853 0, CallingConv::C, false, /*isReturnValueUsed=*/false,
5854 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
5855 return CallResult.second;
5858 // Otherwise have the target-independent code call memset.
5862 uint64_t SizeVal = ConstantSize->getZExtValue();
5863 SDValue InFlag(0, 0);
5866 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
5867 unsigned BytesLeft = 0;
5868 bool TwoRepStos = false;
5871 uint64_t Val = ValC->getZExtValue() & 255;
5873 // If the value is a constant, then we can potentially use larger element sizes for the rep stos.
5874 switch (Align & 3) {
5875 case 2: // WORD aligned
5878 Val = (Val << 8) | Val;
5880 case 0: // DWORD aligned
5883 Val = (Val << 8) | Val;
5884 Val = (Val << 16) | Val;
5885 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
5888 Val = (Val << 32) | Val;
5891 default: // Byte aligned
5894 Count = DAG.getIntPtrConstant(SizeVal);
5898 if (AVT.bitsGT(MVT::i8)) {
5899 unsigned UBytes = AVT.getSizeInBits() / 8;
5900 Count = DAG.getIntPtrConstant(SizeVal / UBytes);
5901 BytesLeft = SizeVal % UBytes;
5904 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
5906 InFlag = Chain.getValue(1);
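// The constant byte-splat computed in the switch above behaves like this
// scalar helper (a sketch; splat_byte is an illustrative name):
//
//   uint64_t splat_byte(uint8_t v) {
//     uint64_t x = v;
//     x |= x << 8;    // 16-bit pattern for "rep stosw"
//     x |= x << 16;   // 32-bit pattern for "rep stosd"
//     x |= x << 32;   // 64-bit pattern for "rep stosq" (64-bit targets)
//     return x;
//   }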
5909 Count = DAG.getIntPtrConstant(SizeVal);
5910 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
5911 InFlag = Chain.getValue(1);
5914 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
5917 InFlag = Chain.getValue(1);
5918 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
5921 InFlag = Chain.getValue(1);
5923 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5924 SmallVector<SDValue, 8> Ops;
5925 Ops.push_back(Chain);
5926 Ops.push_back(DAG.getValueType(AVT));
5927 Ops.push_back(InFlag);
5928 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
5931 InFlag = Chain.getValue(1);
5933 EVT CVT = Count.getValueType();
5934 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
5935 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
5936 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
5939 InFlag = Chain.getValue(1);
5940 Tys = DAG.getVTList(MVT::Other, MVT::Flag);
5942 Ops.push_back(Chain);
5943 Ops.push_back(DAG.getValueType(MVT::i8));
5944 Ops.push_back(InFlag);
5945 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
5946 } else if (BytesLeft) {
5947 // Handle the last 1 - 7 bytes.
5948 unsigned Offset = SizeVal - BytesLeft;
5949 EVT AddrVT = Dst.getValueType();
5950 EVT SizeVT = Size.getValueType();
5952 Chain = DAG.getMemset(Chain, dl,
5953 DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
5954 DAG.getConstant(Offset, AddrVT)),
5956 DAG.getConstant(BytesLeft, SizeVT),
5957 Align, DstSV, DstSVOff + Offset);
5960 // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
5965 X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
5966 SDValue Chain, SDValue Dst, SDValue Src,
5967 SDValue Size, unsigned Align,
5969 const Value *DstSV, uint64_t DstSVOff,
5970 const Value *SrcSV, uint64_t SrcSVOff) {
5971 // This requires the copy size to be a constant, preferably
5972 // within a subtarget-specific limit.
5973 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
5976 uint64_t SizeVal = ConstantSize->getZExtValue();
5977 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
5980 /// If not DWORD aligned, call the library.
5981 if ((Align & 3) != 0)
5986 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
5989 unsigned UBytes = AVT.getSizeInBits() / 8;
5990 unsigned CountVal = SizeVal / UBytes;
5991 SDValue Count = DAG.getIntPtrConstant(CountVal);
5992 unsigned BytesLeft = SizeVal % UBytes;
5994 SDValue InFlag(0, 0);
5995 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
5998 InFlag = Chain.getValue(1);
5999 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6002 InFlag = Chain.getValue(1);
6003 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
6006 InFlag = Chain.getValue(1);
6008 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6009 SmallVector<SDValue, 8> Ops;
6010 Ops.push_back(Chain);
6011 Ops.push_back(DAG.getValueType(AVT));
6012 Ops.push_back(InFlag);
6013 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());
6015 SmallVector<SDValue, 4> Results;
6016 Results.push_back(RepMovs);
6018 // Handle the last 1 - 7 bytes.
6019 unsigned Offset = SizeVal - BytesLeft;
6020 EVT DstVT = Dst.getValueType();
6021 EVT SrcVT = Src.getValueType();
6022 EVT SizeVT = Size.getValueType();
6023 Results.push_back(DAG.getMemcpy(Chain, dl,
6024 DAG.getNode(ISD::ADD, dl, DstVT, Dst,
6025 DAG.getConstant(Offset, DstVT)),
6026 DAG.getNode(ISD::ADD, dl, SrcVT, Src,
6027 DAG.getConstant(Offset, SrcVT)),
6028 DAG.getConstant(BytesLeft, SizeVT),
6029 Align, AlwaysInline,
6030 DstSV, DstSVOff + Offset,
6031 SrcSV, SrcSVOff + Offset));
6034 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6035 &Results[0], Results.size());
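// Rough model of the split performed above, for the 8-byte element case
// (a sketch; inline_memcpy is an illustrative name, not the emitted code):
//
//   #include <string.h>
//   void inline_memcpy(char *d, const char *s, size_t n) {
//     size_t elems = n / 8, left = n % 8;
//     for (size_t i = 0; i != elems; ++i)          // models "rep movsq"
//       memcpy(d + i * 8, s + i * 8, 8);
//     memcpy(d + n - left, s + n - left, left);    // trailing 1 - 7 bytes
//   }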
6038 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
6039 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6040 DebugLoc dl = Op.getDebugLoc();
6042 if (!Subtarget->is64Bit()) {
6043 // vastart just stores the address of the VarArgsFrameIndex slot into the
6044 // memory location argument.
6045 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6046 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
6050 // gp_offset (0 - 6 * 8)
6051 // fp_offset (48 - 48 + 8 * 16)
6052 // overflow_arg_area (points to parameters coming in memory).
6054 SmallVector<SDValue, 8> MemOps;
6055 SDValue FIN = Op.getOperand(1);
6057 SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6058 DAG.getConstant(VarArgsGPOffset, MVT::i32),
6060 MemOps.push_back(Store);
6063 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6064 FIN, DAG.getIntPtrConstant(4));
6065 Store = DAG.getStore(Op.getOperand(0), dl,
6066 DAG.getConstant(VarArgsFPOffset, MVT::i32),
6068 MemOps.push_back(Store);
6070 // Store ptr to overflow_arg_area
6071 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6072 FIN, DAG.getIntPtrConstant(4));
6073 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6074 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
6075 MemOps.push_back(Store);
6077 // Store ptr to reg_save_area.
6078 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6079 FIN, DAG.getIntPtrConstant(8));
6080 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
6081 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
6082 MemOps.push_back(Store);
6083 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6084 &MemOps[0], MemOps.size());
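// For reference, the x86-64 SysV va_list slots written above (a sketch of
// the ABI layout, offsets in bytes):
//
//   struct va_list_entry {
//     unsigned int gp_offset;     //  0: offset of next GP register argument
//     unsigned int fp_offset;     //  4: offset of next FP/SSE register argument
//     void *overflow_arg_area;    //  8: next stack-passed argument
//     void *reg_save_area;        // 16: base of the spilled register arguments
//   };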
6087 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
6088 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6089 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6090 SDValue Chain = Op.getOperand(0);
6091 SDValue SrcPtr = Op.getOperand(1);
6092 SDValue SrcSV = Op.getOperand(2);
6094 llvm_report_error("VAArgInst is not yet implemented for x86-64!");
6098 SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
6099 // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6100 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6101 SDValue Chain = Op.getOperand(0);
6102 SDValue DstPtr = Op.getOperand(1);
6103 SDValue SrcPtr = Op.getOperand(2);
6104 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6105 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6106 DebugLoc dl = Op.getDebugLoc();
6108 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6109 DAG.getIntPtrConstant(24), 8, false,
6110 DstSV, 0, SrcSV, 0);
6114 X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
6115 DebugLoc dl = Op.getDebugLoc();
6116 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6118 default: return SDValue(); // Don't custom lower most intrinsics.
6119 // Comparison intrinsics.
6120 case Intrinsic::x86_sse_comieq_ss:
6121 case Intrinsic::x86_sse_comilt_ss:
6122 case Intrinsic::x86_sse_comile_ss:
6123 case Intrinsic::x86_sse_comigt_ss:
6124 case Intrinsic::x86_sse_comige_ss:
6125 case Intrinsic::x86_sse_comineq_ss:
6126 case Intrinsic::x86_sse_ucomieq_ss:
6127 case Intrinsic::x86_sse_ucomilt_ss:
6128 case Intrinsic::x86_sse_ucomile_ss:
6129 case Intrinsic::x86_sse_ucomigt_ss:
6130 case Intrinsic::x86_sse_ucomige_ss:
6131 case Intrinsic::x86_sse_ucomineq_ss:
6132 case Intrinsic::x86_sse2_comieq_sd:
6133 case Intrinsic::x86_sse2_comilt_sd:
6134 case Intrinsic::x86_sse2_comile_sd:
6135 case Intrinsic::x86_sse2_comigt_sd:
6136 case Intrinsic::x86_sse2_comige_sd:
6137 case Intrinsic::x86_sse2_comineq_sd:
6138 case Intrinsic::x86_sse2_ucomieq_sd:
6139 case Intrinsic::x86_sse2_ucomilt_sd:
6140 case Intrinsic::x86_sse2_ucomile_sd:
6141 case Intrinsic::x86_sse2_ucomigt_sd:
6142 case Intrinsic::x86_sse2_ucomige_sd:
6143 case Intrinsic::x86_sse2_ucomineq_sd: {
6145 ISD::CondCode CC = ISD::SETCC_INVALID;
6148 case Intrinsic::x86_sse_comieq_ss:
6149 case Intrinsic::x86_sse2_comieq_sd:
6153 case Intrinsic::x86_sse_comilt_ss:
6154 case Intrinsic::x86_sse2_comilt_sd:
6158 case Intrinsic::x86_sse_comile_ss:
6159 case Intrinsic::x86_sse2_comile_sd:
6163 case Intrinsic::x86_sse_comigt_ss:
6164 case Intrinsic::x86_sse2_comigt_sd:
6168 case Intrinsic::x86_sse_comige_ss:
6169 case Intrinsic::x86_sse2_comige_sd:
6173 case Intrinsic::x86_sse_comineq_ss:
6174 case Intrinsic::x86_sse2_comineq_sd:
6178 case Intrinsic::x86_sse_ucomieq_ss:
6179 case Intrinsic::x86_sse2_ucomieq_sd:
6180 Opc = X86ISD::UCOMI;
6183 case Intrinsic::x86_sse_ucomilt_ss:
6184 case Intrinsic::x86_sse2_ucomilt_sd:
6185 Opc = X86ISD::UCOMI;
6188 case Intrinsic::x86_sse_ucomile_ss:
6189 case Intrinsic::x86_sse2_ucomile_sd:
6190 Opc = X86ISD::UCOMI;
6193 case Intrinsic::x86_sse_ucomigt_ss:
6194 case Intrinsic::x86_sse2_ucomigt_sd:
6195 Opc = X86ISD::UCOMI;
6198 case Intrinsic::x86_sse_ucomige_ss:
6199 case Intrinsic::x86_sse2_ucomige_sd:
6200 Opc = X86ISD::UCOMI;
6203 case Intrinsic::x86_sse_ucomineq_ss:
6204 case Intrinsic::x86_sse2_ucomineq_sd:
6205 Opc = X86ISD::UCOMI;
6210 SDValue LHS = Op.getOperand(1);
6211 SDValue RHS = Op.getOperand(2);
6212 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6213 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6214 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6215 DAG.getConstant(X86CC, MVT::i8), Cond);
6216 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6218 // ptest intrinsics. The intrinsics these come from are designed to return
6219 // an integer value, not just set EFLAGS, so lower them to the PTEST
6220 // pattern and a SETCC for the result.
6221 case Intrinsic::x86_sse41_ptestz:
6222 case Intrinsic::x86_sse41_ptestc:
6223 case Intrinsic::x86_sse41_ptestnzc:{
6226 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6227 case Intrinsic::x86_sse41_ptestz:
6229 X86CC = X86::COND_E;
6231 case Intrinsic::x86_sse41_ptestc:
6233 X86CC = X86::COND_B;
6235 case Intrinsic::x86_sse41_ptestnzc:
6237 X86CC = X86::COND_A;
6241 SDValue LHS = Op.getOperand(1);
6242 SDValue RHS = Op.getOperand(2);
6243 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6244 SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6245 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6246 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6249 // Fix vector shift instructions where the last operand is a non-immediate i32 value.
6251 case Intrinsic::x86_sse2_pslli_w:
6252 case Intrinsic::x86_sse2_pslli_d:
6253 case Intrinsic::x86_sse2_pslli_q:
6254 case Intrinsic::x86_sse2_psrli_w:
6255 case Intrinsic::x86_sse2_psrli_d:
6256 case Intrinsic::x86_sse2_psrli_q:
6257 case Intrinsic::x86_sse2_psrai_w:
6258 case Intrinsic::x86_sse2_psrai_d:
6259 case Intrinsic::x86_mmx_pslli_w:
6260 case Intrinsic::x86_mmx_pslli_d:
6261 case Intrinsic::x86_mmx_pslli_q:
6262 case Intrinsic::x86_mmx_psrli_w:
6263 case Intrinsic::x86_mmx_psrli_d:
6264 case Intrinsic::x86_mmx_psrli_q:
6265 case Intrinsic::x86_mmx_psrai_w:
6266 case Intrinsic::x86_mmx_psrai_d: {
6267 SDValue ShAmt = Op.getOperand(2);
6268 if (isa<ConstantSDNode>(ShAmt))
6271 unsigned NewIntNo = 0;
6272 EVT ShAmtVT = MVT::v4i32;
6274 case Intrinsic::x86_sse2_pslli_w:
6275 NewIntNo = Intrinsic::x86_sse2_psll_w;
6277 case Intrinsic::x86_sse2_pslli_d:
6278 NewIntNo = Intrinsic::x86_sse2_psll_d;
6280 case Intrinsic::x86_sse2_pslli_q:
6281 NewIntNo = Intrinsic::x86_sse2_psll_q;
6283 case Intrinsic::x86_sse2_psrli_w:
6284 NewIntNo = Intrinsic::x86_sse2_psrl_w;
6286 case Intrinsic::x86_sse2_psrli_d:
6287 NewIntNo = Intrinsic::x86_sse2_psrl_d;
6289 case Intrinsic::x86_sse2_psrli_q:
6290 NewIntNo = Intrinsic::x86_sse2_psrl_q;
6292 case Intrinsic::x86_sse2_psrai_w:
6293 NewIntNo = Intrinsic::x86_sse2_psra_w;
6295 case Intrinsic::x86_sse2_psrai_d:
6296 NewIntNo = Intrinsic::x86_sse2_psra_d;
6299 ShAmtVT = MVT::v2i32;
6301 case Intrinsic::x86_mmx_pslli_w:
6302 NewIntNo = Intrinsic::x86_mmx_psll_w;
6304 case Intrinsic::x86_mmx_pslli_d:
6305 NewIntNo = Intrinsic::x86_mmx_psll_d;
6307 case Intrinsic::x86_mmx_pslli_q:
6308 NewIntNo = Intrinsic::x86_mmx_psll_q;
6310 case Intrinsic::x86_mmx_psrli_w:
6311 NewIntNo = Intrinsic::x86_mmx_psrl_w;
6313 case Intrinsic::x86_mmx_psrli_d:
6314 NewIntNo = Intrinsic::x86_mmx_psrl_d;
6316 case Intrinsic::x86_mmx_psrli_q:
6317 NewIntNo = Intrinsic::x86_mmx_psrl_q;
6319 case Intrinsic::x86_mmx_psrai_w:
6320 NewIntNo = Intrinsic::x86_mmx_psra_w;
6322 case Intrinsic::x86_mmx_psrai_d:
6323 NewIntNo = Intrinsic::x86_mmx_psra_d;
6325 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
6330 EVT VT = Op.getValueType();
6331 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
6332 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
6333 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6334 DAG.getConstant(NewIntNo, MVT::i32),
6335 Op.getOperand(1), ShAmt);
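// The rewrite above corresponds roughly to this source-level transformation
// for the SSE2 case (a sketch using the usual intrinsic names):
//
//   _mm_slli_epi32(v, n)  -->  _mm_sll_epi32(v, _mm_cvtsi32_si128(n))
//
// i.e. a count that is not a compile-time immediate is moved into the low
// element of a vector and the register-count form of the shift is used.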
6340 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6341 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6342 DebugLoc dl = Op.getDebugLoc();
6345 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6347 DAG.getConstant(TD->getPointerSize(),
6348 Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6349 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6350 DAG.getNode(ISD::ADD, dl, getPointerTy(),
6355 // Just load the return address.
6356 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6357 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6358 RetAddrFI, NULL, 0);
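// Frame layout assumed above (a sketch, 32-bit shown; x86-64 uses rbp and
// 8-byte slots):
//
//   [ebp + 4]  return address      <- FrameAddr + getPointerSize()
//   [ebp + 0]  caller's saved ebp  <- next frame, walked for Depth > 0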
6361 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6362 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6363 MFI->setFrameAddressIsTaken(true);
6364 EVT VT = Op.getValueType();
6365 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
6366 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6367 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6368 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6370 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
6374 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6375 SelectionDAG &DAG) {
6376 return DAG.getIntPtrConstant(2*TD->getPointerSize());
6379 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6381 MachineFunction &MF = DAG.getMachineFunction();
6382 SDValue Chain = Op.getOperand(0);
6383 SDValue Offset = Op.getOperand(1);
6384 SDValue Handler = Op.getOperand(2);
6385 DebugLoc dl = Op.getDebugLoc();
6387 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6389 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
6391 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
6392 DAG.getIntPtrConstant(-TD->getPointerSize()));
6393 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
6394 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
6395 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
6396 MF.getRegInfo().addLiveOut(StoreAddrReg);
6398 return DAG.getNode(X86ISD::EH_RETURN, dl,
6400 Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
6403 SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
6404 SelectionDAG &DAG) {
6405 SDValue Root = Op.getOperand(0);
6406 SDValue Trmp = Op.getOperand(1); // trampoline
6407 SDValue FPtr = Op.getOperand(2); // nested function
6408 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
6409 DebugLoc dl = Op.getDebugLoc();
6411 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6413 const X86InstrInfo *TII =
6414 ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
6416 if (Subtarget->is64Bit()) {
6417 SDValue OutChains[6];
6419 // Large code-model.
6421 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r);
6422 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
6424 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
6425 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
6427 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
6429 // Load the pointer to the nested function into R11.
6430 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
6431 SDValue Addr = Trmp;
6432 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6435 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6436 DAG.getConstant(2, MVT::i64));
6437 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
6439 // Load the 'nest' parameter value into R10.
6440 // R10 is specified in X86CallingConv.td
6441 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
6442 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6443 DAG.getConstant(10, MVT::i64));
6444 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6445 Addr, TrmpAddr, 10);
6447 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6448 DAG.getConstant(12, MVT::i64));
6449 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
6451 // Jump to the nested function.
6452 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
6453 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6454 DAG.getConstant(20, MVT::i64));
6455 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
6456 Addr, TrmpAddr, 20);
6458 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
6459 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
6460 DAG.getConstant(22, MVT::i64));
6461 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
6465 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
6466 return DAG.getMergeValues(Ops, 2, dl);
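// Byte layout of the 64-bit trampoline built above (a sketch; offsets match
// the stores, opcode bytes are shown for orientation only):
//
//    0: 49 BB <imm64 FPtr>   movabsq $FPtr, %r11
//   10: 49 BA <imm64 Nest>   movabsq $Nest, %r10
//   20: 49 FF E3             jmpq   *%r11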
6468 const Function *Func =
6469 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
6470 unsigned CC = Func->getCallingConv();
6475 llvm_unreachable("Unsupported calling convention");
6476 case CallingConv::C:
6477 case CallingConv::X86_StdCall: {
6478 // Pass 'nest' parameter in ECX.
6479 // Must be kept in sync with X86CallingConv.td
6482 // Check that ECX wasn't needed by an 'inreg' parameter.
6483 const FunctionType *FTy = Func->getFunctionType();
6484 const AttrListPtr &Attrs = Func->getAttributes();
6486 if (!Attrs.isEmpty() && !Func->isVarArg()) {
6487 unsigned InRegCount = 0;
6490 for (FunctionType::param_iterator I = FTy->param_begin(),
6491 E = FTy->param_end(); I != E; ++I, ++Idx)
6492 if (Attrs.paramHasAttr(Idx, Attribute::InReg))
6493 // FIXME: should only count parameters that are lowered to integers.
6494 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
6496 if (InRegCount > 2) {
6497 llvm_report_error("Nest register in use - reduce number of inreg parameters!");
6502 case CallingConv::X86_FastCall:
6503 case CallingConv::Fast:
6504 // Pass 'nest' parameter in EAX.
6505 // Must be kept in sync with X86CallingConv.td
6510 SDValue OutChains[4];
6513 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6514 DAG.getConstant(10, MVT::i32));
6515 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
6517 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
6518 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
6519 OutChains[0] = DAG.getStore(Root, dl,
6520 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
6523 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6524 DAG.getConstant(1, MVT::i32));
6525 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
6527 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
6528 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6529 DAG.getConstant(5, MVT::i32));
6530 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
6531 TrmpAddr, 5, false, 1);
6533 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6534 DAG.getConstant(6, MVT::i32));
6535 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
6538 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
6539 return DAG.getMergeValues(Ops, 2, dl);
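// Byte layout of the 32-bit trampoline built above (a sketch):
//
//   0: B8+reg <imm32 Nest>   movl $Nest, %ecx (or %eax for fastcc)
//   5: E9     <rel32 Disp>   jmp  FPtr, with Disp = FPtr - (Trmp + 10)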
6543 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
6545 The rounding mode is in bits 11:10 of FPSR, and has the following
6552 FLT_ROUNDS, on the other hand, expects the following:
6559 To perform the conversion, we do:
6560 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
6563 MachineFunction &MF = DAG.getMachineFunction();
6564 const TargetMachine &TM = MF.getTarget();
6565 const TargetFrameInfo &TFI = *TM.getFrameInfo();
6566 unsigned StackAlignment = TFI.getStackAlignment();
6567 EVT VT = Op.getValueType();
6568 DebugLoc dl = Op.getDebugLoc();
6570 // Save FP Control Word to stack slot
6571 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
6572 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6574 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
6575 DAG.getEntryNode(), StackSlot);
6577 // Load FP Control Word from stack slot
6578 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
6580 // Transform as necessary
6582 DAG.getNode(ISD::SRL, dl, MVT::i16,
6583 DAG.getNode(ISD::AND, dl, MVT::i16,
6584 CWD, DAG.getConstant(0x800, MVT::i16)),
6585 DAG.getConstant(11, MVT::i8));
6587 DAG.getNode(ISD::SRL, dl, MVT::i16,
6588 DAG.getNode(ISD::AND, dl, MVT::i16,
6589 CWD, DAG.getConstant(0x400, MVT::i16)),
6590 DAG.getConstant(9, MVT::i8));
6593 DAG.getNode(ISD::AND, dl, MVT::i16,
6594 DAG.getNode(ISD::ADD, dl, MVT::i16,
6595 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
6596 DAG.getConstant(1, MVT::i16)),
6597 DAG.getConstant(3, MVT::i16));
6600 return DAG.getNode((VT.getSizeInBits() < 16 ?
6601 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
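// A quick check of the conversion above for the four RC encodings (a sketch;
// flt_rounds_from_cw is an illustrative name):
//
//   unsigned flt_rounds_from_cw(unsigned cw) {
//     return ((((cw & 0x800) >> 11) | ((cw & 0x400) >> 9)) + 1) & 3;
//   }
//   // RC=00 (nearest) -> 1,  RC=01 (-inf) -> 3,
//   // RC=10 (+inf)    -> 2,  RC=11 (zero) -> 0, matching FLT_ROUNDS.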
6604 SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
6605 EVT VT = Op.getValueType();
6607 unsigned NumBits = VT.getSizeInBits();
6608 DebugLoc dl = Op.getDebugLoc();
6610 Op = Op.getOperand(0);
6611 if (VT == MVT::i8) {
6612 // Zero extend to i32 since there is no i8 bsr.
6614 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6617 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
6618 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6619 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
6621 // If src is zero (i.e. bsr sets ZF), returns NumBits.
6622 SmallVector<SDValue, 4> Ops;
6624 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
6625 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6626 Ops.push_back(Op.getValue(1));
6627 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6629 // Finally xor with NumBits-1.
6630 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
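// In scalar terms the BSR-based sequence above computes (a sketch; ctlz32 is
// an illustrative name, shown for the i32 case):
//
//   unsigned ctlz32(uint32_t x) {
//     // BSR yields the index of the highest set bit and is undefined for
//     // x == 0; the CMOV above substitutes 2*NumBits-1 = 63 so that the
//     // final XOR yields 63 ^ 31 == 32 for that case.
//     unsigned idx = x ? 31 - (unsigned)__builtin_clz(x) : 63;
//     return idx ^ 31;
//   }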
6633 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6637 SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
6638 EVT VT = Op.getValueType();
6640 unsigned NumBits = VT.getSizeInBits();
6641 DebugLoc dl = Op.getDebugLoc();
6643 Op = Op.getOperand(0);
6644 if (VT == MVT::i8) {
6646 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
6649 // Issue a bsf (scan bits forward) which also sets EFLAGS.
6650 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
6651 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
6653 // If src is zero (i.e. bsf sets ZF), returns NumBits.
6654 SmallVector<SDValue, 4> Ops;
6656 Ops.push_back(DAG.getConstant(NumBits, OpVT));
6657 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6658 Ops.push_back(Op.getValue(1));
6659 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6661 if (VT == MVT::i8)
6662 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6663 return Op;
6664 }
6666 SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
6667 EVT VT = Op.getValueType();
6668 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
6669 DebugLoc dl = Op.getDebugLoc();
6671 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
6672 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
6673 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
6674 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
6675 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
6677 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
6678 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
6679 // return AloBlo + AloBhi + AhiBlo;
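6680 // SSE2 has no 64x64-bit element multiply, so the product is assembled from 32x32->64 pmuludq partial products.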
6681 SDValue A = Op.getOperand(0);
6682 SDValue B = Op.getOperand(1);
6684 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6685 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6686 A, DAG.getConstant(32, MVT::i32));
6687 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6688 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6689 B, DAG.getConstant(32, MVT::i32));
6690 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6691 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6692 A, B);
6693 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6694 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6695 A, Bhi);
6696 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6697 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6698 Ahi, B);
6699 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6700 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6701 AloBhi, DAG.getConstant(32, MVT::i32));
6702 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6703 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6704 AhiBlo, DAG.getConstant(32, MVT::i32));
6705 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
6706 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
6707 return Res;
6708 }
6711 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
6712 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
6713 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
6714 // looks for this combo and may remove the "setcc" instruction if the "setcc"
6715 // has only one use.
6716 SDNode *N = Op.getNode();
6717 SDValue LHS = N->getOperand(0);
6718 SDValue RHS = N->getOperand(1);
6719 unsigned BaseOp = 0;
6720 unsigned Cond = 0;
6721 DebugLoc dl = Op.getDebugLoc();
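6722 // Pick the x86 arithmetic node and the EFLAGS condition that signals its overflow.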
6723 switch (Op.getOpcode()) {
6724 default: llvm_unreachable("Unknown ovf instruction!");
6725 case ISD::SADDO:
6726 // An add of one will be selected as an INC. Note that INC doesn't
6727 // set CF, so we can't do this for UADDO.
6728 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6729 if (C->getAPIntValue() == 1) {
6730 BaseOp = X86ISD::INC;
6731 Cond = X86::COND_O;
6732 break;
6733 }
6734 BaseOp = X86ISD::ADD;
6735 Cond = X86::COND_O;
6736 break;
6737 case ISD::UADDO:
6738 BaseOp = X86ISD::ADD;
6739 Cond = X86::COND_B;
6740 break;
6741 case ISD::SSUBO:
6742 // A subtract of one will be selected as a DEC. Note that DEC doesn't
6743 // set CF, so we can't do this for USUBO.
6744 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6745 if (C->getAPIntValue() == 1) {
6746 BaseOp = X86ISD::DEC;
6747 Cond = X86::COND_O;
6748 break;
6749 }
6750 BaseOp = X86ISD::SUB;
6751 Cond = X86::COND_O;
6752 break;
6753 case ISD::USUBO:
6754 BaseOp = X86ISD::SUB;
6755 Cond = X86::COND_B;
6756 break;
6757 case ISD::SMULO:
6758 BaseOp = X86ISD::SMUL;
6759 Cond = X86::COND_O;
6760 break;
6761 case ISD::UMULO:
6762 BaseOp = X86ISD::UMUL;
6763 Cond = X86::COND_B;
6764 break;
6765 }
6767 // Also sets EFLAGS.
6768 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
6769 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
6771 SDValue SetCC =
6772 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
6773 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
6775 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
6776 return Sum;
6777 }
6779 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6780 EVT T = Op.getValueType();
6781 DebugLoc dl = Op.getDebugLoc();
6782 unsigned Reg = 0;
6783 unsigned size = 0;
6784 switch(T.getSimpleVT().SimpleTy) {
6785 default:
6786 assert(false && "Invalid value type!");
6787 case MVT::i8: Reg = X86::AL; size = 1; break;
6788 case MVT::i16: Reg = X86::AX; size = 2; break;
6789 case MVT::i32: Reg = X86::EAX; size = 4; break;
6790 case MVT::i64:
6791 assert(Subtarget->is64Bit() && "Node not type legal!");
6792 Reg = X86::RAX; size = 8;
6793 break;
6794 }
6795 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6796 Op.getOperand(2), SDValue());
6797 SDValue Ops[] = { cpIn.getValue(0),
6798 Op.getOperand(1),
6799 Op.getOperand(3),
6800 DAG.getTargetConstant(size, MVT::i8),
6801 cpIn.getValue(1) };
6802 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6803 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
6804 SDValue cpOut =
6805 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
6806 return cpOut;
6807 }
6809 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
6810 SelectionDAG &DAG) {
6811 assert(Subtarget->is64Bit() && "Result not type legalized?");
6812 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6813 SDValue TheChain = Op.getOperand(0);
6814 DebugLoc dl = Op.getDebugLoc();
6815 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6816 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
6817 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
6818 rax.getValue(2));
6819 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
6820 DAG.getConstant(32, MVT::i8));
6821 SDValue Ops[] = {
6822 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
6823 rdx.getValue(1)
6824 };
6825 return DAG.getMergeValues(Ops, 2, dl);
6826 }
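6827 // x86 has no fetch-and-sub, so ATOMIC_LOAD_SUB is lowered below as an atomic add of the negated operand.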
6828 SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6829 SDNode *Node = Op.getNode();
6830 DebugLoc dl = Node->getDebugLoc();
6831 EVT T = Node->getValueType(0);
6832 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
6833 DAG.getConstant(0, T), Node->getOperand(2));
6834 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
6835 cast<AtomicSDNode>(Node)->getMemoryVT(),
6836 Node->getOperand(0),
6837 Node->getOperand(1), negOp,
6838 cast<AtomicSDNode>(Node)->getSrcValue(),
6839 cast<AtomicSDNode>(Node)->getAlignment());
6842 /// LowerOperation - Provide custom lowering hooks for some operations.
6844 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
6845 switch (Op.getOpcode()) {
6846 default: llvm_unreachable("Should not custom lower this!");
6847 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
6848 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
6849 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
6850 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
6851 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6852 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
6853 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
6854 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
6855 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
6856 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
6857 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
6858 case ISD::SHL_PARTS:
6859 case ISD::SRA_PARTS:
6860 case ISD::SRL_PARTS: return LowerShift(Op, DAG);
6861 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
6862 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
6863 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
6864 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
6865 case ISD::FABS: return LowerFABS(Op, DAG);
6866 case ISD::FNEG: return LowerFNEG(Op, DAG);
6867 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
6868 case ISD::SETCC: return LowerSETCC(Op, DAG);
6869 case ISD::VSETCC: return LowerVSETCC(Op, DAG);
6870 case ISD::SELECT: return LowerSELECT(Op, DAG);
6871 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
6872 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
6873 case ISD::VASTART: return LowerVASTART(Op, DAG);
6874 case ISD::VAARG: return LowerVAARG(Op, DAG);
6875 case ISD::VACOPY: return LowerVACOPY(Op, DAG);
6876 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6877 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
6878 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
6879 case ISD::FRAME_TO_ARGS_OFFSET:
6880 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
6881 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
6882 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
6883 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
6884 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
6885 case ISD::CTLZ: return LowerCTLZ(Op, DAG);
6886 case ISD::CTTZ: return LowerCTTZ(Op, DAG);
6887 case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
6888 case ISD::SADDO:
6889 case ISD::UADDO:
6890 case ISD::SSUBO:
6891 case ISD::USUBO:
6892 case ISD::SMULO:
6893 case ISD::UMULO: return LowerXALUO(Op, DAG);
6894 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
6895 }
6896 }
6898 void X86TargetLowering::
6899 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
6900 SelectionDAG &DAG, unsigned NewOp) {
6901 EVT T = Node->getValueType(0);
6902 DebugLoc dl = Node->getDebugLoc();
6903 assert (T == MVT::i64 && "Only know how to expand i64 atomics");
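6904 // The i64 operand is split into 32-bit halves; the target node built here is later expanded into a cmpxchg8b loop.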
6905 SDValue Chain = Node->getOperand(0);
6906 SDValue In1 = Node->getOperand(1);
6907 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6908 Node->getOperand(2), DAG.getIntPtrConstant(0));
6909 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6910 Node->getOperand(2), DAG.getIntPtrConstant(1));
6911 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
6912 // have a MemOperand. Pass the info through as a normal operand.
6913 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
6914 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
6915 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
6916 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
6917 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
6918 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6919 Results.push_back(Result.getValue(2));
6922 /// ReplaceNodeResults - Replace a node with an illegal result type
6923 /// with a new node built out of custom code.
6924 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
6925 SmallVectorImpl<SDValue>&Results,
6926 SelectionDAG &DAG) {
6927 DebugLoc dl = N->getDebugLoc();
6928 switch (N->getOpcode()) {
6929 default:
6930 assert(false && "Do not know how to custom type legalize this operation!");
6931 return;
6932 case ISD::FP_TO_SINT: {
6933 std::pair<SDValue,SDValue> Vals =
6934 FP_TO_INTHelper(SDValue(N, 0), DAG, true);
6935 SDValue FIST = Vals.first, StackSlot = Vals.second;
6936 if (FIST.getNode() != 0) {
6937 EVT VT = N->getValueType(0);
6938 // Return a load from the stack slot.
6939 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
6940 }
6941 return;
6942 }
6943 case ISD::READCYCLECOUNTER: {
6944 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6945 SDValue TheChain = N->getOperand(0);
6946 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6947 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
6948 rd.getValue(1));
6949 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
6950 eax.getValue(2));
6951 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
6952 SDValue Ops[] = { eax, edx };
6953 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
6954 Results.push_back(edx.getValue(1));
6955 return;
6956 }
6957 case ISD::ATOMIC_CMP_SWAP: {
6958 EVT T = N->getValueType(0);
6959 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
6960 SDValue cpInL, cpInH;
6961 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6962 DAG.getConstant(0, MVT::i32));
6963 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
6964 DAG.getConstant(1, MVT::i32));
6965 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
6966 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
6967 cpInL.getValue(1));
6968 SDValue swapInL, swapInH;
6969 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6970 DAG.getConstant(0, MVT::i32));
6971 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
6972 DAG.getConstant(1, MVT::i32));
6973 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
6974 cpInH.getValue(1));
6975 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
6976 swapInL.getValue(1));
6977 SDValue Ops[] = { swapInH.getValue(0),
6978 N->getOperand(1),
6979 swapInH.getValue(1) };
6980 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6981 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
6982 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
6983 MVT::i32, Result.getValue(1));
6984 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
6985 MVT::i32, cpOutL.getValue(2));
6986 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
6987 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
6988 Results.push_back(cpOutH.getValue(1));
6989 return;
6990 }
6991 case ISD::ATOMIC_LOAD_ADD:
6992 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
6993 return;
6994 case ISD::ATOMIC_LOAD_AND:
6995 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
6996 return;
6997 case ISD::ATOMIC_LOAD_NAND:
6998 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
6999 return;
7000 case ISD::ATOMIC_LOAD_OR:
7001 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7002 return;
7003 case ISD::ATOMIC_LOAD_SUB:
7004 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7005 return;
7006 case ISD::ATOMIC_LOAD_XOR:
7007 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7008 return;
7009 case ISD::ATOMIC_SWAP:
7010 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7011 return;
7012 }
7013 }
7015 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7016 switch (Opcode) {
7017 default: return NULL;
7018 case X86ISD::BSF: return "X86ISD::BSF";
7019 case X86ISD::BSR: return "X86ISD::BSR";
7020 case X86ISD::SHLD: return "X86ISD::SHLD";
7021 case X86ISD::SHRD: return "X86ISD::SHRD";
7022 case X86ISD::FAND: return "X86ISD::FAND";
7023 case X86ISD::FOR: return "X86ISD::FOR";
7024 case X86ISD::FXOR: return "X86ISD::FXOR";
7025 case X86ISD::FSRL: return "X86ISD::FSRL";
7026 case X86ISD::FILD: return "X86ISD::FILD";
7027 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
7028 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7029 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7030 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7031 case X86ISD::FLD: return "X86ISD::FLD";
7032 case X86ISD::FST: return "X86ISD::FST";
7033 case X86ISD::CALL: return "X86ISD::CALL";
7034 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
7035 case X86ISD::BT: return "X86ISD::BT";
7036 case X86ISD::CMP: return "X86ISD::CMP";
7037 case X86ISD::COMI: return "X86ISD::COMI";
7038 case X86ISD::UCOMI: return "X86ISD::UCOMI";
7039 case X86ISD::SETCC: return "X86ISD::SETCC";
7040 case X86ISD::CMOV: return "X86ISD::CMOV";
7041 case X86ISD::BRCOND: return "X86ISD::BRCOND";
7042 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
7043 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
7044 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
7045 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
7046 case X86ISD::Wrapper: return "X86ISD::Wrapper";
7047 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
7048 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
7049 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
7050 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
7051 case X86ISD::PINSRB: return "X86ISD::PINSRB";
7052 case X86ISD::PINSRW: return "X86ISD::PINSRW";
7053 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
7054 case X86ISD::FMAX: return "X86ISD::FMAX";
7055 case X86ISD::FMIN: return "X86ISD::FMIN";
7056 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
7057 case X86ISD::FRCP: return "X86ISD::FRCP";
7058 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
7059 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7060 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
7061 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
7062 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
7063 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
7064 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
7065 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
7066 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
7067 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
7068 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
7069 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
7070 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
7071 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
7072 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
7073 case X86ISD::VSHL: return "X86ISD::VSHL";
7074 case X86ISD::VSRL: return "X86ISD::VSRL";
7075 case X86ISD::CMPPD: return "X86ISD::CMPPD";
7076 case X86ISD::CMPPS: return "X86ISD::CMPPS";
7077 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
7078 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
7079 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
7080 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
7081 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
7082 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
7083 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
7084 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
7085 case X86ISD::ADD: return "X86ISD::ADD";
7086 case X86ISD::SUB: return "X86ISD::SUB";
7087 case X86ISD::SMUL: return "X86ISD::SMUL";
7088 case X86ISD::UMUL: return "X86ISD::UMUL";
7089 case X86ISD::INC: return "X86ISD::INC";
7090 case X86ISD::DEC: return "X86ISD::DEC";
7091 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
7092 case X86ISD::PTEST: return "X86ISD::PTEST";
7093 }
7094 }
7096 // isLegalAddressingMode - Return true if the addressing mode represented
7097 // by AM is legal for this target, for a load/store of the specified type.
7098 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7099 const Type *Ty) const {
7100 // X86 supports extremely general addressing modes.
7101 CodeModel::Model M = getTargetMachine().getCodeModel();
7103 // X86 allows a sign-extended 32-bit immediate field as a displacement.
7104 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7105 return false;
7107 if (AM.BaseGV) {
7108 unsigned GVFlags =
7109 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7111 // If a reference to this global requires an extra load, we can't fold it.
7112 if (isGlobalStubReference(GVFlags))
7113 return false;
7115 // If BaseGV requires a register for the PIC base, we cannot also have a
7116 // BaseReg specified.
7117 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7118 return false;
7120 // If lower 4G is not available, then we must use rip-relative addressing.
7121 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7122 return false;
7123 }
7125 switch (AM.Scale) {
7126 case 0:
7127 case 1:
7128 case 2:
7129 case 4:
7130 case 8:
7131 // These scales always work.
7132 break;
7133 case 3:
7134 case 5:
7135 case 9:
7136 // These scales are formed with basereg+scalereg. Only accept if there is
7137 // no basereg yet.
7138 if (AM.HasBaseReg)
7139 return false;
7140 break;
7141 default: // Other stuff never works.
7142 return false;
7143 }
7145 return true;
7146 }
7149 bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7150 if (!Ty1->isInteger() || !Ty2->isInteger())
7151 return false;
7152 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7153 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7154 if (NumBits1 <= NumBits2)
7155 return false;
7156 return Subtarget->is64Bit() || NumBits1 < 64;
7157 }
7159 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7160 if (!VT1.isInteger() || !VT2.isInteger())
7161 return false;
7162 unsigned NumBits1 = VT1.getSizeInBits();
7163 unsigned NumBits2 = VT2.getSizeInBits();
7164 if (NumBits1 <= NumBits2)
7165 return false;
7166 return Subtarget->is64Bit() || NumBits1 < 64;
7167 }
7169 bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7170 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7171 return Ty1 == Type::getInt32Ty(Ty1->getContext()) &&
7172 Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit();
7175 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7176 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7177 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7180 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7181 // i16 instructions are longer (0x66 prefix) and potentially slower.
7182 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7185 /// isShuffleMaskLegal - Targets can use this to indicate that they only
7186 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7187 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7188 /// are assumed to be legal.
7190 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7192 // Only do shuffles on 128-bit vector types for now.
7193 if (VT.getSizeInBits() == 64)
7194 return false;
7196 // FIXME: pshufb, blends, palignr, shifts.
7197 return (VT.getVectorNumElements() == 2 ||
7198 ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7199 isMOVLMask(M, VT) ||
7200 isSHUFPMask(M, VT) ||
7201 isPSHUFDMask(M, VT) ||
7202 isPSHUFHWMask(M, VT) ||
7203 isPSHUFLWMask(M, VT) ||
7204 isUNPCKLMask(M, VT) ||
7205 isUNPCKHMask(M, VT) ||
7206 isUNPCKL_v_undef_Mask(M, VT) ||
7207 isUNPCKH_v_undef_Mask(M, VT));
7211 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7213 unsigned NumElts = VT.getVectorNumElements();
7214 // FIXME: This collection of masks seems suspect.
7215 if (NumElts == 2)
7216 return true;
7217 if (NumElts == 4 && VT.getSizeInBits() == 128) {
7218 return (isMOVLMask(Mask, VT) ||
7219 isCommutedMOVLMask(Mask, VT, true) ||
7220 isSHUFPMask(Mask, VT) ||
7221 isCommutedSHUFPMask(Mask, VT));
7226 //===----------------------------------------------------------------------===//
7227 // X86 Scheduler Hooks
7228 //===----------------------------------------------------------------------===//
7230 // private utility function
7231 MachineBasicBlock *
7232 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7233 MachineBasicBlock *MBB,
7234 unsigned regOpc,
7235 unsigned immOpc,
7236 unsigned LoadOpc,
7237 unsigned CXchgOpc,
7238 unsigned copyOpc,
7239 unsigned notOpc,
7240 unsigned EAXreg,
7241 TargetRegisterClass *RC,
7242 bool invSrc) const {
7243 // For the atomic bitwise operator, we generate
7246 // ld t1 = [bitinstr.addr]
7247 // op t2 = t1, [bitinstr.val]
7249 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7251 // fallthrough -->nextMBB
7252 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7253 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7254 MachineFunction::iterator MBBIter = MBB;
7255 ++MBBIter;
7257 /// First build the CFG
7258 MachineFunction *F = MBB->getParent();
7259 MachineBasicBlock *thisMBB = MBB;
7260 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7261 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7262 F->insert(MBBIter, newMBB);
7263 F->insert(MBBIter, nextMBB);
7265 // Move all successors to thisMBB to nextMBB
7266 nextMBB->transferSuccessors(thisMBB);
7268 // Update thisMBB to fall through to newMBB
7269 thisMBB->addSuccessor(newMBB);
7271 // newMBB jumps to itself and fall through to nextMBB
7272 newMBB->addSuccessor(nextMBB);
7273 newMBB->addSuccessor(newMBB);
7275 // Insert instructions into newMBB based on incoming instruction
7276 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7277 "unexpected number of operands");
7278 DebugLoc dl = bInstr->getDebugLoc();
7279 MachineOperand& destOper = bInstr->getOperand(0);
7280 MachineOperand* argOpers[2 + X86AddrNumOperands];
7281 int numArgs = bInstr->getNumOperands() - 1;
7282 for (int i=0; i < numArgs; ++i)
7283 argOpers[i] = &bInstr->getOperand(i+1);
7285 // x86 address has 4 operands: base, index, scale, and displacement
7286 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7287 int valArgIndx = lastAddrIndx + 1;
7289 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7290 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7291 for (int i=0; i <= lastAddrIndx; ++i)
7292 (*MIB).addOperand(*argOpers[i]);
7294 unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7295 if (invSrc) {
7296 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7297 }
7298 else
7299 tt = t1;
7301 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7302 assert((argOpers[valArgIndx]->isReg() ||
7303 argOpers[valArgIndx]->isImm()) &&
7305 if (argOpers[valArgIndx]->isReg())
7306 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7307 else
7308 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7309 MIB.addReg(tt);
7310 (*MIB).addOperand(*argOpers[valArgIndx]);
7312 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7313 MIB.addReg(t1);
7315 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7316 for (int i=0; i <= lastAddrIndx; ++i)
7317 (*MIB).addOperand(*argOpers[i]);
7318 MIB.addReg(t2);
7319 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7320 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7322 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7323 MIB.addReg(EAXreg);
7326 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7328 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7329 return nextMBB;
7330 }
7332 // private utility function: 64 bit atomics on 32 bit host.
7333 MachineBasicBlock *
7334 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7335 MachineBasicBlock *MBB,
7336 unsigned regOpcL,
7337 unsigned regOpcH,
7338 unsigned immOpcL,
7339 unsigned immOpcH,
7340 bool invSrc) const {
7341 // For the atomic bitwise operator, we generate
7342 // thisMBB (instructions are in pairs, except cmpxchg8b)
7343 // ld t1,t2 = [bitinstr.addr]
7345 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7346 // op t5, t6 <- out1, out2, [bitinstr.val]
7347 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
7348 // mov ECX, EBX <- t5, t6
7349 // mov EAX, EDX <- t1, t2
7350 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
7351 // mov t3, t4 <- EAX, EDX
7353 // result in out1, out2
7354 // fallthrough -->nextMBB
7356 const TargetRegisterClass *RC = X86::GR32RegisterClass;
7357 const unsigned LoadOpc = X86::MOV32rm;
7358 const unsigned copyOpc = X86::MOV32rr;
7359 const unsigned NotOpc = X86::NOT32r;
7360 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7361 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7362 MachineFunction::iterator MBBIter = MBB;
7363 ++MBBIter;
7365 /// First build the CFG
7366 MachineFunction *F = MBB->getParent();
7367 MachineBasicBlock *thisMBB = MBB;
7368 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7369 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7370 F->insert(MBBIter, newMBB);
7371 F->insert(MBBIter, nextMBB);
7373 // Move all successors to thisMBB to nextMBB
7374 nextMBB->transferSuccessors(thisMBB);
7376 // Update thisMBB to fall through to newMBB
7377 thisMBB->addSuccessor(newMBB);
7379 // newMBB jumps to itself and fall through to nextMBB
7380 newMBB->addSuccessor(nextMBB);
7381 newMBB->addSuccessor(newMBB);
7383 DebugLoc dl = bInstr->getDebugLoc();
7384 // Insert instructions into newMBB based on incoming instruction
7385 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7386 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
7387 "unexpected number of operands");
7388 MachineOperand& dest1Oper = bInstr->getOperand(0);
7389 MachineOperand& dest2Oper = bInstr->getOperand(1);
7390 MachineOperand* argOpers[2 + X86AddrNumOperands];
7391 for (int i=0; i < 2 + X86AddrNumOperands; ++i)
7392 argOpers[i] = &bInstr->getOperand(i+2);
7394 // x86 address has 4 operands: base, index, scale, and displacement
7395 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7397 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7398 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
7399 for (int i=0; i <= lastAddrIndx; ++i)
7400 (*MIB).addOperand(*argOpers[i]);
7401 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7402 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
7403 // add 4 to displacement.
7404 for (int i=0; i <= lastAddrIndx-2; ++i)
7405 (*MIB).addOperand(*argOpers[i]);
7406 MachineOperand newOp3 = *(argOpers[3]);
7407 if (newOp3.isImm())
7408 newOp3.setImm(newOp3.getImm()+4);
7409 else
7410 newOp3.setOffset(newOp3.getOffset()+4);
7411 (*MIB).addOperand(newOp3);
7412 (*MIB).addOperand(*argOpers[lastAddrIndx]);
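7413 // t1 and t2 now hold the low and high 32-bit halves of the current memory value.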
7414 // t3/4 are defined later, at the bottom of the loop
7415 unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
7416 unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
7417 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
7418 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
7419 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
7420 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
7422 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
7423 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
7424 if (invSrc) {
7425 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
7426 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
7427 } else {
7428 tt1 = t1;
7429 tt2 = t2;
7430 }
7432 int valArgIndx = lastAddrIndx + 1;
7433 assert((argOpers[valArgIndx]->isReg() ||
7434 argOpers[valArgIndx]->isImm()) &&
7436 unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
7437 unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
7438 if (argOpers[valArgIndx]->isReg())
7439 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
7440 else
7441 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
7442 if (regOpcL != X86::MOV32rr)
7443 MIB.addReg(tt1);
7444 (*MIB).addOperand(*argOpers[valArgIndx]);
7445 assert(argOpers[valArgIndx + 1]->isReg() ==
7446 argOpers[valArgIndx]->isReg());
7447 assert(argOpers[valArgIndx + 1]->isImm() ==
7448 argOpers[valArgIndx]->isImm());
7449 if (argOpers[valArgIndx + 1]->isReg())
7450 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
7451 else
7452 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
7453 if (regOpcH != X86::MOV32rr)
7454 MIB.addReg(tt2);
7455 (*MIB).addOperand(*argOpers[valArgIndx + 1]);
7457 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
7458 MIB.addReg(t1);
7459 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
7460 MIB.addReg(t2);
7462 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
7463 MIB.addReg(t5);
7464 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
7465 MIB.addReg(t6);
7467 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
7468 for (int i=0; i <= lastAddrIndx; ++i)
7469 (*MIB).addOperand(*argOpers[i]);
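7470 // cmpxchg8b compares EDX:EAX with the memory operand and, on a match, stores ECX:EBX.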
7471 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7472 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7474 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7475 MIB.addReg(X86::EAX);
7476 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7477 MIB.addReg(X86::EDX);
7480 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7482 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7483 return nextMBB;
7484 }
7486 // private utility function
7488 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7489 MachineBasicBlock *MBB,
7490 unsigned cmovOpc) const {
7491 // For the atomic min/max operator, we generate
7492 // thisMBB:
7493 // newMBB:
7494 // ld t1 = [min/max.addr]
7495 // mov t2 = [min/max.val]
7496 // cmp t1, t2
7497 // cmov[cond] t2 = t1
7498 // mov EAX = t1
7499 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7500 // bz newMBB
7501 // fallthrough -->nextMBB
7503 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7504 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7505 MachineFunction::iterator MBBIter = MBB;
7506 ++MBBIter;
7508 /// First build the CFG
7509 MachineFunction *F = MBB->getParent();
7510 MachineBasicBlock *thisMBB = MBB;
7511 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7512 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7513 F->insert(MBBIter, newMBB);
7514 F->insert(MBBIter, nextMBB);
7516 // Move all successors to thisMBB to nextMBB
7517 nextMBB->transferSuccessors(thisMBB);
7519 // Update thisMBB to fall through to newMBB
7520 thisMBB->addSuccessor(newMBB);
7522 // newMBB jumps to newMBB and fall through to nextMBB
7523 newMBB->addSuccessor(nextMBB);
7524 newMBB->addSuccessor(newMBB);
7526 DebugLoc dl = mInstr->getDebugLoc();
7527 // Insert instructions into newMBB based on incoming instruction
7528 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7529 "unexpected number of operands");
7530 MachineOperand& destOper = mInstr->getOperand(0);
7531 MachineOperand* argOpers[2 + X86AddrNumOperands];
7532 int numArgs = mInstr->getNumOperands() - 1;
7533 for (int i=0; i < numArgs; ++i)
7534 argOpers[i] = &mInstr->getOperand(i+1);
7536 // x86 address has 4 operands: base, index, scale, and displacement
7537 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7538 int valArgIndx = lastAddrIndx + 1;
7540 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7541 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7542 for (int i=0; i <= lastAddrIndx; ++i)
7543 (*MIB).addOperand(*argOpers[i]);
7545 // We only support register and immediate values
7546 assert((argOpers[valArgIndx]->isReg() ||
7547 argOpers[valArgIndx]->isImm()) &&
7550 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7551 if (argOpers[valArgIndx]->isReg())
7552 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7553 else
7554 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7555 (*MIB).addOperand(*argOpers[valArgIndx]);
7557 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7558 MIB.addReg(t1);
7560 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7561 MIB.addReg(t1);
7562 MIB.addReg(t2);
7565 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7566 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
7567 MIB.addReg(t2);
7568 MIB.addReg(t1);
7570 // Cmp and exchange if none has modified the memory location
7571 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
7572 for (int i=0; i <= lastAddrIndx; ++i)
7573 (*MIB).addOperand(*argOpers[i]);
7575 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7576 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
7578 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
7579 MIB.addReg(X86::EAX);
7582 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7584 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
7585 return nextMBB;
7586 }
7589 MachineBasicBlock *
7590 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7591 MachineBasicBlock *BB) const {
7592 DebugLoc dl = MI->getDebugLoc();
7593 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7594 switch (MI->getOpcode()) {
7595 default: assert(false && "Unexpected instr type to insert");
7596 case X86::CMOV_V1I64:
7597 case X86::CMOV_FR32:
7598 case X86::CMOV_FR64:
7599 case X86::CMOV_V4F32:
7600 case X86::CMOV_V2F64:
7601 case X86::CMOV_V2I64: {
7602 // To "insert" a SELECT_CC instruction, we actually have to insert the
7603 // diamond control-flow pattern. The incoming instruction knows the
7604 // destination vreg to set, the condition code register to branch on, the
7605 // true/false values to select between, and a branch opcode to use.
7606 const BasicBlock *LLVM_BB = BB->getBasicBlock();
7607 MachineFunction::iterator It = BB;
7608 ++It;
7613 // cmpTY ccX, r1, r2
7615 // fallthrough --> copy0MBB
7616 MachineBasicBlock *thisMBB = BB;
7617 MachineFunction *F = BB->getParent();
7618 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7619 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
7620 unsigned Opc =
7621 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
7622 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
7623 F->insert(It, copy0MBB);
7624 F->insert(It, sinkMBB);
7625 // Update machine-CFG edges by transferring all successors of the current
7626 // block to the new block which will contain the Phi node for the select.
7627 sinkMBB->transferSuccessors(BB);
7629 // Add the true and fallthrough blocks as its successors.
7630 BB->addSuccessor(copy0MBB);
7631 BB->addSuccessor(sinkMBB);
7633 // copy0MBB:
7634 // %FalseValue = ...
7635 // # fallthrough to sinkMBB
7636 BB = copy0MBB;
7638 // Update machine-CFG edges
7639 BB->addSuccessor(sinkMBB);
7641 // sinkMBB:
7642 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7643 // ...
7644 BB = sinkMBB;
7645 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
7646 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7647 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7649 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
7650 return BB;
7651 }
7653 case X86::FP32_TO_INT16_IN_MEM:
7654 case X86::FP32_TO_INT32_IN_MEM:
7655 case X86::FP32_TO_INT64_IN_MEM:
7656 case X86::FP64_TO_INT16_IN_MEM:
7657 case X86::FP64_TO_INT32_IN_MEM:
7658 case X86::FP64_TO_INT64_IN_MEM:
7659 case X86::FP80_TO_INT16_IN_MEM:
7660 case X86::FP80_TO_INT32_IN_MEM:
7661 case X86::FP80_TO_INT64_IN_MEM: {
7662 // Change the floating point control register to use "round towards zero"
7663 // mode when truncating to an integer value.
7664 MachineFunction *F = BB->getParent();
7665 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
7666 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
7668 // Load the old value of the high byte of the control word...
7669 unsigned OldCW =
7670 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
7671 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
7672 CWFrameIdx);
7674 // Set the high part to be round to zero...
7675 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
7676 .addImm(0xC7F);
7678 // Reload the modified control word now...
7679 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7681 // Restore the memory image of control word to original value
7682 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
7683 .addReg(OldCW);
7685 // Get the X86 opcode to use.
7686 unsigned Opc;
7687 switch (MI->getOpcode()) {
7688 default: llvm_unreachable("illegal opcode!");
7689 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
7690 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
7691 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
7692 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
7693 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
7694 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
7695 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
7696 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
7697 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
7698 }
7700 X86AddressMode AM;
7701 MachineOperand &Op = MI->getOperand(0);
7702 if (Op.isReg()) {
7703 AM.BaseType = X86AddressMode::RegBase;
7704 AM.Base.Reg = Op.getReg();
7705 } else {
7706 AM.BaseType = X86AddressMode::FrameIndexBase;
7707 AM.Base.FrameIndex = Op.getIndex();
7708 }
7709 Op = MI->getOperand(1);
7711 AM.Scale = Op.getImm();
7712 Op = MI->getOperand(2);
7714 AM.IndexReg = Op.getImm();
7715 Op = MI->getOperand(3);
7716 if (Op.isGlobal()) {
7717 AM.GV = Op.getGlobal();
7718 } else {
7719 AM.Disp = Op.getImm();
7720 }
7721 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
7722 .addReg(MI->getOperand(X86AddrNumOperands).getReg());
7724 // Reload the original control word now.
7725 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7727 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
7728 return BB;
7729 }
7730 case X86::ATOMAND32:
7731 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7732 X86::AND32ri, X86::MOV32rm,
7733 X86::LCMPXCHG32, X86::MOV32rr,
7734 X86::NOT32r, X86::EAX,
7735 X86::GR32RegisterClass);
7736 case X86::ATOMOR32:
7737 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
7738 X86::OR32ri, X86::MOV32rm,
7739 X86::LCMPXCHG32, X86::MOV32rr,
7740 X86::NOT32r, X86::EAX,
7741 X86::GR32RegisterClass);
7742 case X86::ATOMXOR32:
7743 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
7744 X86::XOR32ri, X86::MOV32rm,
7745 X86::LCMPXCHG32, X86::MOV32rr,
7746 X86::NOT32r, X86::EAX,
7747 X86::GR32RegisterClass);
7748 case X86::ATOMNAND32:
7749 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7750 X86::AND32ri, X86::MOV32rm,
7751 X86::LCMPXCHG32, X86::MOV32rr,
7752 X86::NOT32r, X86::EAX,
7753 X86::GR32RegisterClass, true);
7754 case X86::ATOMMIN32:
7755 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
7756 case X86::ATOMMAX32:
7757 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
7758 case X86::ATOMUMIN32:
7759 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
7760 case X86::ATOMUMAX32:
7761 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
7763 case X86::ATOMAND16:
7764 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7765 X86::AND16ri, X86::MOV16rm,
7766 X86::LCMPXCHG16, X86::MOV16rr,
7767 X86::NOT16r, X86::AX,
7768 X86::GR16RegisterClass);
7769 case X86::ATOMOR16:
7770 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
7771 X86::OR16ri, X86::MOV16rm,
7772 X86::LCMPXCHG16, X86::MOV16rr,
7773 X86::NOT16r, X86::AX,
7774 X86::GR16RegisterClass);
7775 case X86::ATOMXOR16:
7776 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
7777 X86::XOR16ri, X86::MOV16rm,
7778 X86::LCMPXCHG16, X86::MOV16rr,
7779 X86::NOT16r, X86::AX,
7780 X86::GR16RegisterClass);
7781 case X86::ATOMNAND16:
7782 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7783 X86::AND16ri, X86::MOV16rm,
7784 X86::LCMPXCHG16, X86::MOV16rr,
7785 X86::NOT16r, X86::AX,
7786 X86::GR16RegisterClass, true);
7787 case X86::ATOMMIN16:
7788 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
7789 case X86::ATOMMAX16:
7790 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
7791 case X86::ATOMUMIN16:
7792 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
7793 case X86::ATOMUMAX16:
7794 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
7796 case X86::ATOMAND8:
7797 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7798 X86::AND8ri, X86::MOV8rm,
7799 X86::LCMPXCHG8, X86::MOV8rr,
7800 X86::NOT8r, X86::AL,
7801 X86::GR8RegisterClass);
7802 case X86::ATOMOR8:
7803 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
7804 X86::OR8ri, X86::MOV8rm,
7805 X86::LCMPXCHG8, X86::MOV8rr,
7806 X86::NOT8r, X86::AL,
7807 X86::GR8RegisterClass);
7808 case X86::ATOMXOR8:
7809 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
7810 X86::XOR8ri, X86::MOV8rm,
7811 X86::LCMPXCHG8, X86::MOV8rr,
7812 X86::NOT8r, X86::AL,
7813 X86::GR8RegisterClass);
7814 case X86::ATOMNAND8:
7815 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7816 X86::AND8ri, X86::MOV8rm,
7817 X86::LCMPXCHG8, X86::MOV8rr,
7818 X86::NOT8r, X86::AL,
7819 X86::GR8RegisterClass, true);
7820 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
7821 // This group is for 64-bit host.
7822 case X86::ATOMAND64:
7823 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7824 X86::AND64ri32, X86::MOV64rm,
7825 X86::LCMPXCHG64, X86::MOV64rr,
7826 X86::NOT64r, X86::RAX,
7827 X86::GR64RegisterClass);
7828 case X86::ATOMOR64:
7829 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
7830 X86::OR64ri32, X86::MOV64rm,
7831 X86::LCMPXCHG64, X86::MOV64rr,
7832 X86::NOT64r, X86::RAX,
7833 X86::GR64RegisterClass);
7834 case X86::ATOMXOR64:
7835 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
7836 X86::XOR64ri32, X86::MOV64rm,
7837 X86::LCMPXCHG64, X86::MOV64rr,
7838 X86::NOT64r, X86::RAX,
7839 X86::GR64RegisterClass);
7840 case X86::ATOMNAND64:
7841 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7842 X86::AND64ri32, X86::MOV64rm,
7843 X86::LCMPXCHG64, X86::MOV64rr,
7844 X86::NOT64r, X86::RAX,
7845 X86::GR64RegisterClass, true);
7846 case X86::ATOMMIN64:
7847 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
7848 case X86::ATOMMAX64:
7849 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
7850 case X86::ATOMUMIN64:
7851 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
7852 case X86::ATOMUMAX64:
7853 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
7855 // This group does 64-bit operations on a 32-bit host.
7856 case X86::ATOMAND6432:
7857 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7858 X86::AND32rr, X86::AND32rr,
7859 X86::AND32ri, X86::AND32ri,
7861 case X86::ATOMOR6432:
7862 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7863 X86::OR32rr, X86::OR32rr,
7864 X86::OR32ri, X86::OR32ri,
7866 case X86::ATOMXOR6432:
7867 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7868 X86::XOR32rr, X86::XOR32rr,
7869 X86::XOR32ri, X86::XOR32ri,
7871 case X86::ATOMNAND6432:
7872 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7873 X86::AND32rr, X86::AND32rr,
7874 X86::AND32ri, X86::AND32ri,
7875 true);
7876 case X86::ATOMADD6432:
7877 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7878 X86::ADD32rr, X86::ADC32rr,
7879 X86::ADD32ri, X86::ADC32ri,
7881 case X86::ATOMSUB6432:
7882 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7883 X86::SUB32rr, X86::SBB32rr,
7884 X86::SUB32ri, X86::SBB32ri,
7886 case X86::ATOMSWAP6432:
7887 return EmitAtomicBit6432WithCustomInserter(MI, BB,
7888 X86::MOV32rr, X86::MOV32rr,
7889 X86::MOV32ri, X86::MOV32ri,
7890 false);
7891 }
7892 }
7894 //===----------------------------------------------------------------------===//
7895 // X86 Optimization Hooks
7896 //===----------------------------------------------------------------------===//
7898 void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
7899 const APInt &Mask,
7900 APInt &KnownZero,
7901 APInt &KnownOne,
7902 const SelectionDAG &DAG,
7903 unsigned Depth) const {
7904 unsigned Opc = Op.getOpcode();
7905 assert((Opc >= ISD::BUILTIN_OP_END ||
7906 Opc == ISD::INTRINSIC_WO_CHAIN ||
7907 Opc == ISD::INTRINSIC_W_CHAIN ||
7908 Opc == ISD::INTRINSIC_VOID) &&
7909 "Should use MaskedValueIsZero if you don't know whether Op"
7910 " is a target node!");
7912 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
7913 switch (Opc) {
7914 default: break;
7915 case X86ISD::ADD:
7916 case X86ISD::SUB:
7917 case X86ISD::SMUL:
7918 case X86ISD::UMUL:
7919 case X86ISD::INC:
7920 case X86ISD::DEC:
7921 // These nodes' second result is a boolean.
7922 if (Op.getResNo() == 0)
7923 break;
7924 // Fallthrough
7925 case X86ISD::SETCC:
7926 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
7927 Mask.getBitWidth() - 1);
7928 break;
7929 }
7930 }
7932 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
7933 /// node is a GlobalAddress + offset.
7934 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
7935 GlobalValue* &GA, int64_t &Offset) const{
7936 if (N->getOpcode() == X86ISD::Wrapper) {
7937 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
7938 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
7939 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
7940 return true;
7941 }
7942 }
7943 return TargetLowering::isGAPlusOffset(N, GA, Offset);
7944 }
7946 static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
7947 const TargetLowering &TLI) {
7948 GlobalValue *GV;
7949 int64_t Offset = 0;
7950 if (TLI.isGAPlusOffset(Base, GV, Offset))
7951 return (GV->getAlignment() >= N && (Offset % N) == 0);
7952 // DAG combine handles the stack object case.
7953 return false;
7954 }
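7955 /// EltsFromConsecutiveLoads - Returns true if the shuffle's non-undef elements are loads from addresses consecutive with a single base load, returning that base in LDBase and the index of the last loaded element in LastLoadedElt.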
7956 static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
7957 EVT EVT, LoadSDNode *&LDBase,
7958 unsigned &LastLoadedElt,
7959 SelectionDAG &DAG, MachineFrameInfo *MFI,
7960 const TargetLowering &TLI) {
7961 LDBase = NULL;
7962 LastLoadedElt = -1U;
7963 for (unsigned i = 0; i < NumElems; ++i) {
7964 if (N->getMaskElt(i) < 0) {
7965 if (!LDBase)
7966 return false;
7967 continue;
7968 }
7970 SDValue Elt = DAG.getShuffleScalarElt(N, i);
7971 if (!Elt.getNode() ||
7972 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
7973 return false;
7974 if (!LDBase) {
7975 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
7976 return false;
7977 LDBase = cast<LoadSDNode>(Elt.getNode());
7978 LastLoadedElt = i;
7979 continue;
7980 }
7981 if (Elt.getOpcode() == ISD::UNDEF)
7982 continue;
7984 LoadSDNode *LD = cast<LoadSDNode>(Elt);
7985 if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
7986 return false;
7987 LastLoadedElt = i;
7988 }
7989 return true;
7990 }
7992 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
7993 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
7994 /// if the load addresses are consecutive, non-overlapping, and in the right
7995 /// order. In the case of v2i64, it will see if it can rewrite the
7996 /// shuffle to be an appropriate build vector so it can take advantage of
7997 // performBuildVectorCombine.
7998 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
7999 const TargetLowering &TLI) {
8000 DebugLoc dl = N->getDebugLoc();
8001 EVT VT = N->getValueType(0);
8002 EVT EVT = VT.getVectorElementType();
8003 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8004 unsigned NumElems = VT.getVectorNumElements();
8006 if (VT.getSizeInBits() != 128)
8009 // Try to combine a vector_shuffle into a 128-bit load.
8010 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8011 LoadSDNode *LD = NULL;
8012 unsigned LastLoadedElt;
8013 if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
8014 MFI, TLI))
8015 return SDValue();
8017 if (LastLoadedElt == NumElems - 1) {
8018 if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
8019 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8020 LD->getSrcValue(), LD->getSrcValueOffset(),
8021 LD->isVolatile());
8022 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8023 LD->getSrcValue(), LD->getSrcValueOffset(),
8024 LD->isVolatile(), LD->getAlignment());
8025 } else if (NumElems == 4 && LastLoadedElt == 1) {
8026 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
8027 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8028 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8029 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
8030 }
8032 return SDValue();
8033 }
8034 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8035 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8036 const X86Subtarget *Subtarget) {
8037 DebugLoc DL = N->getDebugLoc();
8038 SDValue Cond = N->getOperand(0);
8039 // Get the LHS/RHS of the select.
8040 SDValue LHS = N->getOperand(1);
8041 SDValue RHS = N->getOperand(2);
8043 // If we have SSE[12] support, try to form min/max nodes.
8044 if (Subtarget->hasSSE2() &&
8045 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8046 Cond.getOpcode() == ISD::SETCC) {
8047 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8049 unsigned Opcode = 0;
8050 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8051 switch (CC) {
8052 default: break;
8053 case ISD::SETOLE: // (X <= Y) ? X : Y -> min
8056 if (!UnsafeFPMath) break;
8058 case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
8060 Opcode = X86ISD::FMIN;
8061 break;
8063 case ISD::SETOGT: // (X > Y) ? X : Y -> max
8066 if (!UnsafeFPMath) break;
8068 case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
8070 Opcode = X86ISD::FMAX;
8071 break;
8072 }
8073 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
8074 switch (CC) {
8075 default: break;
8076 case ISD::SETOGT: // (X > Y) ? Y : X -> min
8079 if (!UnsafeFPMath) break;
8081 case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
8083 Opcode = X86ISD::FMIN;
8084 break;
8086 case ISD::SETOLE: // (X <= Y) ? Y : X -> max
8089 if (!UnsafeFPMath) break;
8091 case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
8093 Opcode = X86ISD::FMAX;
8094 break;
8095 }
8096 }
8098 if (Opcode)
8099 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
8100 }
8102 // If this is a select between two integer constants, try to do some
8104 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
8105 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
8106 // Don't do this for crazy integer types.
8107 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8108 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
8109 // so that TrueC (the true value) is larger than FalseC.
8110 bool NeedsCondInvert = false;
8112 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8113 // Efficiently invertible.
8114 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
8115 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
8116 isa<ConstantSDNode>(Cond.getOperand(1))))) {
8117 NeedsCondInvert = true;
8118 std::swap(TrueC, FalseC);
8121 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
8122 if (FalseC->getAPIntValue() == 0 &&
8123 TrueC->getAPIntValue().isPowerOf2()) {
8124 if (NeedsCondInvert) // Invert the condition if needed.
8125 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8126 DAG.getConstant(1, Cond.getValueType()));
8128 // Zero extend the condition if needed.
8129 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8131 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8132 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8133 DAG.getConstant(ShAmt, MVT::i8));
8136 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
8137 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8138 if (NeedsCondInvert) // Invert the condition if needed.
8139 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8140 DAG.getConstant(1, Cond.getValueType()));
8142 // Zero extend the condition if needed.
8143 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8144 FalseC->getValueType(0), Cond);
8145 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8146 SDValue(FalseC, 0));
8149 // Optimize cases that will turn into an LEA instruction. This requires
8150 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8151 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8152 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8153 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8155 bool isFastMultiplier = false;
8157 switch ((unsigned char)Diff) {
8159 case 1: // result = add base, cond
8160 case 2: // result = lea base( , cond*2)
8161 case 3: // result = lea base(cond, cond*2)
8162 case 4: // result = lea base( , cond*4)
8163 case 5: // result = lea base(cond, cond*4)
8164 case 8: // result = lea base( , cond*8)
8165 case 9: // result = lea base(cond, cond*8)
8166 isFastMultiplier = true;
8171 if (isFastMultiplier) {
8172 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8173 if (NeedsCondInvert) // Invert the condition if needed.
8174 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8175 DAG.getConstant(1, Cond.getValueType()));
8177 // Zero extend the condition if needed.
8178 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8179 Cond);
8180 // Scale the condition by the difference.
8182 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8183 DAG.getConstant(Diff, Cond.getValueType()));
8185 // Add the base if non-zero.
8186 if (FalseC->getAPIntValue() != 0)
8187 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8188 SDValue(FalseC, 0));
8189 return Cond;
8190 }
8191 }
8192 }
8193 }
8195 return SDValue();
8196 }
8198 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
8199 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
8200 TargetLowering::DAGCombinerInfo &DCI) {
8201 DebugLoc DL = N->getDebugLoc();
8203 // If the flag operand isn't dead, don't touch this CMOV.
8204 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
8205 return SDValue();
8207 // If this is a select between two integer constants, try to do some
8208 // optimizations. Note that the operands are ordered the opposite of SELECT
8209 // operands.
8210 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
8211 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
8212 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
8213 // larger than FalseC (the false value).
8214 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
8216 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
8217 CC = X86::GetOppositeBranchCondition(CC);
8218 std::swap(TrueC, FalseC);
8221 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
8222 // This is efficient for any integer data type (including i8/i16) and
8224 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
8225 SDValue Cond = N->getOperand(3);
8226 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8227 DAG.getConstant(CC, MVT::i8), Cond);
8229 // Zero extend the condition if needed.
8230 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
8232 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8233 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
8234 DAG.getConstant(ShAmt, MVT::i8));
8235 if (N->getNumValues() == 2) // Dead flag value?
8236 return DCI.CombineTo(N, Cond, SDValue());
8237 return Cond;
8238 }
8240 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
8241 // for any integer data type, including i8/i16.
8242 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8243 SDValue Cond = N->getOperand(3);
8244 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8245 DAG.getConstant(CC, MVT::i8), Cond);
8247 // Zero extend the condition if needed.
8248 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8249 FalseC->getValueType(0), Cond);
8250 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8251 SDValue(FalseC, 0));
8253 if (N->getNumValues() == 2) // Dead flag value?
8254 return DCI.CombineTo(N, Cond, SDValue());
8255 return Cond;
8256 }
8258 // Optimize cases that will turn into an LEA instruction. This requires
8259 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8260 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8261 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8262 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8264 bool isFastMultiplier = false;
8266 switch ((unsigned char)Diff) {
8268 case 1: // result = add base, cond
8269 case 2: // result = lea base( , cond*2)
8270 case 3: // result = lea base(cond, cond*2)
8271 case 4: // result = lea base( , cond*4)
8272 case 5: // result = lea base(cond, cond*4)
8273 case 8: // result = lea base( , cond*8)
8274 case 9: // result = lea base(cond, cond*8)
8275 isFastMultiplier = true;
8280 if (isFastMultiplier) {
8281 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8282 SDValue Cond = N->getOperand(3);
8283 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8284 DAG.getConstant(CC, MVT::i8), Cond);
8285 // Zero extend the condition if needed.
8286 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8287 Cond);
8288 // Scale the condition by the difference.
8290 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8291 DAG.getConstant(Diff, Cond.getValueType()));
8293 // Add the base if non-zero.
8294 if (FalseC->getAPIntValue() != 0)
8295 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8296 SDValue(FalseC, 0));
8297 if (N->getNumValues() == 2) // Dead flag value?
8298 return DCI.CombineTo(N, Cond, SDValue());
8308 /// PerformMulCombine - Optimize a single multiply by a constant into two
8309 /// multiplies in order to implement it with two cheaper instructions, e.g.
8310 /// LEA + SHL or LEA + LEA.
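/// Illustrative cases: x*45 can be rewritten as (x*9)*5 (two LEAs), and x*40
/// as (x*5)<<3 (LEA + SHL), avoiding a full multiply.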
8311 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
8312 TargetLowering::DAGCombinerInfo &DCI) {
8313 if (DAG.getMachineFunction().
8314 getFunction()->hasFnAttr(Attribute::OptimizeForSize))
8317 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8320 EVT VT = N->getValueType(0);
8324 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8327 uint64_t MulAmt = C->getZExtValue();
8328 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
8331 uint64_t MulAmt1 = 0;
8332 uint64_t MulAmt2 = 0;
8333 if ((MulAmt % 9) == 0) {
8335 MulAmt2 = MulAmt / 9;
8336 } else if ((MulAmt % 5) == 0) {
8338 MulAmt2 = MulAmt / 5;
8339 } else if ((MulAmt % 3) == 0) {
8341 MulAmt2 = MulAmt / 3;
8344 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
8345 DebugLoc DL = N->getDebugLoc();
8347 if (isPowerOf2_64(MulAmt2) &&
8348 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
8349 // If the second multiplier is pow2, issue it first. We want the multiply by
8350 // 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
8352 std::swap(MulAmt1, MulAmt2);
8355 if (isPowerOf2_64(MulAmt1))
8356 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
8357 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
8359 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
8360 DAG.getConstant(MulAmt1, VT));
8362 if (isPowerOf2_64(MulAmt2))
8363 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
8364 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
8366 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
8367 DAG.getConstant(MulAmt2, VT));
8369 // Do not add new nodes to DAG combiner worklist.
8370 DCI.CombineTo(N, NewMul, false);
8376 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts when possible.
8378 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
8379 const X86Subtarget *Subtarget) {
8380 // On X86 with SSE2 support, we can transform this to a vector shift if
8381 // all elements are shifted by the same amount. We can't do this in legalize
8382 // because a constant vector is typically transformed to a constant pool
8383 // so we have no knowledge of the shift amount.
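// Illustrative case: a v4i32 shl whose shift-amount operand is the splat
// <5,5,5,5> is turned into the x86_sse2_pslli_d intrinsic with a scalar
// amount of 5, i.e. a single pslld, instead of four scalar shifts.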
8384 if (!Subtarget->hasSSE2())
8387 EVT VT = N->getValueType(0);
8388 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
8391 SDValue ShAmtOp = N->getOperand(1);
8392 EVT EltVT = VT.getVectorElementType();
8393 DebugLoc DL = N->getDebugLoc();
8395 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
8396 unsigned NumElts = VT.getVectorNumElements();
8398 for (; i != NumElts; ++i) {
8399 SDValue Arg = ShAmtOp.getOperand(i);
8400 if (Arg.getOpcode() == ISD::UNDEF) continue;
8404 for (; i != NumElts; ++i) {
8405 SDValue Arg = ShAmtOp.getOperand(i);
8406 if (Arg.getOpcode() == ISD::UNDEF) continue;
8407 if (Arg != BaseShAmt) {
8411 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
8412 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
8413 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
8414 DAG.getIntPtrConstant(0));
8418 if (EltVT.bitsGT(MVT::i32))
8419 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
8420 else if (EltVT.bitsLT(MVT::i32))
8421 BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
8423 // The shift amounts are identical, so we can do a vector shift.
8424 SDValue ValOp = N->getOperand(0);
8425 switch (N->getOpcode()) {
8427 llvm_unreachable("Unknown shift opcode!");
8430 if (VT == MVT::v2i64)
8431 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8432 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), ValOp, BaseShAmt);
8434 if (VT == MVT::v4i32)
8435 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8436 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), ValOp, BaseShAmt);
8438 if (VT == MVT::v8i16)
8439 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8440 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), ValOp, BaseShAmt);
8444 if (VT == MVT::v4i32)
8445 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8446 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), ValOp, BaseShAmt);
8448 if (VT == MVT::v8i16)
8449 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8450 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), ValOp, BaseShAmt);
8454 if (VT == MVT::v2i64)
8455 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8456 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), ValOp, BaseShAmt);
8458 if (VT == MVT::v4i32)
8459 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8460 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), ValOp, BaseShAmt);
8462 if (VT == MVT::v8i16)
8463 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8464 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), ValOp, BaseShAmt);
8471 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
8472 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
8473 const X86Subtarget *Subtarget) {
8474 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
8475 // the FP state in cases where an emms may be missing.
8476 // A preferable solution to the general problem is to figure out the right
8477 // places to insert EMMS. This qualifies as a quick hack.
8479 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
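// Illustrative cases: a v1i64 (MMX-typed) or i64 value that is loaded and
// immediately stored becomes a single movq copy on x86-64, or an f64 (movsd)
// copy on x86-32 when SSE2 is usable; otherwise it is split into two 32-bit
// load/store pairs below.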
8480 StoreSDNode *St = cast<StoreSDNode>(N);
8481 EVT VT = St->getValue().getValueType();
8482 if (VT.getSizeInBits() != 64)
8485 const Function *F = DAG.getMachineFunction().getFunction();
8486 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
8487 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
8488 && Subtarget->hasSSE2();
8489 if ((VT.isVector() ||
8490 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
8491 isa<LoadSDNode>(St->getValue()) &&
8492 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
8493 St->getChain().hasOneUse() && !St->isVolatile()) {
8494 SDNode* LdVal = St->getValue().getNode();
8496 int TokenFactorIndex = -1;
8497 SmallVector<SDValue, 8> Ops;
8498 SDNode* ChainVal = St->getChain().getNode();
8499 // Must be a store of a load. We currently handle two cases: the load
8500 // is a direct child, and it's under an intervening TokenFactor. It is
8501 // possible to dig deeper under nested TokenFactors.
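// Roughly, the two chain shapes accepted here are:
//   store(chain: Ld, value: Ld)
//   store(chain: TokenFactor(..., Ld, ...), value: Ld)
// and in the TokenFactor case the operand list is rebuilt around the new load.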
8502 if (ChainVal == LdVal)
8503 Ld = cast<LoadSDNode>(St->getChain());
8504 else if (St->getValue().hasOneUse() &&
8505 ChainVal->getOpcode() == ISD::TokenFactor) {
8506 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
8507 if (ChainVal->getOperand(i).getNode() == LdVal) {
8508 TokenFactorIndex = i;
8509 Ld = cast<LoadSDNode>(St->getValue());
8511 Ops.push_back(ChainVal->getOperand(i));
8515 if (!Ld || !ISD::isNormalLoad(Ld))
8518 // If this is not the MMX case, i.e. we are just turning i64 load/store
8519 // into f64 load/store, avoid the transformation if there are multiple
8520 // uses of the loaded value.
8521 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
8524 DebugLoc LdDL = Ld->getDebugLoc();
8525 DebugLoc StDL = N->getDebugLoc();
8526 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
8527 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store pair instead.
8529 if (Subtarget->is64Bit() || F64IsLegal) {
8530 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
8531 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
8532 Ld->getBasePtr(), Ld->getSrcValue(),
8533 Ld->getSrcValueOffset(), Ld->isVolatile(),
8534 Ld->getAlignment());
8535 SDValue NewChain = NewLd.getValue(1);
8536 if (TokenFactorIndex != -1) {
8537 Ops.push_back(NewChain);
8538 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], Ops.size());
8541 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
8542 St->getSrcValue(), St->getSrcValueOffset(),
8543 St->isVolatile(), St->getAlignment());
8546 // Otherwise, lower to two pairs of 32-bit loads / stores.
8547 SDValue LoAddr = Ld->getBasePtr();
8548 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
8549 DAG.getConstant(4, MVT::i32));
8551 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
8552 Ld->getSrcValue(), Ld->getSrcValueOffset(),
8553 Ld->isVolatile(), Ld->getAlignment());
8554 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
8555 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, Ld->isVolatile(),
8557 MinAlign(Ld->getAlignment(), 4));
8559 SDValue NewChain = LoLd.getValue(1);
8560 if (TokenFactorIndex != -1) {
8561 Ops.push_back(LoLd);
8562 Ops.push_back(HiLd);
8563 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], Ops.size());
8567 LoAddr = St->getBasePtr();
8568 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
8569 DAG.getConstant(4, MVT::i32));
8571 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
8572 St->getSrcValue(), St->getSrcValueOffset(),
8573 St->isVolatile(), St->getAlignment());
8574 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
8576 St->getSrcValue(), St->getSrcValueOffset() + 4,
8578 St->isVolatile(), MinAlign(St->getAlignment(), 4));
8579 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
8584 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
8585 /// X86ISD::FXOR nodes.
8586 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
8587 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
8588 // F[X]OR(0.0, x) -> x
8589 // F[X]OR(x, 0.0) -> x
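// This is sound because +0.0 is the all-zero bit pattern, the identity for a
// bitwise OR/XOR; -0.0 has the sign bit set and is deliberately not matched.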
8590 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8591 if (C->getValueAPF().isPosZero())
8592 return N->getOperand(1);
8593 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8594 if (C->getValueAPF().isPosZero())
8595 return N->getOperand(0);
8599 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
8600 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
8601 // FAND(0.0, x) -> 0.0
8602 // FAND(x, 0.0) -> 0.0
8603 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8604 if (C->getValueAPF().isPosZero())
8605 return N->getOperand(0);
8606 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8607 if (C->getValueAPF().isPosZero())
8608 return N->getOperand(1);
8612 static SDValue PerformBTCombine(SDNode *N,
8614 TargetLowering::DAGCombinerInfo &DCI) {
8615 // BT ignores high bits in the bit index operand.
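// Illustrative case: for a 32-bit BT only the low 5 bits of the index are
// demanded, so an explicit (idx & 31) feeding the index can be stripped by the
// demanded-bits simplification below.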
8616 SDValue Op1 = N->getOperand(1);
8617 if (Op1.hasOneUse()) {
8618 unsigned BitWidth = Op1.getValueSizeInBits();
8619 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
8620 APInt KnownZero, KnownOne;
8621 TargetLowering::TargetLoweringOpt TLO(DAG);
8622 TargetLowering &TLI = DAG.getTargetLoweringInfo();
8623 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
8624 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
8625 DCI.CommitTargetLoweringOpt(TLO);
8630 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
8631 SDValue Op = N->getOperand(0);
8632 if (Op.getOpcode() == ISD::BIT_CONVERT)
8633 Op = Op.getOperand(0);
8634 EVT VT = N->getValueType(0), OpVT = Op.getValueType();
8635 if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
8636 VT.getVectorElementType().getSizeInBits() ==
8637 OpVT.getVectorElementType().getSizeInBits()) {
8638 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
8643 // On X86 and X86-64, atomic operations are lowered to locked instructions.
8644 // Locked instructions, in turn, have implicit fence semantics (all memory
8645 // operations are flushed before issuing the locked instruction, and they
8646 // are not buffered), so we can fold away the common pattern of
8647 // fence-atomic-fence.
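// Illustrative pattern: membarrier -> locked atomic op -> membarrier. The
// atomic is re-chained past the leading fence and returned in place of the
// trailing fence, leaving just the locked instruction.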
8648 static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
8649 SDValue atomic = N->getOperand(0);
8650 switch (atomic.getOpcode()) {
8651 case ISD::ATOMIC_CMP_SWAP:
8652 case ISD::ATOMIC_SWAP:
8653 case ISD::ATOMIC_LOAD_ADD:
8654 case ISD::ATOMIC_LOAD_SUB:
8655 case ISD::ATOMIC_LOAD_AND:
8656 case ISD::ATOMIC_LOAD_OR:
8657 case ISD::ATOMIC_LOAD_XOR:
8658 case ISD::ATOMIC_LOAD_NAND:
8659 case ISD::ATOMIC_LOAD_MIN:
8660 case ISD::ATOMIC_LOAD_MAX:
8661 case ISD::ATOMIC_LOAD_UMIN:
8662 case ISD::ATOMIC_LOAD_UMAX:
8668 SDValue fence = atomic.getOperand(0);
8669 if (fence.getOpcode() != ISD::MEMBARRIER)
8672 switch (atomic.getOpcode()) {
8673 case ISD::ATOMIC_CMP_SWAP:
8674 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
8675 atomic.getOperand(1), atomic.getOperand(2),
8676 atomic.getOperand(3));
8677 case ISD::ATOMIC_SWAP:
8678 case ISD::ATOMIC_LOAD_ADD:
8679 case ISD::ATOMIC_LOAD_SUB:
8680 case ISD::ATOMIC_LOAD_AND:
8681 case ISD::ATOMIC_LOAD_OR:
8682 case ISD::ATOMIC_LOAD_XOR:
8683 case ISD::ATOMIC_LOAD_NAND:
8684 case ISD::ATOMIC_LOAD_MIN:
8685 case ISD::ATOMIC_LOAD_MAX:
8686 case ISD::ATOMIC_LOAD_UMIN:
8687 case ISD::ATOMIC_LOAD_UMAX:
8688 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
8689 atomic.getOperand(1), atomic.getOperand(2));
8695 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
8696 DAGCombinerInfo &DCI) const {
8697 SelectionDAG &DAG = DCI.DAG;
8698 switch (N->getOpcode()) {
8700 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
8701 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
8702 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
8703 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
8706 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
8707 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
8709 case X86ISD::FOR: return PerformFORCombine(N, DAG);
8710 case X86ISD::FAND: return PerformFANDCombine(N, DAG);
8711 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
8712 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
8713 case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG);
8719 //===----------------------------------------------------------------------===//
8720 // X86 Inline Assembly Support
8721 //===----------------------------------------------------------------------===//
8723 static bool LowerToBSwap(CallInst *CI) {
8724 // FIXME: this should verify that we are targeting a 486 or better. If not,
8725 // we will turn this bswap into something that will be lowered to logical ops
8726 // instead of emitting the bswap asm. For now, we don't support 486 or lower,
8727 // so don't worry about this.
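// Illustrative input (assuming GCC-style inline asm): something like
//   asm("bswap $0" : "=r"(x) : "0"(x))
// gets rewritten into a call to the llvm.bswap intrinsic of the matching
// width, so the normal isel path emits a plain BSWAP.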
8729 // Verify this is a simple bswap.
8730 if (CI->getNumOperands() != 2 ||
8731 CI->getType() != CI->getOperand(1)->getType() ||
8732 !CI->getType()->isInteger())
8735 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
8736 if (!Ty || Ty->getBitWidth() % 16 != 0)
8739 // Okay, we can do this xform, do so now.
8740 const Type *Tys[] = { Ty };
8741 Module *M = CI->getParent()->getParent()->getParent();
8742 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
8744 Value *Op = CI->getOperand(1);
8745 Op = CallInst::Create(Int, Op, CI->getName(), CI);
8747 CI->replaceAllUsesWith(Op);
8748 CI->eraseFromParent();
8752 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
8753 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
8754 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
8756 std::string AsmStr = IA->getAsmString();
8758 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
8759 std::vector<std::string> AsmPieces;
8760 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
8762 switch (AsmPieces.size()) {
8763 default: return false;
8765 AsmStr = AsmPieces[0];
8767 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
8770 if (AsmPieces.size() == 2 &&
8771 (AsmPieces[0] == "bswap" ||
8772 AsmPieces[0] == "bswapq" ||
8773 AsmPieces[0] == "bswapl") &&
8774 (AsmPieces[1] == "$0" ||
8775 AsmPieces[1] == "${0:q}")) {
8776 // No need to check constraints, nothing other than the equivalent of
8777 // "=r,0" would be valid here.
8778 return LowerToBSwap(CI);
8780 // rorw $$8, ${0:w} --> llvm.bswap.i16
8781 if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
8782 AsmPieces.size() == 3 &&
8783 AsmPieces[0] == "rorw" &&
8784 AsmPieces[1] == "$$8," &&
8785 AsmPieces[2] == "${0:w}" &&
8786 IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
8787 return LowerToBSwap(CI);
8791 if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
8792 Constraints.size() >= 2 &&
8793 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
8794 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
8795 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
8796 std::vector<std::string> Words;
8797 SplitString(AsmPieces[0], Words, " \t");
8798 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
8800 SplitString(AsmPieces[1], Words, " \t");
8801 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
8803 SplitString(AsmPieces[2], Words, " \t,");
8804 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
8805 Words[2] == "%edx") {
8806 return LowerToBSwap(CI);
8818 /// getConstraintType - Given a constraint letter, return the type of
8819 /// constraint it is for this target.
8820 X86TargetLowering::ConstraintType
8821 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
8822 if (Constraint.size() == 1) {
8823 switch (Constraint[0]) {
8835 return C_RegisterClass;
8843 return TargetLowering::getConstraintType(Constraint);
8846 /// LowerXConstraint - try to replace an X constraint, which matches anything,
8847 /// with another that has more specific requirements based on the type of the
8848 /// corresponding operand.
8849 const char *X86TargetLowering::
8850 LowerXConstraint(EVT ConstraintVT) const {
8851 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
8852 // 'f' like normal targets.
8853 if (ConstraintVT.isFloatingPoint()) {
8854 if (Subtarget->hasSSE2())
8856 if (Subtarget->hasSSE1())
8860 return TargetLowering::LowerXConstraint(ConstraintVT);
8863 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8864 /// vector. If it is invalid, don't add anything to Ops.
8865 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
8868 std::vector<SDValue>&Ops,
8869 SelectionDAG &DAG) const {
8870 SDValue Result(0, 0);
8872 switch (Constraint) {
8875 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8876 if (C->getZExtValue() <= 31) {
8877 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8883 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8884 if (C->getZExtValue() <= 63) {
8885 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8891 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8892 if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
8893 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8899 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8900 if (C->getZExtValue() <= 255) {
8901 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8907 // 32-bit signed value
8908 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8909 const ConstantInt *CI = C->getConstantIntValue();
8910 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
8911 C->getSExtValue())) {
8912 // Widen to 64 bits here to get it sign extended.
8913 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
8916 // FIXME gcc accepts some relocatable values here too, but only in certain
8917 // memory models; it's complicated.
8922 // 32-bit unsigned value
8923 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8924 const ConstantInt *CI = C->getConstantIntValue();
8925 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
8926 C->getZExtValue())) {
8927 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8931 // FIXME gcc accepts some relocatable values here too, but only in certain
8932 // memory models; it's complicated.
8936 // Literal immediates are always ok.
8937 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
8938 // Widen to 64 bits here to get it sign extended.
8939 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
8943 // If we are in non-pic codegen mode, we allow the address of a global (with
8944 // an optional displacement) to be used with 'i'.
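// Illustrative operands accepted by the matcher below: a bare global (@g) or
// a global plus/minus constant offsets (@g + 8, @g + 12 - 4), provided the
// reference does not require a stub/GOT load.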
8945 GlobalAddressSDNode *GA = 0;
8948 // Match either (GA), (GA+C), (GA+C1+C2), etc.
8950 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
8951 Offset += GA->getOffset();
8953 } else if (Op.getOpcode() == ISD::ADD) {
8954 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
8955 Offset += C->getZExtValue();
8956 Op = Op.getOperand(0);
8959 } else if (Op.getOpcode() == ISD::SUB) {
8960 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
8961 Offset += -C->getZExtValue();
8962 Op = Op.getOperand(0);
8967 // Otherwise, this isn't something we can handle, reject it.
8971 GlobalValue *GV = GA->getGlobal();
8972 // If we require an extra load to get this address, as in PIC mode, we cannot accept it.
8974 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
8975 getTargetMachine())))
8979 Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
8981 Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
8987 if (Result.getNode()) {
8988 Ops.push_back(Result);
8991 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
8995 std::vector<unsigned> X86TargetLowering::
8996 getRegClassForInlineAsmConstraint(const std::string &Constraint,
8998 if (Constraint.size() == 1) {
8999 // FIXME: not handling fp-stack yet!
9000 switch (Constraint[0]) { // GCC X86 Constraint Letters
9001 default: break; // Unknown constraint letter
9002 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
9003 if (Subtarget->is64Bit()) {
9005 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
9006 X86::ESI, X86::EDI, X86::R8D, X86::R9D,
9007 X86::R10D,X86::R11D,X86::R12D,
9008 X86::R13D,X86::R14D,X86::R15D,
9009 X86::EBP, X86::ESP, 0);
9010 else if (VT == MVT::i16)
9011 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
9012 X86::SI, X86::DI, X86::R8W,X86::R9W,
9013 X86::R10W,X86::R11W,X86::R12W,
9014 X86::R13W,X86::R14W,X86::R15W,
9015 X86::BP, X86::SP, 0);
9016 else if (VT == MVT::i8)
9017 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
9018 X86::SIL, X86::DIL, X86::R8B,X86::R9B,
9019 X86::R10B,X86::R11B,X86::R12B,
9020 X86::R13B,X86::R14B,X86::R15B,
9021 X86::BPL, X86::SPL, 0);
9023 else if (VT == MVT::i64)
9024 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
9025 X86::RSI, X86::RDI, X86::R8, X86::R9,
9026 X86::R10, X86::R11, X86::R12,
9027 X86::R13, X86::R14, X86::R15,
9028 X86::RBP, X86::RSP, 0);
9032 // 32-bit fallthrough
9035 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
9036 else if (VT == MVT::i16)
9037 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
9038 else if (VT == MVT::i8)
9039 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
9040 else if (VT == MVT::i64)
9041 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
9046 return std::vector<unsigned>();
9049 std::pair<unsigned, const TargetRegisterClass*>
9050 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
9052 // First, see if this is a constraint that directly corresponds to an LLVM
9054 if (Constraint.size() == 1) {
9055 // GCC Constraint Letters
9056 switch (Constraint[0]) {
9058 case 'r': // GENERAL_REGS
9059 case 'R': // LEGACY_REGS
9060 case 'l': // INDEX_REGS
9062 return std::make_pair(0U, X86::GR8RegisterClass);
9064 return std::make_pair(0U, X86::GR16RegisterClass);
9065 if (VT == MVT::i32 || !Subtarget->is64Bit())
9066 return std::make_pair(0U, X86::GR32RegisterClass);
9067 return std::make_pair(0U, X86::GR64RegisterClass);
9068 case 'f': // FP Stack registers.
9069 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
9070 // value to the correct fpstack register class.
9071 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
9072 return std::make_pair(0U, X86::RFP32RegisterClass);
9073 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
9074 return std::make_pair(0U, X86::RFP64RegisterClass);
9075 return std::make_pair(0U, X86::RFP80RegisterClass);
9076 case 'y': // MMX_REGS if MMX allowed.
9077 if (!Subtarget->hasMMX()) break;
9078 return std::make_pair(0U, X86::VR64RegisterClass);
9079 case 'Y': // SSE_REGS if SSE2 allowed
9080 if (!Subtarget->hasSSE2()) break;
9082 case 'x': // SSE_REGS if SSE1 allowed
9083 if (!Subtarget->hasSSE1()) break;
9085 switch (VT.getSimpleVT().SimpleTy) {
9087 // Scalar SSE types.
9090 return std::make_pair(0U, X86::FR32RegisterClass);
9093 return std::make_pair(0U, X86::FR64RegisterClass);
9101 return std::make_pair(0U, X86::VR128RegisterClass);
9107 // Use the default implementation in TargetLowering to convert the register
9108 // constraint into a member of a register class.
9109 std::pair<unsigned, const TargetRegisterClass*> Res;
9110 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
9112 // Not found as a standard register?
9113 if (Res.second == 0) {
9114 // GCC calls "st(0)" just plain "st".
9115 if (StringsEqualNoCase("{st}", Constraint)) {
9116 Res.first = X86::ST0;
9117 Res.second = X86::RFP80RegisterClass;
9119 // 'A' means EAX + EDX.
9120 if (Constraint == "A") {
9121 Res.first = X86::EAX;
9122 Res.second = X86::GR32_ADRegisterClass;
9127 // Otherwise, check to see if this is a register class of the wrong value
9128 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
9129 // turn into {ax},{dx}.
9130 if (Res.second->hasType(VT))
9131 return Res; // Correct type already, nothing to do.
9133 // All of the single-register GCC register classes map their values onto
9134 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
9135 // really want an 8-bit or 32-bit register, map to the appropriate register
9136 // class and return the appropriate register.
9137 if (Res.second == X86::GR16RegisterClass) {
9138 if (VT == MVT::i8) {
9139 unsigned DestReg = 0;
9140 switch (Res.first) {
9142 case X86::AX: DestReg = X86::AL; break;
9143 case X86::DX: DestReg = X86::DL; break;
9144 case X86::CX: DestReg = X86::CL; break;
9145 case X86::BX: DestReg = X86::BL; break;
9148 Res.first = DestReg;
9149 Res.second = X86::GR8RegisterClass;
9151 } else if (VT == MVT::i32) {
9152 unsigned DestReg = 0;
9153 switch (Res.first) {
9155 case X86::AX: DestReg = X86::EAX; break;
9156 case X86::DX: DestReg = X86::EDX; break;
9157 case X86::CX: DestReg = X86::ECX; break;
9158 case X86::BX: DestReg = X86::EBX; break;
9159 case X86::SI: DestReg = X86::ESI; break;
9160 case X86::DI: DestReg = X86::EDI; break;
9161 case X86::BP: DestReg = X86::EBP; break;
9162 case X86::SP: DestReg = X86::ESP; break;
9165 Res.first = DestReg;
9166 Res.second = X86::GR32RegisterClass;
9168 } else if (VT == MVT::i64) {
9169 unsigned DestReg = 0;
9170 switch (Res.first) {
9172 case X86::AX: DestReg = X86::RAX; break;
9173 case X86::DX: DestReg = X86::RDX; break;
9174 case X86::CX: DestReg = X86::RCX; break;
9175 case X86::BX: DestReg = X86::RBX; break;
9176 case X86::SI: DestReg = X86::RSI; break;
9177 case X86::DI: DestReg = X86::RDI; break;
9178 case X86::BP: DestReg = X86::RBP; break;
9179 case X86::SP: DestReg = X86::RSP; break;
9182 Res.first = DestReg;
9183 Res.second = X86::GR64RegisterClass;
9186 } else if (Res.second == X86::FR32RegisterClass ||
9187 Res.second == X86::FR64RegisterClass ||
9188 Res.second == X86::VR128RegisterClass) {
9189 // Handle references to XMM physical registers that got mapped into the
9190 // wrong class. This can happen with constraints like {xmm0} where the
9191 // target independent register mapper will just pick the first match it can
9192 // find, ignoring the required type.
9194 Res.second = X86::FR32RegisterClass;
9195 else if (VT == MVT::f64)
9196 Res.second = X86::FR64RegisterClass;
9197 else if (X86::VR128RegisterClass->hasType(VT))
9198 Res.second = X86::VR128RegisterClass;
9204 //===----------------------------------------------------------------------===//
9205 // X86 Widen vector type
9206 //===----------------------------------------------------------------------===//
9208 /// getWidenVectorType: given a vector type, returns the type to widen
9209 /// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
9210 /// If there is no vector type that we want to widen to, returns MVT::Other.
9211 /// When and where to widen is target dependent based on the cost of
9212 /// scalarizing vs using the wider vector type.
9214 EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
9215 assert(VT.isVector());
9216 if (isTypeLegal(VT))
9219 // TODO: In computeRegisterProperty, we can compute the list of legal vector
9220 // type based on element type. This would speed up our search (though
9221 // it may not be worth it since the size of the list is relatively small).
9223 EVT EltVT = VT.getVectorElementType();
9224 unsigned NElts = VT.getVectorNumElements();
9226 // On X86, it makes sense to widen any vector wider than 1
9230 for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
9231 nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
9232 EVT SVT = (MVT::SimpleValueType)nVT;
9234 if (isTypeLegal(SVT) &&
9235 SVT.getVectorElementType() == EltVT &&
9236 SVT.getVectorNumElements() > NElts)